diff options
| author | Keith Whitwell <keith@tungstengraphics.com> | 2009-07-16 07:50:34 +0100 | 
|---|---|---|
| committer | Keith Whitwell <keithw@vmware.com> | 2009-07-16 09:53:08 +0100 | 
| commit | ebc4a9bf2eff7d2c0d89785e865a1df23733e64b (patch) | |
| tree | c56a007a7378cdf02f0c07c6d5c065f7bf469ad2 /src | |
| parent | 4e3002b50fcedf3a6db1ac7394077bc3337ccda1 (diff) | |
tgsi: reduce x86 reg usage in tgsi_sse generated programs
Pass the tgsi_exec_machine struct to the generated code directly and
hold a single pointer to this struct in a register, rather than keeping
a separate pointer for each of its internal members (Inputs, Outputs, Temps).
Diffstat (limited to 'src')
| -rw-r--r-- | src/gallium/auxiliary/draw/draw_vs_sse.c | 24 | ||||
| -rw-r--r-- | src/gallium/auxiliary/tgsi/tgsi_sse2.c | 190 | ||||
| -rw-r--r-- | src/gallium/auxiliary/tgsi/tgsi_sse2.h | 28 | ||||
| -rw-r--r-- | src/gallium/drivers/softpipe/sp_fs_sse.c | 23 | 
4 files changed, 115 insertions, 150 deletions
| diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index a4f72c40ef..fb58983e01 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -52,24 +52,12 @@  #define SSE_MAX_VERTICES 4 -typedef void (PIPE_CDECL *codegen_function) ( -   const struct tgsi_exec_vector *input, /* 1 */ -   struct tgsi_exec_vector *output, /* 2 */ -   float (*constant)[4],        /* 3 */ -   struct tgsi_exec_vector *temporary, /* 4 */ -   float (*immediates)[4],      /* 5 */ -   const float (*aos_input)[4], /* 6 */ -   uint num_inputs,             /* 7 */ -   uint input_stride,           /* 8 */ -   float (*aos_output)[4],      /* 9 */ -   uint num_outputs,            /* 10 */ -   uint output_stride );        /* 11 */  struct draw_sse_vertex_shader {     struct draw_vertex_shader base;     struct x86_function sse2_program; -   codegen_function func; +   tgsi_sse2_vs_func func;     struct tgsi_exec_machine *machine;  }; @@ -119,11 +107,9 @@ vs_sse_run_linear( struct draw_vertex_shader *base,        /* run compiled shader         */ -      shader->func(machine->Inputs, -		   machine->Outputs, -		   (float (*)[4])constants, -		   machine->Temps, -		   (float (*)[4])shader->base.immediates, +      shader->func(machine, +		   constants, +		   shader->base.immediates,                     input,                     base->info.num_inputs,                     input_stride, @@ -195,7 +181,7 @@ draw_create_vs_sse(struct draw_context *draw,                          TRUE ))         goto fail; -   vs->func = (codegen_function) x86_get_func( &vs->sse2_program ); +   vs->func = (tgsi_sse2_vs_func) x86_get_func( &vs->sse2_program );     if (!vs->func) {        goto fail;     } diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 5084befc4e..cfe8ef0ecf 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -32,6 +32,7 @@  
#include "util/u_debug.h"  #include "pipe/p_shader_tokens.h"  #include "util/u_math.h" +#include "util/u_memory.h"  #if defined(PIPE_ARCH_SSE)  #include "util/u_sse.h"  #endif @@ -104,7 +105,7 @@ get_const_base( void )  }  static struct x86_reg -get_input_base( void ) +get_machine_base( void )  {     return x86_make_reg(        file_REG32, @@ -112,25 +113,35 @@ get_input_base( void )  }  static struct x86_reg +get_input_base( void ) +{ +   return x86_make_disp( +      get_machine_base(), +      Offset(struct tgsi_exec_machine, Inputs) ); +} + +static struct x86_reg  get_output_base( void )  { -   return x86_make_reg( -      file_REG32, -      reg_DX ); +   return x86_make_disp( +      get_machine_base(), +      Offset(struct tgsi_exec_machine, Outputs) );  }  static struct x86_reg  get_temp_base( void )  { -   return x86_make_reg( -      file_REG32, -      reg_BX ); +   return x86_make_disp( +      get_machine_base(), +      Offset(struct tgsi_exec_machine, Temps) );  }  static struct x86_reg  get_coef_base( void )  { -   return get_output_base(); +   return x86_make_reg( +      file_REG32, +      reg_BX );  }  static struct x86_reg @@ -138,7 +149,7 @@ get_immediate_base( void )  {     return x86_make_reg(        file_REG32, -      reg_DI ); +      reg_DX );  } @@ -2551,7 +2562,7 @@ emit_declaration(  static void aos_to_soa( struct x86_function *func,                           uint arg_aos, -                        uint arg_soa,  +                        uint arg_machine,                           uint arg_num,                           uint arg_stride )  { @@ -2566,7 +2577,10 @@ static void aos_to_soa( struct x86_function *func,     x86_push( func, x86_make_reg( file_REG32, reg_BX ) );     x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) ); -   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) ); +   x86_mov( func, soa_input,  x86_fn_arg( func, arg_machine ) ); +   x86_lea( func, soa_input,   +	    x86_make_disp( soa_input,  +			   Offset(struct 
tgsi_exec_machine, Inputs) ) );     x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );     x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) ); @@ -2608,28 +2622,30 @@ static void aos_to_soa( struct x86_function *func,     x86_jcc( func, cc_NE, inner_loop );     /* Restore EBX */ -   x86_pop( func, aos_input ); +   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );  } -static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride ) +static void soa_to_aos( struct x86_function *func,  +			uint arg_aos,  +			uint arg_machine,  +			uint arg_num,  +			uint arg_stride )  { -   struct x86_reg soa_output; -   struct x86_reg aos_output; -   struct x86_reg num_outputs; -   struct x86_reg temp; +   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX ); +   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX ); +   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX ); +   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );     int inner_loop; -   soa_output = x86_make_reg( file_REG32, reg_AX ); -   aos_output = x86_make_reg( file_REG32, reg_BX ); -   num_outputs = x86_make_reg( file_REG32, reg_CX ); -   temp = x86_make_reg( file_REG32, reg_DX ); -     /* Save EBX */ -   x86_push( func, aos_output ); +   x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); -   x86_mov( func, soa_output, x86_fn_arg( func, soa ) ); -   x86_mov( func, aos_output, x86_fn_arg( func, aos ) ); -   x86_mov( func, num_outputs, x86_fn_arg( func, num ) ); +   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) ); +   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) ); +   x86_lea( func, soa_output,  +	    x86_make_disp( soa_output,  +			   Offset(struct tgsi_exec_machine, Outputs) ) ); +   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );     /* do */     inner_loop = x86_get_label( func ); @@ -2646,7 +2662,7 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,        
sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );        sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) ); -      x86_mov( func, temp, x86_fn_arg( func, stride ) ); +      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );        x86_push( func, aos_output );        sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );        sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); @@ -2670,20 +2686,13 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,     x86_jcc( func, cc_NE, inner_loop );     /* Restore EBX */ -   x86_pop( func, aos_output ); +   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );  }  /**   * Translate a TGSI vertex/fragment shader to SSE2 code.   * Slightly different things are done for vertex vs. fragment shaders.   * - * Note that fragment shaders are responsible for interpolating shader - * inputs. Because on x86 we have only 4 GP registers, and here we - * have 5 shader arguments (input, output, const, temp and coef), the - * code is split into two phases -- DECLARATION and INSTRUCTION phase. - * GP register holding the output argument is aliased with the coeff - * argument, as outputs are not needed in the DECLARATION phase. 
- *   * \param tokens  the TGSI input shader   * \param func  the output SSE code/function   * \param immediates  buffer to place immediates, later passed to SSE func @@ -2697,7 +2706,6 @@ tgsi_emit_sse2(     boolean do_swizzles )  {     struct tgsi_parse_context parse; -   boolean instruction_phase = FALSE;     unsigned ok = 1;     uint num_immediates = 0; @@ -2709,74 +2717,42 @@ tgsi_emit_sse2(     /* Can't just use EDI, EBX without save/restoring them:      */ -   x86_push( -      func, -      get_immediate_base() ); - -   x86_push( -      func, -      get_temp_base() ); - +   x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); +   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );     /*      * Different function args for vertex/fragment shaders:      */ -   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { -      /* DECLARATION phase, do not load output argument. */ -      x86_mov( -         func, -         get_input_base(), -         x86_fn_arg( func, 1 ) ); -      /* skipping outputs argument here */ -      x86_mov( -         func, -         get_const_base(), -         x86_fn_arg( func, 3 ) ); -      x86_mov( -         func, -         get_temp_base(), -         x86_fn_arg( func, 4 ) ); -      x86_mov( -         func, -         get_coef_base(), -         x86_fn_arg( func, 5 ) ); -      x86_mov( -         func, -         get_immediate_base(), -         x86_fn_arg( func, 6 ) ); -   } -   else { -      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX); - +   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {        if (do_swizzles)           aos_to_soa( func,  -                     6,         /* aos_input */ -                     1,         /* machine->input */ -                     7,         /* num_inputs */ -                     8 );       /* input_stride */ +                     4,         /* aos_input */ +                     1,         /* machine */ +                     5,         /* 
num_inputs */ +                     6 );       /* input_stride */ +   } +   x86_mov( +      func, +      get_machine_base(), +      x86_fn_arg( func, 1 ) ); +   x86_mov( +      func, +      get_const_base(), +      x86_fn_arg( func, 2 ) ); +   x86_mov( +      func, +      get_immediate_base(), +      x86_fn_arg( func, 3 ) ); + +   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {        x86_mov( -         func, -         get_input_base(), -         x86_fn_arg( func, 1 ) ); -      x86_mov( -         func, -         get_output_base(), -         x86_fn_arg( func, 2 ) ); -      x86_mov( -         func, -         get_const_base(), -         x86_fn_arg( func, 3 ) ); -      x86_mov( -         func, -         get_temp_base(), -         x86_fn_arg( func, 4 ) ); -      x86_mov( -         func, -         get_immediate_base(), -         x86_fn_arg( func, 5 ) ); +	 func, +	 get_coef_base(), +	 x86_fn_arg( func, 4 ) );     } +     while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {        tgsi_parse_token( &parse ); @@ -2790,17 +2766,6 @@ tgsi_emit_sse2(           break;        case TGSI_TOKEN_TYPE_INSTRUCTION: -         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { -            if( !instruction_phase ) { -               /* INSTRUCTION phase, overwrite coeff with output. 
*/ -               instruction_phase = TRUE; -               x86_mov( -                  func, -                  get_output_base(), -                  x86_fn_arg( func, 2 ) ); -            } -         } -           ok = emit_instruction(              func,              &parse.FullToken.FullInstruction ); @@ -2844,18 +2809,17 @@ tgsi_emit_sse2(     if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {        if (do_swizzles) -         soa_to_aos( func, 9, 2, 10, 11 ); +         soa_to_aos( func,  +		     7, 	/* aos_output */ +		     1, 	/* machine */ +		     8, 	/* num_outputs */ +		     9 );	/* output_stride */     }     /* Can't just use EBX, EDI without save/restoring them:      */ -   x86_pop( -      func, -      get_temp_base() ); - -   x86_pop( -      func, -      get_immediate_base() ); +   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) ); +   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );     emit_ret( func ); diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h index af838b2a25..d81ee3d00e 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h @@ -34,6 +34,7 @@ extern "C" {  struct tgsi_token;  struct x86_function; +struct tgsi_interp_coef;  unsigned  tgsi_emit_sse2( @@ -42,6 +43,33 @@ tgsi_emit_sse2(     float (*immediates)[4],     boolean do_swizzles ); + +/* This is the function prototype generated when do_swizzles is false + * -- effectively for fragment shaders. + */ +typedef void (PIPE_CDECL *tgsi_sse2_fs_function) ( +   struct tgsi_exec_machine *machine, /* 1 */ +   const float (*constant)[4],		    /* 2 */ +   const float (*immediate)[4],		    /* 3 */ +   const struct tgsi_interp_coef *coef	    /* 4 */ +   ); + + +/* This is the function prototype generated when do_swizzles is true + * -- effectively for vertex shaders. 
+ */ +typedef void (PIPE_CDECL *tgsi_sse2_vs_func) ( +   struct tgsi_exec_machine *machine, /* 1 */ +   const float (*constant)[4],        /* 2 */ +   const float (*immediate)[4],       /* 3 */ +   const float (*aos_input)[4], /* 4 */ +   uint num_inputs,             /* 5 */ +   uint input_stride,           /* 6 */ +   float (*aos_output)[4],      /* 7 */ +   uint num_outputs,            /* 8 */ +   uint output_stride );        /* 9 */ + +  #if defined __cplusplus  }  #endif diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c index 31c3ca21c5..f9362efcb7 100644 --- a/src/gallium/drivers/softpipe/sp_fs_sse.c +++ b/src/gallium/drivers/softpipe/sp_fs_sse.c @@ -45,17 +45,6 @@  #include "rtasm/rtasm_x86sse.h" -/* Surely this should be defined somewhere in a tgsi header: - */ -typedef void (PIPE_CDECL *codegen_function)( -   const struct tgsi_exec_vector *input, -   struct tgsi_exec_vector *output, -   const float (*constant)[4], -   struct tgsi_exec_vector *temporary, -   const struct tgsi_interp_coef *coef, -   float (*immediates)[4] -   //, const struct tgsi_exec_vector *quadPos - );  /** @@ -65,7 +54,7 @@ struct sp_sse_fragment_shader  {     struct sp_fragment_shader base;     struct x86_function sse2_program; -   codegen_function func; +   tgsi_sse2_fs_function func;     float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];  }; @@ -107,12 +96,10 @@ fs_sse_run( const struct sp_fragment_shader *base,     tgsi_set_kill_mask(machine, 0x0);     tgsi_set_exec_mask(machine, 1, 1, 1, 1); -   shader->func( machine->Inputs, -		 machine->Outputs, +   shader->func( machine,  		 machine->Consts, -		 machine->Temps, -		 machine->InterpCoefs, -                 shader->immediates +                 (const float (*)[4])shader->immediates, +		 machine->InterpCoefs  		 //	 , &machine->QuadPos        ); @@ -151,7 +138,7 @@ softpipe_create_fs_sse(struct softpipe_context *softpipe,        return NULL;     } -   shader->func = (codegen_function) 
x86_get_func( &shader->sse2_program ); +   shader->func = (tgsi_sse2_fs_function) x86_get_func( &shader->sse2_program );     if (!shader->func) {        x86_release_func( &shader->sse2_program );        FREE(shader); | 
