summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Whitwell <keith@tungstengraphics.com>2009-07-16 07:50:34 +0100
committerKeith Whitwell <keithw@vmware.com>2009-07-16 09:53:08 +0100
commitebc4a9bf2eff7d2c0d89785e865a1df23733e64b (patch)
treec56a007a7378cdf02f0c07c6d5c065f7bf469ad2
parent4e3002b50fcedf3a6db1ac7394077bc3337ccda1 (diff)
tgsi: reduce x86 reg usage in tgsi_sse generated programs
Pass the tgsi_exec_machine struct in directly and just hold a single pointer to this struct, rather than keeping one for each of its internal members.
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_sse.c24
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_sse2.c190
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_sse2.h28
-rw-r--r--src/gallium/drivers/softpipe/sp_fs_sse.c23
4 files changed, 115 insertions, 150 deletions
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index a4f72c40ef..fb58983e01 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -52,24 +52,12 @@
#define SSE_MAX_VERTICES 4
-typedef void (PIPE_CDECL *codegen_function) (
- const struct tgsi_exec_vector *input, /* 1 */
- struct tgsi_exec_vector *output, /* 2 */
- float (*constant)[4], /* 3 */
- struct tgsi_exec_vector *temporary, /* 4 */
- float (*immediates)[4], /* 5 */
- const float (*aos_input)[4], /* 6 */
- uint num_inputs, /* 7 */
- uint input_stride, /* 8 */
- float (*aos_output)[4], /* 9 */
- uint num_outputs, /* 10 */
- uint output_stride ); /* 11 */
struct draw_sse_vertex_shader {
struct draw_vertex_shader base;
struct x86_function sse2_program;
- codegen_function func;
+ tgsi_sse2_vs_func func;
struct tgsi_exec_machine *machine;
};
@@ -119,11 +107,9 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
/* run compiled shader
*/
- shader->func(machine->Inputs,
- machine->Outputs,
- (float (*)[4])constants,
- machine->Temps,
- (float (*)[4])shader->base.immediates,
+ shader->func(machine,
+ constants,
+ shader->base.immediates,
input,
base->info.num_inputs,
input_stride,
@@ -195,7 +181,7 @@ draw_create_vs_sse(struct draw_context *draw,
TRUE ))
goto fail;
- vs->func = (codegen_function) x86_get_func( &vs->sse2_program );
+ vs->func = (tgsi_sse2_vs_func) x86_get_func( &vs->sse2_program );
if (!vs->func) {
goto fail;
}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 5084befc4e..cfe8ef0ecf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -32,6 +32,7 @@
#include "util/u_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
+#include "util/u_memory.h"
#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
#endif
@@ -104,7 +105,7 @@ get_const_base( void )
}
static struct x86_reg
-get_input_base( void )
+get_machine_base( void )
{
return x86_make_reg(
file_REG32,
@@ -112,25 +113,35 @@ get_input_base( void )
}
static struct x86_reg
+get_input_base( void )
+{
+ return x86_make_disp(
+ get_machine_base(),
+ Offset(struct tgsi_exec_machine, Inputs) );
+}
+
+static struct x86_reg
get_output_base( void )
{
- return x86_make_reg(
- file_REG32,
- reg_DX );
+ return x86_make_disp(
+ get_machine_base(),
+ Offset(struct tgsi_exec_machine, Outputs) );
}
static struct x86_reg
get_temp_base( void )
{
- return x86_make_reg(
- file_REG32,
- reg_BX );
+ return x86_make_disp(
+ get_machine_base(),
+ Offset(struct tgsi_exec_machine, Temps) );
}
static struct x86_reg
get_coef_base( void )
{
- return get_output_base();
+ return x86_make_reg(
+ file_REG32,
+ reg_BX );
}
static struct x86_reg
@@ -138,7 +149,7 @@ get_immediate_base( void )
{
return x86_make_reg(
file_REG32,
- reg_DI );
+ reg_DX );
}
@@ -2551,7 +2562,7 @@ emit_declaration(
static void aos_to_soa( struct x86_function *func,
uint arg_aos,
- uint arg_soa,
+ uint arg_machine,
uint arg_num,
uint arg_stride )
{
@@ -2566,7 +2577,10 @@ static void aos_to_soa( struct x86_function *func,
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
- x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
+ x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
+ x86_lea( func, soa_input,
+ x86_make_disp( soa_input,
+ Offset(struct tgsi_exec_machine, Inputs) ) );
x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
@@ -2608,28 +2622,30 @@ static void aos_to_soa( struct x86_function *func,
x86_jcc( func, cc_NE, inner_loop );
/* Restore EBX */
- x86_pop( func, aos_input );
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
-static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+static void soa_to_aos( struct x86_function *func,
+ uint arg_aos,
+ uint arg_machine,
+ uint arg_num,
+ uint arg_stride )
{
- struct x86_reg soa_output;
- struct x86_reg aos_output;
- struct x86_reg num_outputs;
- struct x86_reg temp;
+ struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
+ struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
+ struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
+ struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
int inner_loop;
- soa_output = x86_make_reg( file_REG32, reg_AX );
- aos_output = x86_make_reg( file_REG32, reg_BX );
- num_outputs = x86_make_reg( file_REG32, reg_CX );
- temp = x86_make_reg( file_REG32, reg_DX );
-
/* Save EBX */
- x86_push( func, aos_output );
+ x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
- x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
- x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
- x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+ x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
+ x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
+ x86_lea( func, soa_output,
+ x86_make_disp( soa_output,
+ Offset(struct tgsi_exec_machine, Outputs) ) );
+ x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
/* do */
inner_loop = x86_get_label( func );
@@ -2646,7 +2662,7 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
- x86_mov( func, temp, x86_fn_arg( func, stride ) );
+ x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
x86_push( func, aos_output );
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
@@ -2670,20 +2686,13 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
x86_jcc( func, cc_NE, inner_loop );
/* Restore EBX */
- x86_pop( func, aos_output );
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
/**
* Translate a TGSI vertex/fragment shader to SSE2 code.
* Slightly different things are done for vertex vs. fragment shaders.
*
- * Note that fragment shaders are responsible for interpolating shader
- * inputs. Because on x86 we have only 4 GP registers, and here we
- * have 5 shader arguments (input, output, const, temp and coef), the
- * code is split into two phases -- DECLARATION and INSTRUCTION phase.
- * GP register holding the output argument is aliased with the coeff
- * argument, as outputs are not needed in the DECLARATION phase.
- *
* \param tokens the TGSI input shader
* \param func the output SSE code/function
* \param immediates buffer to place immediates, later passed to SSE func
@@ -2697,7 +2706,6 @@ tgsi_emit_sse2(
boolean do_swizzles )
{
struct tgsi_parse_context parse;
- boolean instruction_phase = FALSE;
unsigned ok = 1;
uint num_immediates = 0;
@@ -2709,74 +2717,42 @@ tgsi_emit_sse2(
/* Can't just use EDI, EBX without save/restoring them:
*/
- x86_push(
- func,
- get_immediate_base() );
-
- x86_push(
- func,
- get_temp_base() );
-
+ x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+ x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
/*
* Different function args for vertex/fragment shaders:
*/
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
- /* DECLARATION phase, do not load output argument. */
- x86_mov(
- func,
- get_input_base(),
- x86_fn_arg( func, 1 ) );
- /* skipping outputs argument here */
- x86_mov(
- func,
- get_const_base(),
- x86_fn_arg( func, 3 ) );
- x86_mov(
- func,
- get_temp_base(),
- x86_fn_arg( func, 4 ) );
- x86_mov(
- func,
- get_coef_base(),
- x86_fn_arg( func, 5 ) );
- x86_mov(
- func,
- get_immediate_base(),
- x86_fn_arg( func, 6 ) );
- }
- else {
- assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
-
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
if (do_swizzles)
aos_to_soa( func,
- 6, /* aos_input */
- 1, /* machine->input */
- 7, /* num_inputs */
- 8 ); /* input_stride */
+ 4, /* aos_input */
+ 1, /* machine */
+ 5, /* num_inputs */
+ 6 ); /* input_stride */
+ }
+ x86_mov(
+ func,
+ get_machine_base(),
+ x86_fn_arg( func, 1 ) );
+ x86_mov(
+ func,
+ get_const_base(),
+ x86_fn_arg( func, 2 ) );
+ x86_mov(
+ func,
+ get_immediate_base(),
+ x86_fn_arg( func, 3 ) );
+
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
x86_mov(
- func,
- get_input_base(),
- x86_fn_arg( func, 1 ) );
- x86_mov(
- func,
- get_output_base(),
- x86_fn_arg( func, 2 ) );
- x86_mov(
- func,
- get_const_base(),
- x86_fn_arg( func, 3 ) );
- x86_mov(
- func,
- get_temp_base(),
- x86_fn_arg( func, 4 ) );
- x86_mov(
- func,
- get_immediate_base(),
- x86_fn_arg( func, 5 ) );
+ func,
+ get_coef_base(),
+ x86_fn_arg( func, 4 ) );
}
+
while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
tgsi_parse_token( &parse );
@@ -2790,17 +2766,6 @@ tgsi_emit_sse2(
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
- if( !instruction_phase ) {
- /* INSTRUCTION phase, overwrite coeff with output. */
- instruction_phase = TRUE;
- x86_mov(
- func,
- get_output_base(),
- x86_fn_arg( func, 2 ) );
- }
- }
-
ok = emit_instruction(
func,
&parse.FullToken.FullInstruction );
@@ -2844,18 +2809,17 @@ tgsi_emit_sse2(
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
if (do_swizzles)
- soa_to_aos( func, 9, 2, 10, 11 );
+ soa_to_aos( func,
+ 7, /* aos_output */
+ 1, /* machine */
+ 8, /* num_outputs */
+ 9 ); /* output_stride */
}
/* Can't just use EBX, EDI without save/restoring them:
*/
- x86_pop(
- func,
- get_temp_base() );
-
- x86_pop(
- func,
- get_immediate_base() );
+ x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
emit_ret( func );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
index af838b2a25..d81ee3d00e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
@@ -34,6 +34,7 @@ extern "C" {
struct tgsi_token;
struct x86_function;
+struct tgsi_interp_coef;
unsigned
tgsi_emit_sse2(
@@ -42,6 +43,33 @@ tgsi_emit_sse2(
float (*immediates)[4],
boolean do_swizzles );
+
+/* This is the function prototype generated when do_swizzles is false
+ * -- effectively for fragment shaders.
+ */
+typedef void (PIPE_CDECL *tgsi_sse2_fs_function) (
+ struct tgsi_exec_machine *machine, /* 1 */
+ const float (*constant)[4], /* 2 */
+ const float (*immediate)[4], /* 3 */
+ const struct tgsi_interp_coef *coef /* 4 */
+ );
+
+
+/* This is the function prototype generated when do_swizzles is true
+ * -- effectively for vertex shaders.
+ */
+typedef void (PIPE_CDECL *tgsi_sse2_vs_func) (
+ struct tgsi_exec_machine *machine, /* 1 */
+ const float (*constant)[4], /* 2 */
+ const float (*immediate)[4], /* 3 */
+ const float (*aos_input)[4], /* 4 */
+ uint num_inputs, /* 5 */
+ uint input_stride, /* 6 */
+ float (*aos_output)[4], /* 7 */
+ uint num_outputs, /* 8 */
+ uint output_stride ); /* 9 */
+
+
#if defined __cplusplus
}
#endif
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 31c3ca21c5..f9362efcb7 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -45,17 +45,6 @@
#include "rtasm/rtasm_x86sse.h"
-/* Surely this should be defined somewhere in a tgsi header:
- */
-typedef void (PIPE_CDECL *codegen_function)(
- const struct tgsi_exec_vector *input,
- struct tgsi_exec_vector *output,
- const float (*constant)[4],
- struct tgsi_exec_vector *temporary,
- const struct tgsi_interp_coef *coef,
- float (*immediates)[4]
- //, const struct tgsi_exec_vector *quadPos
- );
/**
@@ -65,7 +54,7 @@ struct sp_sse_fragment_shader
{
struct sp_fragment_shader base;
struct x86_function sse2_program;
- codegen_function func;
+ tgsi_sse2_fs_function func;
float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];
};
@@ -107,12 +96,10 @@ fs_sse_run( const struct sp_fragment_shader *base,
tgsi_set_kill_mask(machine, 0x0);
tgsi_set_exec_mask(machine, 1, 1, 1, 1);
- shader->func( machine->Inputs,
- machine->Outputs,
+ shader->func( machine,
machine->Consts,
- machine->Temps,
- machine->InterpCoefs,
- shader->immediates
+ (const float (*)[4])shader->immediates,
+ machine->InterpCoefs
// , &machine->QuadPos
);
@@ -151,7 +138,7 @@ softpipe_create_fs_sse(struct softpipe_context *softpipe,
return NULL;
}
- shader->func = (codegen_function) x86_get_func( &shader->sse2_program );
+ shader->func = (tgsi_sse2_fs_function) x86_get_func( &shader->sse2_program );
if (!shader->func) {
x86_release_func( &shader->sse2_program );
FREE(shader);