summaryrefslogtreecommitdiff
path: root/src/gallium/auxiliary/tgsi/exec
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/auxiliary/tgsi/exec')
-rw-r--r--src/gallium/auxiliary/tgsi/exec/Makefile2
-rw-r--r--src/gallium/auxiliary/tgsi/exec/tgsi_exec.c171
-rw-r--r--src/gallium/auxiliary/tgsi/exec/tgsi_exec.h12
-rwxr-xr-xsrc/gallium/auxiliary/tgsi/exec/tgsi_sse2.c1090
-rwxr-xr-xsrc/gallium/auxiliary/tgsi/exec/tgsi_sse2.h11
5 files changed, 607 insertions, 679 deletions
diff --git a/src/gallium/auxiliary/tgsi/exec/Makefile b/src/gallium/auxiliary/tgsi/exec/Makefile
new file mode 100644
index 0000000000..451911a354
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/exec/Makefile
@@ -0,0 +1,2 @@
+default:
+ cd .. ; make
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
index 78e7dec569..826b432f09 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@ -88,6 +88,10 @@
#define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_3_I TGSI_EXEC_TEMP_THREE_I
+#define TEMP_3_C TGSI_EXEC_TEMP_THREE_C
+#define TEMP_HALF_I TGSI_EXEC_TEMP_HALF_I
+#define TEMP_HALF_C TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define FOR_EACH_CHANNEL(CHAN)\
@@ -262,6 +266,8 @@ tgsi_exec_machine_init(
mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
+ mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
+ mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
}
}
@@ -287,10 +293,10 @@ micro_abs(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) fabs( (double) src->f[0] );
- dst->f[1] = (float) fabs( (double) src->f[1] );
- dst->f[2] = (float) fabs( (double) src->f[2] );
- dst->f[3] = (float) fabs( (double) src->f[3] );
+ dst->f[0] = fabsf( src->f[0] );
+ dst->f[1] = fabsf( src->f[1] );
+ dst->f[2] = fabsf( src->f[2] );
+ dst->f[3] = fabsf( src->f[3] );
}
static void
@@ -334,10 +340,10 @@ micro_ceil(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) ceil( (double) src->f[0] );
- dst->f[1] = (float) ceil( (double) src->f[1] );
- dst->f[2] = (float) ceil( (double) src->f[2] );
- dst->f[3] = (float) ceil( (double) src->f[3] );
+ dst->f[0] = ceilf( src->f[0] );
+ dst->f[1] = ceilf( src->f[1] );
+ dst->f[2] = ceilf( src->f[2] );
+ dst->f[3] = ceilf( src->f[3] );
}
static void
@@ -345,10 +351,10 @@ micro_cos(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) cos( (double) src->f[0] );
- dst->f[1] = (float) cos( (double) src->f[1] );
- dst->f[2] = (float) cos( (double) src->f[2] );
- dst->f[3] = (float) cos( (double) src->f[3] );
+ dst->f[0] = cosf( src->f[0] );
+ dst->f[1] = cosf( src->f[1] );
+ dst->f[2] = cosf( src->f[2] );
+ dst->f[3] = cosf( src->f[3] );
}
static void
@@ -430,10 +436,10 @@ micro_exp2(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src)
{
- dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
- dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
- dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
- dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+ dst->f[0] = powf( 2.0f, src->f[0] );
+ dst->f[1] = powf( 2.0f, src->f[1] );
+ dst->f[2] = powf( 2.0f, src->f[2] );
+ dst->f[3] = powf( 2.0f, src->f[3] );
}
static void
@@ -463,10 +469,10 @@ micro_flr(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) floor( (double) src->f[0] );
- dst->f[1] = (float) floor( (double) src->f[1] );
- dst->f[2] = (float) floor( (double) src->f[2] );
- dst->f[3] = (float) floor( (double) src->f[3] );
+ dst->f[0] = floorf( src->f[0] );
+ dst->f[1] = floorf( src->f[1] );
+ dst->f[2] = floorf( src->f[2] );
+ dst->f[3] = floorf( src->f[3] );
}
static void
@@ -474,10 +480,10 @@ micro_frc(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
- dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
- dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
- dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+ dst->f[0] = src->f[0] - floorf( src->f[0] );
+ dst->f[1] = src->f[1] - floorf( src->f[1] );
+ dst->f[2] = src->f[2] - floorf( src->f[2] );
+ dst->f[3] = src->f[3] - floorf( src->f[3] );
}
static void
@@ -510,10 +516,24 @@ micro_lg2(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
- dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
- dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
- dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+ dst->f[0] = logf( src->f[0] ) * 1.442695f;
+ dst->f[1] = logf( src->f[1] ) * 1.442695f;
+ dst->f[2] = logf( src->f[2] ) * 1.442695f;
+ dst->f[3] = logf( src->f[3] ) * 1.442695f;
+}
+
+static void
+micro_le(
+ union tgsi_exec_channel *dst,
+ const union tgsi_exec_channel *src0,
+ const union tgsi_exec_channel *src1,
+ const union tgsi_exec_channel *src2,
+ const union tgsi_exec_channel *src3 )
+{
+ dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
+ dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
+ dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
+ dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
}
static void
@@ -764,10 +784,10 @@ micro_pow(
const union tgsi_exec_channel *src0,
const union tgsi_exec_channel *src1 )
{
- dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
- dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
- dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
- dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+ dst->f[0] = powf( src0->f[0], src1->f[0] );
+ dst->f[1] = powf( src0->f[1], src1->f[1] );
+ dst->f[2] = powf( src0->f[2], src1->f[2] );
+ dst->f[3] = powf( src0->f[3], src1->f[3] );
}
static void
@@ -775,10 +795,10 @@ micro_rnd(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
- dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
- dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
- dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+ dst->f[0] = floorf( src->f[0] + 0.5f );
+ dst->f[1] = floorf( src->f[1] + 0.5f );
+ dst->f[2] = floorf( src->f[2] + 0.5f );
+ dst->f[3] = floorf( src->f[3] + 0.5f );
}
static void
@@ -833,20 +853,20 @@ micro_sin(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) sin( (double) src->f[0] );
- dst->f[1] = (float) sin( (double) src->f[1] );
- dst->f[2] = (float) sin( (double) src->f[2] );
- dst->f[3] = (float) sin( (double) src->f[3] );
+ dst->f[0] = sinf( src->f[0] );
+ dst->f[1] = sinf( src->f[1] );
+ dst->f[2] = sinf( src->f[2] );
+ dst->f[3] = sinf( src->f[3] );
}
static void
micro_sqrt( union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) sqrt( (double) src->f[0] );
- dst->f[1] = (float) sqrt( (double) src->f[1] );
- dst->f[2] = (float) sqrt( (double) src->f[2] );
- dst->f[3] = (float) sqrt( (double) src->f[3] );
+ dst->f[0] = sqrtf( src->f[0] );
+ dst->f[1] = sqrtf( src->f[1] );
+ dst->f[2] = sqrtf( src->f[2] );
+ dst->f[3] = sqrtf( src->f[3] );
}
static void
@@ -1516,41 +1536,44 @@ exec_instruction(
break;
case TGSI_OPCODE_EXP:
- debug_printf("TGSI: EXP opcode not implemented\n");
- /* from ARB_v_p:
- tmp = ScalarLoad(op0);
- result.x = 2^floor(tmp);
- result.y = tmp - floor(tmp);
- result.z = RoughApprox2ToX(tmp);
- result.w = 1.0;
- */
-#if 0
- /* something like this: */
FETCH( &r[0], 0, CHAN_X );
- micro_exp2( &r[0], &r[0] );
- FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
- STORE( &r[0], 0, chan_index );
+ micro_flr( &r[1], &r[0] ); /* r1 = floor(r0) */
+ if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+ micro_exp2( &r[2], &r[1] ); /* r2 = 2 ^ r1 */
+ STORE( &r[2], 0, CHAN_X ); /* store r2 */
+ }
+ if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+ micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
+ STORE( &r[2], 0, CHAN_Y ); /* store r2 */
+ }
+ if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+ micro_exp2( &r[2], &r[0] ); /* r2 = 2 ^ r0 */
+ STORE( &r[2], 0, CHAN_Z ); /* store r2 */
+ }
+ if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+ STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
}
-#endif
break;
case TGSI_OPCODE_LOG:
- debug_printf("TGSI: LOG opcode not implemented\n");
- /* from ARB_v_p:
- tmp = fabs(ScalarLoad(op0));
- result.x = floor(log2(tmp));
- result.y = tmp / 2^(floor(log2(tmp)));
- result.z = RoughApproxLog2(tmp);
- result.w = 1.0;
- */
-#if 0
- /* something like this: */
FETCH( &r[0], 0, CHAN_X );
- micro_lg2( &r[0], &r[0] );
- FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
- STORE( &r[0], 0, chan_index );
+ micro_abs( &r[2], &r[0] ); /* r2 = abs(r0) */
+ micro_lg2( &r[1], &r[2] ); /* r1 = lg2(r2) */
+ micro_flr( &r[0], &r[1] ); /* r0 = floor(r1) */
+ if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+ STORE( &r[0], 0, CHAN_X );
+ }
+ if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+ micro_exp2( &r[0], &r[0] ); /* r0 = 2 ^ r0 */
+ micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
+ STORE( &r[0], 0, CHAN_Y );
+ }
+ if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+ STORE( &r[1], 0, CHAN_Z );
+ }
+ if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+ STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
}
-#endif
break;
case TGSI_OPCODE_MUL:
@@ -1975,7 +1998,7 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+ micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
STORE( &r[0], 0, chan_index );
}
break;
@@ -1992,7 +2015,7 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+ micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
STORE( &r[0], 0, chan_index );
}
break;
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
index 45c49dd007..19bd78df3d 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
@@ -133,9 +133,15 @@ struct tgsi_exec_labels
#define TGSI_EXEC_TEMP_PRIMITIVE_I 34
#define TGSI_EXEC_TEMP_PRIMITIVE_C 2
-#define TGSI_EXEC_TEMP_R0 35
+#define TGSI_EXEC_TEMP_THREE_I 34
+#define TGSI_EXEC_TEMP_THREE_C 3
-#define TGSI_EXEC_NUM_TEMPS (32 + 4)
+#define TGSI_EXEC_TEMP_HALF_I 35
+#define TGSI_EXEC_TEMP_HALF_C 0
+
+#define TGSI_EXEC_TEMP_R0 36
+
+#define TGSI_EXEC_NUM_TEMPS (32 + 5)
#define TGSI_EXEC_NUM_ADDRS 1
#define TGSI_EXEC_NUM_IMMEDIATES 256
@@ -166,7 +172,7 @@ struct tgsi_exec_machine
float Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
unsigned ImmLimit;
- float (*Consts)[4];
+ const float (*Consts)[4];
struct tgsi_exec_vector *Inputs;
struct tgsi_exec_vector *Outputs;
const struct tgsi_token *Tokens;
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index 4e80597b3f..8018bd7fa4 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -34,115 +34,14 @@
#include "rtasm/rtasm_x86sse.h"
-#if defined(__i386__) || defined(__386__)
+#ifdef PIPE_ARCH_X86
-#define DUMP_SSE 0
-
-#if DUMP_SSE
-
-static void
-_print_reg(
- struct x86_reg reg )
-{
- if (reg.mod != mod_REG)
- debug_printf( "[" );
-
- switch( reg.file ) {
- case file_REG32:
- switch( reg.idx ) {
- case reg_AX:
- debug_printf( "EAX" );
- break;
- case reg_CX:
- debug_printf( "ECX" );
- break;
- case reg_DX:
- debug_printf( "EDX" );
- break;
- case reg_BX:
- debug_printf( "EBX" );
- break;
- case reg_SP:
- debug_printf( "ESP" );
- break;
- case reg_BP:
- debug_printf( "EBP" );
- break;
- case reg_SI:
- debug_printf( "ESI" );
- break;
- case reg_DI:
- debug_printf( "EDI" );
- break;
- }
- break;
- case file_MMX:
- assert( 0 );
- break;
- case file_XMM:
- debug_printf( "XMM%u", reg.idx );
- break;
- case file_x87:
- assert( 0 );
- break;
- }
-
- if (reg.mod == mod_DISP8 ||
- reg.mod == mod_DISP32)
- debug_printf("+%d", reg.disp);
-
- if (reg.mod != mod_REG)
- debug_printf( "]" );
-}
-
-static void
-_fill(
- const char *op )
-{
- unsigned count = 10 - strlen( op );
-
- while( count-- ) {
- debug_printf( " " );
- }
-}
-
-#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
-#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) debug_printf( "\n%s", OP )
-#define DUMP_I( OP, I ) do {\
- debug_printf( "\n%s", OP );\
- _fill( OP );\
- debug_printf( "%u", I ); } while( 0 )
-#define DUMP_R( OP, R0 ) do {\
- debug_printf( "\n%s", OP );\
- _fill( OP );\
- _print_reg( R0 ); } while( 0 )
-#define DUMP_RR( OP, R0, R1 ) do {\
- debug_printf( "\n%s", OP );\
- _fill( OP );\
- _print_reg( R0 );\
- debug_printf( ", " );\
- _print_reg( R1 ); } while( 0 )
-#define DUMP_RRI( OP, R0, R1, I ) do {\
- debug_printf( "\n%s", OP );\
- _fill( OP );\
- _print_reg( R0 );\
- debug_printf( ", " );\
- _print_reg( R1 );\
- debug_printf( ", " );\
- debug_printf( "%u", I ); } while( 0 )
-
-#else
-
-#define DUMP_START()
-#define DUMP_END()
-#define DUMP( OP )
-#define DUMP_I( OP, I )
-#define DUMP_R( OP, R0 )
-#define DUMP_RR( OP, R0, R1 )
-#define DUMP_RRI( OP, R0, R1, I )
+/* for 1/sqrt()
+ *
+ * This costs about 100fps (close to 10%) in gears:
+ */
+#define HIGH_PRECISION 1
-#endif
#define FOR_EACH_CHANNEL( CHAN )\
for( CHAN = 0; CHAN < 4; CHAN++ )
@@ -208,15 +107,9 @@ get_output_base( void )
static struct x86_reg
get_temp_base( void )
{
-#ifdef WIN32
return x86_make_reg(
file_REG32,
reg_BX );
-#else
- return x86_make_reg(
- file_REG32,
- reg_SI );
-#endif
}
static struct x86_reg
@@ -225,17 +118,28 @@ get_coef_base( void )
return get_output_base();
}
+static struct x86_reg
+get_immediate_base( void )
+{
+ return x86_make_reg(
+ file_REG32,
+ reg_DI );
+}
+
+
/**
* Data access helpers.
*/
+
static struct x86_reg
-get_argument(
- unsigned index )
+get_immediate(
+ unsigned vec,
+ unsigned chan )
{
return x86_make_disp(
- x86_make_reg( file_REG32, reg_SP ),
- (index + 1) * 4 );
+ get_immediate_base(),
+ (vec * 4 + chan) * 4 );
}
static struct x86_reg
@@ -289,200 +193,6 @@ get_coef(
((vec * 3 + member) * 4 + chan) * 4 );
}
-/**
- * X86 rtasm wrappers.
- */
-
-static void
-emit_addps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "ADDPS", dst, src );
- sse_addps( func, dst, src );
-}
-
-static void
-emit_andnps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "ANDNPS", dst, src );
- sse_andnps( func, dst, src );
-}
-
-static void
-emit_andps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "ANDPS", dst, src );
- sse_andps( func, dst, src );
-}
-
-static void
-emit_call(
- struct x86_function *func,
- void (* addr)() )
-{
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
- DUMP_I( "CALL", addr );
- x86_mov_reg_imm( func, ecx, (unsigned long) addr );
- x86_call( func, ecx );
-}
-
-static void
-emit_cmpps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src,
- enum sse_cc cc )
-{
- DUMP_RRI( "CMPPS", dst, src, cc );
- sse_cmpps( func, dst, src, cc );
-}
-
-static void
-emit_cvttps2dq(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "CVTTPS2DQ", dst, src );
- sse2_cvttps2dq( func, dst, src );
-}
-
-static void
-emit_maxps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MAXPS", dst, src );
- sse_maxps( func, dst, src );
-}
-
-static void
-emit_minps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MINPS", dst, src );
- sse_minps( func, dst, src );
-}
-
-static void
-emit_mov(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MOV", dst, src );
- x86_mov( func, dst, src );
-}
-
-static void
-emit_movaps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MOVAPS", dst, src );
- sse_movaps( func, dst, src );
-}
-
-static void
-emit_movss(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MOVSS", dst, src );
- sse_movss( func, dst, src );
-}
-
-static void
-emit_movups(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MOVUPS", dst, src );
- sse_movups( func, dst, src );
-}
-
-static void
-emit_mulps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MULPS", dst, src );
- sse_mulps( func, dst, src );
-}
-
-static void
-emit_or(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "OR", dst, src );
- x86_or( func, dst, src );
-}
-
-static void
-emit_orps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "ORPS", dst, src );
- sse_orps( func, dst, src );
-}
-
-static void
-emit_pmovmskb(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "PMOVMSKB", dst, src );
- sse_pmovmskb( func, dst, src );
-}
-
-static void
-emit_pop(
- struct x86_function *func,
- struct x86_reg dst )
-{
- DUMP_R( "POP", dst );
- x86_pop( func, dst );
-}
-
-static void
-emit_push(
- struct x86_function *func,
- struct x86_reg dst )
-{
- DUMP_R( "PUSH", dst );
- x86_push( func, dst );
-}
-
-static void
-emit_rcpps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "RCPPS", dst, src );
- sse2_rcpps( func, dst, src );
-}
#ifdef WIN32
static void
@@ -490,7 +200,6 @@ emit_retw(
struct x86_function *func,
unsigned size )
{
- DUMP_I( "RET", size );
x86_retw( func, size );
}
#else
@@ -498,56 +207,21 @@ static void
emit_ret(
struct x86_function *func )
{
- DUMP( "RET" );
x86_ret( func );
}
#endif
-static void
-emit_rsqrtps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "RSQRTPS", dst, src );
- sse_rsqrtps( func, dst, src );
-}
-
-static void
-emit_shufps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src,
- unsigned char shuf )
-{
- DUMP_RRI( "SHUFPS", dst, src, shuf );
- sse_shufps( func, dst, src, shuf );
-}
-
-static void
-emit_subps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "SUBPS", dst, src );
- sse_subps( func, dst, src );
-}
-
-static void
-emit_xorps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "XORPS", dst, src );
- sse_xorps( func, dst, src );
-}
/**
* Data fetch helpers.
*/
+/**
+ * Copy a shader constant to xmm register
+ * \param xmm the destination xmm register
+ * \param vec the src const buffer index
+ * \param chan src channel to fetch (X, Y, Z or W)
+ */
static void
emit_const(
struct x86_function *func,
@@ -555,11 +229,11 @@ emit_const(
unsigned vec,
unsigned chan )
{
- emit_movss(
+ sse_movss(
func,
make_xmm( xmm ),
get_const( vec, chan ) );
- emit_shufps(
+ sse_shufps(
func,
make_xmm( xmm ),
make_xmm( xmm ),
@@ -567,18 +241,49 @@ emit_const(
}
static void
+emit_immediate(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+{
+ sse_movss(
+ func,
+ make_xmm( xmm ),
+ get_immediate( vec, chan ) );
+ sse_shufps(
+ func,
+ make_xmm( xmm ),
+ make_xmm( xmm ),
+ SHUF( 0, 0, 0, 0 ) );
+}
+
+
+/**
+ * Copy a shader input to xmm register
+ * \param xmm the destination xmm register
+ * \param vec the src input attrib
+ * \param chan src channel to fetch (X, Y, Z or W)
+ */
+static void
emit_inputf(
struct x86_function *func,
unsigned xmm,
unsigned vec,
unsigned chan )
{
- emit_movups(
+ sse_movups(
func,
make_xmm( xmm ),
get_input( vec, chan ) );
}
+/**
+ * Store an xmm register to a shader output
+ * \param xmm the source xmm register
+ * \param vec the dest output attrib
+ * \param chan src dest channel to store (X, Y, Z or W)
+ */
static void
emit_output(
struct x86_function *func,
@@ -586,12 +291,18 @@ emit_output(
unsigned vec,
unsigned chan )
{
- emit_movups(
+ sse_movups(
func,
get_output( vec, chan ),
make_xmm( xmm ) );
}
+/**
+ * Copy a shader temporary to xmm register
+ * \param xmm the destination xmm register
+ * \param vec the src temp register
+ * \param chan src channel to fetch (X, Y, Z or W)
+ */
static void
emit_tempf(
struct x86_function *func,
@@ -599,12 +310,19 @@ emit_tempf(
unsigned vec,
unsigned chan )
{
- emit_movaps(
+ sse_movaps(
func,
make_xmm( xmm ),
get_temp( vec, chan ) );
}
+/**
+ * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
+ * \param xmm the destination xmm register
+ * \param vec the src input/attribute coefficient index
+ * \param chan src channel to fetch (X, Y, Z or W)
+ * \param member 0=a0, 1=dadx, 2=dady
+ */
static void
emit_coef(
struct x86_function *func,
@@ -613,11 +331,11 @@ emit_coef(
unsigned chan,
unsigned member )
{
- emit_movss(
+ sse_movss(
func,
make_xmm( xmm ),
get_coef( vec, chan, member ) );
- emit_shufps(
+ sse_shufps(
func,
make_xmm( xmm ),
make_xmm( xmm ),
@@ -635,7 +353,7 @@ emit_inputs(
unsigned vec,
unsigned chan )
{
- emit_movups(
+ sse_movups(
func,
get_input( vec, chan ),
make_xmm( xmm ) );
@@ -648,7 +366,7 @@ emit_temps(
unsigned vec,
unsigned chan )
{
- emit_movaps(
+ sse_movaps(
func,
get_temp( vec, chan ),
make_xmm( xmm ) );
@@ -725,41 +443,32 @@ static void
emit_push_gp(
struct x86_function *func )
{
- emit_push(
+ x86_push(
func,
- get_const_base() );
- emit_push(
+ x86_make_reg( file_REG32, reg_AX) );
+ x86_push(
func,
- get_input_base() );
- emit_push(
+ x86_make_reg( file_REG32, reg_CX) );
+ x86_push(
func,
- get_output_base() );
-
- /* It is important on non-win32 platforms that temp base is pushed last.
- */
- emit_push(
- func,
- get_temp_base() );
+ x86_make_reg( file_REG32, reg_DX) );
}
static void
-emit_pop_gp(
+x86_pop_gp(
struct x86_function *func )
{
/* Restore GP registers in a reverse order.
*/
- emit_pop(
- func,
- get_temp_base() );
- emit_pop(
+ x86_pop(
func,
- get_output_base() );
- emit_pop(
+ x86_make_reg( file_REG32, reg_DX) );
+ x86_pop(
func,
- get_input_base() );
- emit_pop(
+ x86_make_reg( file_REG32, reg_CX) );
+ x86_pop(
func,
- get_const_base() );
+ x86_make_reg( file_REG32, reg_AX) );
}
static void
@@ -768,7 +477,7 @@ emit_func_call_dst(
unsigned xmm_dst,
void (*code)() )
{
- emit_movaps(
+ sse_movaps(
func,
get_temp( TEMP_R0, 0 ),
make_xmm( xmm_dst ) );
@@ -776,20 +485,27 @@ emit_func_call_dst(
emit_push_gp(
func );
-#ifdef WIN32
- emit_push(
- func,
- get_temp( TEMP_R0, 0 ) );
+ {
+ struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+
+ x86_lea(
+ func,
+ ecx,
+ get_temp( TEMP_R0, 0 ) );
+
+ x86_push( func, ecx );
+ x86_mov_reg_imm( func, ecx, (unsigned long) code );
+ x86_call( func, ecx );
+#ifndef WIN32
+ x86_pop(func, ecx );
#endif
+ }
- emit_call(
- func,
- code );
- emit_pop_gp(
+ x86_pop_gp(
func );
- emit_movaps(
+ sse_movaps(
func,
make_xmm( xmm_dst ),
get_temp( TEMP_R0, 0 ) );
@@ -802,7 +518,7 @@ emit_func_call_dst_src(
unsigned xmm_src,
void (*code)() )
{
- emit_movaps(
+ sse_movaps(
func,
get_temp( TEMP_R0, 1 ),
make_xmm( xmm_src ) );
@@ -822,7 +538,7 @@ emit_abs(
struct x86_function *func,
unsigned xmm )
{
- emit_andps(
+ sse_andps(
func,
make_xmm( xmm ),
get_temp(
@@ -836,7 +552,7 @@ emit_add(
unsigned xmm_dst,
unsigned xmm_src )
{
- emit_addps(
+ sse_addps(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
@@ -846,18 +562,12 @@ static void XSTDCALL
cos4f(
float *store )
{
-#ifdef WIN32
- store[0] = (float) cos( (double) store[0] );
- store[1] = (float) cos( (double) store[1] );
- store[2] = (float) cos( (double) store[2] );
- store[3] = (float) cos( (double) store[3] );
-#else
- const unsigned X = TEMP_R0 * 16;
+ const unsigned X = 0;
+
store[X + 0] = cosf( store[X + 0] );
store[X + 1] = cosf( store[X + 1] );
store[X + 2] = cosf( store[X + 2] );
store[X + 3] = cosf( store[X + 3] );
-#endif
}
static void
@@ -875,18 +585,12 @@ static void XSTDCALL
ex24f(
float *store )
{
-#ifdef WIN32
- store[0] = (float) pow( 2.0, (double) store[0] );
- store[1] = (float) pow( 2.0, (double) store[1] );
- store[2] = (float) pow( 2.0, (double) store[2] );
- store[3] = (float) pow( 2.0, (double) store[3] );
-#else
- const unsigned X = TEMP_R0 * 16;
+ const unsigned X = 0;
+
store[X + 0] = powf( 2.0f, store[X + 0] );
store[X + 1] = powf( 2.0f, store[X + 1] );
store[X + 2] = powf( 2.0f, store[X + 2] );
store[X + 3] = powf( 2.0f, store[X + 3] );
-#endif
}
static void
@@ -905,7 +609,7 @@ emit_f2it(
struct x86_function *func,
unsigned xmm )
{
- emit_cvttps2dq(
+ sse2_cvttps2dq(
func,
make_xmm( xmm ),
make_xmm( xmm ) );
@@ -915,15 +619,12 @@ static void XSTDCALL
flr4f(
float *store )
{
-#ifdef WIN32
const unsigned X = 0;
-#else
- const unsigned X = TEMP_R0 * 16;
-#endif
- store[X + 0] = (float) floor( (double) store[X + 0] );
- store[X + 1] = (float) floor( (double) store[X + 1] );
- store[X + 2] = (float) floor( (double) store[X + 2] );
- store[X + 3] = (float) floor( (double) store[X + 3] );
+
+ store[X + 0] = floorf( store[X + 0] );
+ store[X + 1] = floorf( store[X + 1] );
+ store[X + 2] = floorf( store[X + 2] );
+ store[X + 3] = floorf( store[X + 3] );
}
static void
@@ -941,15 +642,12 @@ static void XSTDCALL
frc4f(
float *store )
{
-#ifdef WIN32
const unsigned X = 0;
-#else
- const unsigned X = TEMP_R0 * 16;
-#endif
- store[X + 0] -= (float) floor( (double) store[X + 0] );
- store[X + 1] -= (float) floor( (double) store[X + 1] );
- store[X + 2] -= (float) floor( (double) store[X + 2] );
- store[X + 3] -= (float) floor( (double) store[X + 3] );
+
+ store[X + 0] -= floorf( store[X + 0] );
+ store[X + 1] -= floorf( store[X + 1] );
+ store[X + 2] -= floorf( store[X + 2] );
+ store[X + 3] -= floorf( store[X + 3] );
}
static void
@@ -967,11 +665,8 @@ static void XSTDCALL
lg24f(
float *store )
{
-#ifdef WIN32
const unsigned X = 0;
-#else
- const unsigned X = TEMP_R0 * 16;
-#endif
+
store[X + 0] = LOG2( store[X + 0] );
store[X + 1] = LOG2( store[X + 1] );
store[X + 2] = LOG2( store[X + 2] );
@@ -995,7 +690,7 @@ emit_MOV(
unsigned xmm_dst,
unsigned xmm_src )
{
- emit_movups(
+ sse_movups(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
@@ -1006,7 +701,7 @@ emit_mul (struct x86_function *func,
unsigned xmm_dst,
unsigned xmm_src)
{
- emit_mulps(
+ sse_mulps(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
@@ -1017,7 +712,7 @@ emit_neg(
struct x86_function *func,
unsigned xmm )
{
- emit_xorps(
+ sse_xorps(
func,
make_xmm( xmm ),
get_temp(
@@ -1029,18 +724,12 @@ static void XSTDCALL
pow4f(
float *store )
{
-#ifdef WIN32
- store[0] = (float) pow( (double) store[0], (double) store[4] );
- store[1] = (float) pow( (double) store[1], (double) store[5] );
- store[2] = (float) pow( (double) store[2], (double) store[6] );
- store[3] = (float) pow( (double) store[3], (double) store[7] );
-#else
- const unsigned X = TEMP_R0 * 16;
+ const unsigned X = 0;
+
store[X + 0] = powf( store[X + 0], store[X + 4] );
store[X + 1] = powf( store[X + 1], store[X + 5] );
store[X + 2] = powf( store[X + 2], store[X + 6] );
store[X + 3] = powf( store[X + 3], store[X + 7] );
-#endif
}
static void
@@ -1062,7 +751,11 @@ emit_rcp (
unsigned xmm_dst,
unsigned xmm_src )
{
- emit_rcpps(
+ /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+ * good enough. Need to either emit a proper divide or use the
+ * iterative technique described below in emit_rsqrt().
+ */
+ sse2_rcpps(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
@@ -1074,10 +767,44 @@ emit_rsqrt(
unsigned xmm_dst,
unsigned xmm_src )
{
- emit_rsqrtps(
+#if HIGH_PRECISION
+ /* Although rsqrtps() and rcpps() are low precision on some/all SSE
+ * implementations, it is possible to improve its precision at
+ * fairly low cost, using a newton/raphson step, as below:
+ *
+ * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+ * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+ *
+ * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+ */
+ {
+ struct x86_reg dst = make_xmm( xmm_dst );
+ struct x86_reg src = make_xmm( xmm_src );
+ struct x86_reg tmp0 = make_xmm( 2 );
+ struct x86_reg tmp1 = make_xmm( 3 );
+
+ assert( xmm_dst != xmm_src );
+ assert( xmm_dst != 2 && xmm_dst != 3 );
+ assert( xmm_src != 2 && xmm_src != 3 );
+
+ sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
+ sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
+ sse_rsqrtps( func, tmp1, src );
+ sse_mulps( func, src, tmp1 );
+ sse_mulps( func, dst, tmp1 );
+ sse_mulps( func, src, tmp1 );
+ sse_subps( func, tmp0, src );
+ sse_mulps( func, dst, tmp0 );
+ }
+#else
+ /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+ * good enough.
+ */
+ sse_rsqrtps(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
+#endif
}
static void
@@ -1085,7 +812,7 @@ emit_setsign(
struct x86_function *func,
unsigned xmm )
{
- emit_orps(
+ sse_orps(
func,
make_xmm( xmm ),
get_temp(
@@ -1097,18 +824,12 @@ static void XSTDCALL
sin4f(
float *store )
{
-#ifdef WIN32
- store[0] = (float) sin( (double) store[0] );
- store[1] = (float) sin( (double) store[1] );
- store[2] = (float) sin( (double) store[2] );
- store[3] = (float) sin( (double) store[3] );
-#else
- const unsigned X = TEMP_R0 * 16;
+ const unsigned X = 0;
+
store[X + 0] = sinf( store[X + 0] );
store[X + 1] = sinf( store[X + 1] );
store[X + 2] = sinf( store[X + 2] );
store[X + 3] = sinf( store[X + 3] );
-#endif
}
static void
@@ -1127,7 +848,7 @@ emit_sub(
unsigned xmm_dst,
unsigned xmm_src )
{
- emit_subps(
+ sse_subps(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
@@ -1160,6 +881,14 @@ emit_fetch(
swizzle );
break;
+ case TGSI_FILE_IMMEDIATE:
+ emit_immediate(
+ func,
+ xmm,
+ reg->SrcRegister.Index,
+ swizzle );
+ break;
+
case TGSI_FILE_INPUT:
emit_inputf(
func,
@@ -1328,16 +1057,16 @@ emit_kil(
}
}
- emit_push(
+ x86_push(
func,
x86_make_reg( file_REG32, reg_AX ) );
- emit_push(
+ x86_push(
func,
x86_make_reg( file_REG32, reg_DX ) );
FOR_EACH_CHANNEL( chan_index ) {
if( uniquemask & (1 << chan_index) ) {
- emit_cmpps(
+ sse_cmpps(
func,
make_xmm( registers[chan_index] ),
get_temp(
@@ -1346,17 +1075,17 @@ emit_kil(
cc_LessThan );
if( chan_index == firstchan ) {
- emit_pmovmskb(
+ sse_pmovmskb(
func,
x86_make_reg( file_REG32, reg_AX ),
make_xmm( registers[chan_index] ) );
}
else {
- emit_pmovmskb(
+ sse_pmovmskb(
func,
x86_make_reg( file_REG32, reg_DX ),
make_xmm( registers[chan_index] ) );
- emit_or(
+ x86_or(
func,
x86_make_reg( file_REG32, reg_AX ),
x86_make_reg( file_REG32, reg_DX ) );
@@ -1364,17 +1093,17 @@ emit_kil(
}
}
- emit_or(
+ x86_or(
func,
get_temp(
TGSI_EXEC_TEMP_KILMASK_I,
TGSI_EXEC_TEMP_KILMASK_C ),
x86_make_reg( file_REG32, reg_AX ) );
- emit_pop(
+ x86_pop(
func,
x86_make_reg( file_REG32, reg_DX ) );
- emit_pop(
+ x86_pop(
func,
x86_make_reg( file_REG32, reg_AX ) );
}
@@ -1390,12 +1119,12 @@ emit_setcc(
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
FETCH( func, *inst, 1, 1, chan_index );
- emit_cmpps(
+ sse_cmpps(
func,
make_xmm( 0 ),
make_xmm( 1 ),
cc );
- emit_andps(
+ sse_andps(
func,
make_xmm( 0 ),
get_temp(
@@ -1416,22 +1145,22 @@ emit_cmp(
FETCH( func, *inst, 0, 0, chan_index );
FETCH( func, *inst, 1, 1, chan_index );
FETCH( func, *inst, 2, 2, chan_index );
- emit_cmpps(
+ sse_cmpps(
func,
make_xmm( 0 ),
get_temp(
TGSI_EXEC_TEMP_00000000_I,
TGSI_EXEC_TEMP_00000000_C ),
cc_LessThan );
- emit_andps(
+ sse_andps(
func,
make_xmm( 1 ),
make_xmm( 0 ) );
- emit_andnps(
+ sse_andnps(
func,
make_xmm( 0 ),
make_xmm( 2 ) );
- emit_orps(
+ sse_orps(
func,
make_xmm( 0 ),
make_xmm( 1 ) );
@@ -1448,11 +1177,16 @@ emit_instruction(
switch( inst->Instruction.Opcode ) {
case TGSI_OPCODE_ARL:
+#if 0
+ /* XXX this isn't working properly (see glean vertProg1 test) */
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
emit_f2it( func, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
+#else
+ return 0;
+#endif
break;
case TGSI_OPCODE_MOV:
@@ -1482,7 +1216,7 @@ emit_instruction(
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_maxps(
+ sse_maxps(
func,
make_xmm( 0 ),
get_temp(
@@ -1491,21 +1225,26 @@ emit_instruction(
STORE( func, *inst, 0, 0, CHAN_Y );
}
if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+ /* XMM[1] = SrcReg[0].yyyy */
FETCH( func, *inst, 1, 0, CHAN_Y );
- emit_maxps(
+ /* XMM[1] = max(XMM[1], 0) */
+ sse_maxps(
func,
make_xmm( 1 ),
get_temp(
TGSI_EXEC_TEMP_00000000_I,
TGSI_EXEC_TEMP_00000000_C ) );
+ /* XMM[2] = SrcReg[0].wwww */
FETCH( func, *inst, 2, 0, CHAN_W );
- emit_minps(
+ /* XMM[2] = min(XMM[2], 128.0) */
+ sse_minps(
func,
make_xmm( 2 ),
get_temp(
TGSI_EXEC_TEMP_128_I,
TGSI_EXEC_TEMP_128_C ) );
- emit_maxps(
+ /* XMM[2] = max(XMM[2], -128.0) */
+ sse_maxps(
func,
make_xmm( 2 ),
get_temp(
@@ -1513,16 +1252,16 @@ emit_instruction(
TGSI_EXEC_TEMP_MINUS_128_C ) );
emit_pow( func, 1, 2 );
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_xorps(
+ sse_xorps(
func,
make_xmm( 2 ),
make_xmm( 2 ) );
- emit_cmpps(
+ sse_cmpps(
func,
make_xmm( 2 ),
make_xmm( 0 ),
cc_LessThanEqual );
- emit_andps(
+ sse_andps(
func,
make_xmm( 2 ),
make_xmm( 1 ) );
@@ -1543,9 +1282,9 @@ emit_instruction(
case TGSI_OPCODE_RSQ:
/* TGSI_OPCODE_RECIPSQRT */
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_rsqrt( func, 0, 0 );
+ emit_rsqrt( func, 1, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
- STORE( func, *inst, 0, 0, chan_index );
+ STORE( func, *inst, 1, 0, chan_index );
}
break;
@@ -1644,7 +1383,7 @@ emit_instruction(
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
FETCH( func, *inst, 1, 1, chan_index );
- emit_minps(
+ sse_minps(
func,
make_xmm( 0 ),
make_xmm( 1 ) );
@@ -1656,7 +1395,7 @@ emit_instruction(
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
FETCH( func, *inst, 1, 1, chan_index );
- emit_maxps(
+ sse_maxps(
func,
make_xmm( 0 ),
make_xmm( 1 ) );
@@ -1997,7 +1736,6 @@ emit_instruction(
break;
case TGSI_OPCODE_RET:
- case TGSI_OPCODE_END:
#ifdef WIN32
emit_retw( func, 16 );
#else
@@ -2005,6 +1743,9 @@ emit_instruction(
#endif
break;
+ case TGSI_OPCODE_END:
+ break;
+
case TGSI_OPCODE_SSG:
return 0;
break;
@@ -2020,7 +1761,7 @@ emit_instruction(
STORE( func, *inst, 0, 0, CHAN_X );
}
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
- FETCH( func, *inst, 0, 0, CHAN_Y );
+ FETCH( func, *inst, 0, 0, CHAN_X );
emit_sin( func, 0 );
STORE( func, *inst, 0, 0, CHAN_Y );
}
@@ -2236,149 +1977,289 @@ emit_declaration(
}
}
-unsigned
-tgsi_emit_sse2(
- struct tgsi_token *tokens,
- struct x86_function *func )
-{
- struct tgsi_parse_context parse;
- unsigned ok = 1;
-
- DUMP_START();
-
- func->csr = func->store;
-
- emit_mov(
- func,
- get_input_base(),
- get_argument( 0 ) );
- emit_mov(
- func,
- get_output_base(),
- get_argument( 1 ) );
- emit_mov(
- func,
- get_const_base(),
- get_argument( 2 ) );
- emit_mov(
- func,
- get_temp_base(),
- get_argument( 3 ) );
-
- tgsi_parse_init( &parse, tokens );
-
- while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
- tgsi_parse_token( &parse );
-
- switch( parse.FullToken.Token.Type ) {
- case TGSI_TOKEN_TYPE_DECLARATION:
- break;
-
- case TGSI_TOKEN_TYPE_INSTRUCTION:
- ok = emit_instruction(
- func,
- &parse.FullToken.FullInstruction );
-
- if (!ok) {
- debug_printf("failed to translate tgsi opcode %d to SSE\n",
- parse.FullToken.FullInstruction.Instruction.Opcode );
- }
- break;
-
- case TGSI_TOKEN_TYPE_IMMEDIATE:
- /* XXX implement this */
- ok = 0;
- debug_printf("failed to emit immediate value to SSE\n");
- break;
-
- default:
- assert( 0 );
- ok = 0;
- break;
- }
+static void aos_to_soa( struct x86_function *func,
+ uint arg_aos,
+ uint arg_soa,
+ uint arg_num,
+ uint arg_stride )
+{
+ struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
+ struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
+ struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
+ struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
+ int inner_loop;
+
+
+ /* Save EBX */
+ x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+ x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
+ x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
+ x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
+ x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
+
+ /* do */
+ inner_loop = x86_get_label( func );
+ {
+ x86_push( func, aos_input );
+ sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+ sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+ x86_add( func, aos_input, stride );
+ sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+ sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+ x86_add( func, aos_input, stride );
+ sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+ sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+ x86_add( func, aos_input, stride );
+ sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+ sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+ x86_pop( func, aos_input );
+
+ sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+ sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+ sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
+ sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
+ sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
+ sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
+
+ sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
+ sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
+ sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
+ sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
+
+ /* Advance to next input */
+ x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
+ x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
}
+ /* while --num_inputs */
+ x86_dec( func, num_inputs );
+ x86_jcc( func, cc_NE, inner_loop );
+
+ /* Restore EBX */
+ x86_pop( func, aos_input );
+}
+
+static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+ struct x86_reg soa_output;
+ struct x86_reg aos_output;
+ struct x86_reg num_outputs;
+ struct x86_reg temp;
+ int inner_loop;
+
+ soa_output = x86_make_reg( file_REG32, reg_AX );
+ aos_output = x86_make_reg( file_REG32, reg_BX );
+ num_outputs = x86_make_reg( file_REG32, reg_CX );
+ temp = x86_make_reg( file_REG32, reg_DX );
+
+ /* Save EBX */
+ x86_push( func, aos_output );
+
+ x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
+ x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
+ x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+
+ /* do */
+ inner_loop = x86_get_label( func );
+ {
+ sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
+ sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
+ sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
+ sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
+
+ sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+ sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+ sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
+ sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
+ sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
+ sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
+
+ x86_mov( func, temp, x86_fn_arg( func, stride ) );
+ x86_push( func, aos_output );
+ sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+ sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+ x86_add( func, aos_output, temp );
+ sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+ sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+ x86_add( func, aos_output, temp );
+ sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+ sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+ x86_add( func, aos_output, temp );
+ sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+ sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+ x86_pop( func, aos_output );
+
+ /* Advance to next output */
+ x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
+ x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
+ }
+ /* while --num_outputs */
+ x86_dec( func, num_outputs );
+ x86_jcc( func, cc_NE, inner_loop );
- tgsi_parse_free( &parse );
-
- DUMP_END();
-
- return ok;
+ /* Restore EBX */
+ x86_pop( func, aos_output );
}
/**
- * Fragment shaders are responsible for interpolating shader inputs. Because on
- * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
- * output, const, temp and coef), the code is split into two phases --
- * DECLARATION and INSTRUCTION phase.
- * GP register holding the output argument is aliased with the coeff argument,
- * as outputs are not needed in the DECLARATION phase.
+ * Translate a TGSI vertex/fragment shader to SSE2 code.
+ * Slightly different things are done for vertex vs. fragment shaders.
+ *
+ * Note that fragment shaders are responsible for interpolating shader
+ * inputs. Because on x86 we have only 4 GP registers, and here we
+ * have 5 shader arguments (input, output, const, temp and coef), the
+ * code is split into two phases -- DECLARATION and INSTRUCTION phase.
+ * GP register holding the output argument is aliased with the coeff
+ * argument, as outputs are not needed in the DECLARATION phase.
+ *
+ * \param tokens the TGSI input shader
+ * \param func the output SSE code/function
+ * \param immediates buffer to place immediates, later passed to SSE func
+ * \param return 1 for success, 0 if translation failed
*/
unsigned
-tgsi_emit_sse2_fs(
- struct tgsi_token *tokens,
- struct x86_function *func )
+tgsi_emit_sse2(
+ const struct tgsi_token *tokens,
+ struct x86_function *func,
+ float (*immediates)[4],
+ boolean do_swizzles )
{
struct tgsi_parse_context parse;
boolean instruction_phase = FALSE;
unsigned ok = 1;
-
- DUMP_START();
+ uint num_immediates = 0;
func->csr = func->store;
- /* DECLARATION phase, do not load output argument. */
- emit_mov(
- func,
- get_input_base(),
- get_argument( 0 ) );
- emit_mov(
- func,
- get_const_base(),
- get_argument( 2 ) );
- emit_mov(
+ tgsi_parse_init( &parse, tokens );
+
+ /* Can't just use EDI, EBX without save/restoring them:
+ */
+ x86_push(
func,
- get_temp_base(),
- get_argument( 3 ) );
- emit_mov(
+ get_immediate_base() );
+
+ x86_push(
func,
- get_coef_base(),
- get_argument( 4 ) );
+ get_temp_base() );
- tgsi_parse_init( &parse, tokens );
+
+ /*
+ * Different function args for vertex/fragment shaders:
+ */
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+ /* DECLARATION phase, do not load output argument. */
+ x86_mov(
+ func,
+ get_input_base(),
+ x86_fn_arg( func, 1 ) );
+ /* skipping outputs argument here */
+ x86_mov(
+ func,
+ get_const_base(),
+ x86_fn_arg( func, 3 ) );
+ x86_mov(
+ func,
+ get_temp_base(),
+ x86_fn_arg( func, 4 ) );
+ x86_mov(
+ func,
+ get_coef_base(),
+ x86_fn_arg( func, 5 ) );
+ x86_mov(
+ func,
+ get_immediate_base(),
+ x86_fn_arg( func, 6 ) );
+ }
+ else {
+ assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
+
+ if (do_swizzles)
+ aos_to_soa( func,
+ 6, /* aos_input */
+ 1, /* machine->input */
+ 7, /* num_inputs */
+ 8 ); /* input_stride */
+
+ x86_mov(
+ func,
+ get_input_base(),
+ x86_fn_arg( func, 1 ) );
+ x86_mov(
+ func,
+ get_output_base(),
+ x86_fn_arg( func, 2 ) );
+ x86_mov(
+ func,
+ get_const_base(),
+ x86_fn_arg( func, 3 ) );
+ x86_mov(
+ func,
+ get_temp_base(),
+ x86_fn_arg( func, 4 ) );
+ x86_mov(
+ func,
+ get_immediate_base(),
+ x86_fn_arg( func, 5 ) );
+ }
while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
tgsi_parse_token( &parse );
switch( parse.FullToken.Token.Type ) {
case TGSI_TOKEN_TYPE_DECLARATION:
- emit_declaration(
- func,
- &parse.FullToken.FullDeclaration );
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+ emit_declaration(
+ func,
+ &parse.FullToken.FullDeclaration );
+ }
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
- if( !instruction_phase ) {
- /* INSTRUCTION phase, overwrite coeff with output. */
- instruction_phase = TRUE;
- emit_mov(
- func,
- get_output_base(),
- get_argument( 1 ) );
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+ if( !instruction_phase ) {
+ /* INSTRUCTION phase, overwrite coeff with output. */
+ instruction_phase = TRUE;
+ x86_mov(
+ func,
+ get_output_base(),
+ x86_fn_arg( func, 2 ) );
+ }
}
+
ok = emit_instruction(
func,
&parse.FullToken.FullInstruction );
if (!ok) {
- debug_printf("failed to translate tgsi opcode %d to SSE\n",
- parse.FullToken.FullInstruction.Instruction.Opcode );
+ debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
+ parse.FullToken.FullInstruction.Instruction.Opcode,
+ parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+ "vertex shader" : "fragment shader");
}
break;
case TGSI_TOKEN_TYPE_IMMEDIATE:
- /* XXX implement this */
- ok = 0;
- debug_printf("failed to emit immediate value to SSE\n");
+ /* simply copy the immediate values into the next immediates[] slot */
+ {
+ const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+ uint i;
+ assert(size <= 4);
+ assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
+ for( i = 0; i < size; i++ ) {
+ immediates[num_immediates][i] =
+ parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+ }
+#if 0
+ debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
+ num_immediates,
+ immediates[num_immediates][0],
+ immediates[num_immediates][1],
+ immediates[num_immediates][2],
+ immediates[num_immediates][3]);
+#endif
+ num_immediates++;
+ }
break;
default:
@@ -2387,11 +2268,30 @@ tgsi_emit_sse2_fs(
}
}
- tgsi_parse_free( &parse );
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
+ if (do_swizzles)
+ soa_to_aos( func, 9, 2, 10, 11 );
+ }
- DUMP_END();
+ /* Can't just use EBX, EDI without save/restoring them:
+ */
+ x86_pop(
+ func,
+ get_temp_base() );
+
+ x86_pop(
+ func,
+ get_immediate_base() );
+
+#ifdef WIN32
+ emit_retw( func, 16 );
+#else
+ emit_ret( func );
+#endif
+
+ tgsi_parse_free( &parse );
return ok;
}
-#endif /* i386 */
+#endif /* PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
index 63b8ef3911..e66d115283 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
@@ -10,13 +10,10 @@ struct x86_function;
unsigned
tgsi_emit_sse2(
- struct tgsi_token *tokens,
- struct x86_function *function );
-
-unsigned
-tgsi_emit_sse2_fs(
- struct tgsi_token *tokens,
- struct x86_function *function );
+ const struct tgsi_token *tokens,
+ struct x86_function *function,
+ float (*immediates)[4],
+ boolean do_swizzles );
#if defined __cplusplus
}