5 files changed, 607 insertions, 679 deletions
diff --git a/src/gallium/auxiliary/tgsi/exec/Makefile b/src/gallium/auxiliary/tgsi/exec/Makefile
new file mode 100644
index 0000000000..451911a354
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/exec/Makefile
@@ -0,0 +1,2 @@
+default:
+	cd .. ; make
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
index 78e7dec569..826b432f09 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@ -88,6 +88,10 @@
 #define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
 #define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
 #define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
+#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
+#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
+#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
 #define TEMP_R0            TGSI_EXEC_TEMP_R0
 
 #define FOR_EACH_CHANNEL(CHAN)\
@@ -262,6 +266,8 @@ tgsi_exec_machine_init(
       mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
       mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
       mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
+      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
+      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
    }
 }
 
@@ -287,10 +293,10 @@ micro_abs(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) fabs( (double) src->f[0] );
-   dst->f[1] = (float) fabs( (double) src->f[1] );
-   dst->f[2] = (float) fabs( (double) src->f[2] );
-   dst->f[3] = (float) fabs( (double) src->f[3] );
+   dst->f[0] = fabsf( src->f[0] );
+   dst->f[1] = fabsf( src->f[1] );
+   dst->f[2] = fabsf( src->f[2] );
+   dst->f[3] = fabsf( src->f[3] );
 }
 
 static void
@@ -334,10 +340,10 @@ micro_ceil(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) ceil( (double) src->f[0] );
-   dst->f[1] = (float) ceil( (double) src->f[1] );
-   dst->f[2] = (float) ceil( (double) src->f[2] );
-   dst->f[3] = (float) ceil( (double) src->f[3] );
+   dst->f[0] = ceilf( src->f[0] );
+   dst->f[1] = ceilf( src->f[1] );
+   dst->f[2] = ceilf( src->f[2] );
+   dst->f[3] = ceilf( src->f[3] );
 }
 
 static void
@@ -345,10 +351,10 @@ micro_cos(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) cos( (double) src->f[0] );
-   dst->f[1] = (float) cos( (double) src->f[1] );
-   dst->f[2] = (float) cos( (double) src->f[2] );
-   dst->f[3] = (float) cos( (double) src->f[3] );
+   dst->f[0] = cosf( src->f[0] );
+   dst->f[1] = cosf( src->f[1] );
+   dst->f[2] = cosf( src->f[2] );
+   dst->f[3] = cosf( src->f[3] );
 }
 
 static void
@@ -430,10 +436,10 @@ micro_exp2(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src)
 {
-   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
-   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
-   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
-   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+   dst->f[0] = powf( 2.0f, src->f[0] );
+   dst->f[1] = powf( 2.0f, src->f[1] );
+   dst->f[2] = powf( 2.0f, src->f[2] );
+   dst->f[3] = powf( 2.0f, src->f[3] );
 }
 
 static void
@@ -463,10 +469,10 @@ micro_flr(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) floor( (double) src->f[0] );
-   dst->f[1] = (float) floor( (double) src->f[1] );
-   dst->f[2] = (float) floor( (double) src->f[2] );
-   dst->f[3] = (float) floor( (double) src->f[3] );
+   dst->f[0] = floorf( src->f[0] );
+   dst->f[1] = floorf( src->f[1] );
+   dst->f[2] = floorf( src->f[2] );
+   dst->f[3] = floorf( src->f[3] );
 }
 
 static void
@@ -474,10 +480,10 @@ micro_frc(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
-   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
-   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
-   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+   dst->f[0] = src->f[0] - floorf( src->f[0] );
+   dst->f[1] = src->f[1] - floorf( src->f[1] );
+   dst->f[2] = src->f[2] - floorf( src->f[2] );
+   dst->f[3] = src->f[3] - floorf( src->f[3] );
 }
 
 static void
@@ -510,10 +516,24 @@ micro_lg2(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
-   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
-   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
-   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+   dst->f[0] = logf( src->f[0] ) * 1.442695f;
+   dst->f[1] = logf( src->f[1] ) * 1.442695f;
+   dst->f[2] = logf( src->f[2] ) * 1.442695f;
+   dst->f[3] = logf( src->f[3] ) * 1.442695f;
+}
+
+static void
+micro_le(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
 }
 
 static void
@@ -764,10 +784,10 @@ micro_pow(
    const union tgsi_exec_channel *src0,
    const union tgsi_exec_channel *src1 )
 {
-   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
-   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
-   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
-   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+   dst->f[0] = powf( src0->f[0], src1->f[0] );
+   dst->f[1] = powf( src0->f[1], src1->f[1] );
+   dst->f[2] = powf( src0->f[2], src1->f[2] );
+   dst->f[3] = powf( src0->f[3], src1->f[3] );
 }
 
 static void
@@ -775,10 +795,10 @@ micro_rnd(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
-   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
-   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
-   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+   dst->f[0] = floorf( src->f[0] + 0.5f );
+   dst->f[1] = floorf( src->f[1] + 0.5f );
+   dst->f[2] = floorf( src->f[2] + 0.5f );
+   dst->f[3] = floorf( src->f[3] + 0.5f );
 }
 
 static void
@@ -833,20 +853,20 @@ micro_sin(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) sin( (double) src->f[0] );
-   dst->f[1] = (float) sin( (double) src->f[1] );
-   dst->f[2] = (float) sin( (double) src->f[2] );
-   dst->f[3] = (float) sin( (double) src->f[3] );
+   dst->f[0] = sinf( src->f[0] );
+   dst->f[1] = sinf( src->f[1] );
+   dst->f[2] = sinf( src->f[2] );
+   dst->f[3] = sinf( src->f[3] );
 }
 
 static void
 micro_sqrt( union tgsi_exec_channel *dst,
             const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) sqrt( (double) src->f[0] );
-   dst->f[1] = (float) sqrt( (double) src->f[1] );
-   dst->f[2] = (float) sqrt( (double) src->f[2] );
-   dst->f[3] = (float) sqrt( (double) src->f[3] );
+   dst->f[0] = sqrtf( src->f[0] );
+   dst->f[1] = sqrtf( src->f[1] );
+   dst->f[2] = sqrtf( src->f[2] );
+   dst->f[3] = sqrtf( src->f[3] );
 }
 
 static void
@@ -1516,41 +1536,44 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_EXP:
-      debug_printf("TGSI: EXP opcode not implemented\n");
-      /* from ARB_v_p:
-      tmp = ScalarLoad(op0);
-      result.x = 2^floor(tmp);
-      result.y = tmp - floor(tmp);
-      result.z = RoughApprox2ToX(tmp);
-      result.w = 1.0;
-      */
-#if 0
-      /* something like this: */
       FETCH( &r[0], 0, CHAN_X );
-      micro_exp2( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 STORE( &r[0], 0, chan_index );
+      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
+	 STORE( &r[2], 0, CHAN_X );        /* store r2 */
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
+	 STORE( &r[2], 0, CHAN_Y );        /* store r2 */
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
+	 STORE( &r[2], 0, CHAN_Z );        /* store r2 */
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
       }
-#endif
       break;
 
    case TGSI_OPCODE_LOG:
-      debug_printf("TGSI: LOG opcode not implemented\n");
-      /* from ARB_v_p:
-      tmp = fabs(ScalarLoad(op0));
-      result.x = floor(log2(tmp));
-      result.y = tmp / 2^(floor(log2(tmp)));
-      result.z = RoughApproxLog2(tmp);
-      result.w = 1.0;
-      */
-#if 0
-      /* something like this: */
       FETCH( &r[0], 0, CHAN_X );
-      micro_lg2( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 STORE( &r[0], 0, chan_index );
+      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
+      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
+      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &r[0], 0, CHAN_X );
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
+         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 STORE( &r[1], 0, CHAN_Z );
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
       }
-#endif
       break;
 
    case TGSI_OPCODE_MUL:
@@ -1975,7 +1998,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1992,7 +2015,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
          STORE( &r[0], 0, chan_index );
       }
       break;
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
index 45c49dd007..19bd78df3d 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
@@ -133,9 +133,15 @@ struct tgsi_exec_labels
 #define TGSI_EXEC_TEMP_PRIMITIVE_I  34
 #define TGSI_EXEC_TEMP_PRIMITIVE_C  2
 
-#define TGSI_EXEC_TEMP_R0           35
+#define TGSI_EXEC_TEMP_THREE_I      34
+#define TGSI_EXEC_TEMP_THREE_C      3
 
-#define TGSI_EXEC_NUM_TEMPS   (32 + 4)
+#define TGSI_EXEC_TEMP_HALF_I       35
+#define TGSI_EXEC_TEMP_HALF_C       0
+
+#define TGSI_EXEC_TEMP_R0           36
+
+#define TGSI_EXEC_NUM_TEMPS   (32 + 5)
 #define TGSI_EXEC_NUM_ADDRS   1
 #define TGSI_EXEC_NUM_IMMEDIATES  256
 
@@ -166,7 +172,7 @@ struct tgsi_exec_machine
 
    float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
    unsigned                      ImmLimit;
-   float                         (*Consts)[4];
+   const float                   (*Consts)[4];
    struct tgsi_exec_vector       *Inputs;
    struct tgsi_exec_vector       *Outputs;
    const struct tgsi_token       *Tokens;
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index 4e80597b3f..8018bd7fa4 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -34,115 +34,14 @@
 
 #include "rtasm/rtasm_x86sse.h"
 
-#if defined(__i386__) || defined(__386__)
+#ifdef PIPE_ARCH_X86
 
-#define DUMP_SSE  0
-
-#if DUMP_SSE
-
-static void
-_print_reg(
-   struct x86_reg reg )
-{
-   if (reg.mod != mod_REG) 
-      debug_printf( "[" );
-      
-   switch( reg.file ) {
-   case file_REG32:
-      switch( reg.idx ) {
-      case reg_AX:
-         debug_printf( "EAX" );
-         break;
-      case reg_CX:
-         debug_printf( "ECX" );
-         break;
-      case reg_DX:
-         debug_printf( "EDX" );
-         break;
-      case reg_BX:
-         debug_printf( "EBX" );
-         break;
-      case reg_SP:
-         debug_printf( "ESP" );
-         break;
-      case reg_BP:
-         debug_printf( "EBP" );
-         break;
-      case reg_SI:
-         debug_printf( "ESI" );
-         break;
-      case reg_DI:
-         debug_printf( "EDI" );
-         break;
-      }
-      break;
-   case file_MMX:
-      assert( 0 );
-      break;
-   case file_XMM:
-      debug_printf( "XMM%u", reg.idx );
-      break;
-   case file_x87:
-      assert( 0 );
-      break;
-   }
-
-   if (reg.mod == mod_DISP8 ||
-       reg.mod == mod_DISP32)
-      debug_printf("+%d", reg.disp);
-
-   if (reg.mod != mod_REG) 
-      debug_printf( "]" );
-}
-
-static void
-_fill(
-   const char  *op )
-{
-   unsigned count = 10 - strlen( op );
-
-   while( count-- ) {
-      debug_printf( " " );
-   }
-}
-
-#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
-#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) debug_printf( "\n%s", OP )
-#define DUMP_I( OP, I ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   debug_printf( "%u", I ); } while( 0 )
-#define DUMP_R( OP, R0 ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 ); } while( 0 )
-#define DUMP_RR( OP, R0, R1 ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 );\
-   debug_printf( ", " );\
-   _print_reg( R1 ); } while( 0 )
-#define DUMP_RRI( OP, R0, R1, I ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 );\
-   debug_printf( ", " );\
-   _print_reg( R1 );\
-   debug_printf( ", " );\
-   debug_printf( "%u", I ); } while( 0 )
-
-#else
-
-#define DUMP_START()
-#define DUMP_END()
-#define DUMP( OP )
-#define DUMP_I( OP, I )
-#define DUMP_R( OP, R0 )
-#define DUMP_RR( OP, R0, R1 )
-#define DUMP_RRI( OP, R0, R1, I )
+/* Refine the 1/sqrt() estimate with a Newton-Raphson step (see emit_rsqrt()).
+ *
+ * This costs about 100fps (close to 10%) in gears:
+ */
+#define HIGH_PRECISION 1
 
-#endif
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for( CHAN = 0; CHAN < 4; CHAN++ )
@@ -208,15 +107,9 @@ get_output_base( void )
 static struct x86_reg
 get_temp_base( void )
 {
-#ifdef WIN32
    return x86_make_reg(
       file_REG32,
       reg_BX );
-#else
-   return x86_make_reg(
-      file_REG32,
-      reg_SI );
-#endif
 }
 
 static struct x86_reg
@@ -225,17 +118,28 @@ get_coef_base( void )
    return get_output_base();
 }
 
+static struct x86_reg
+get_immediate_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DI );
+}
+
+
 /**
  * Data access helpers.
  */
 
+
 static struct x86_reg
-get_argument(
-   unsigned index )
+get_immediate(
+   unsigned vec,
+   unsigned chan )
 {
    return x86_make_disp(
-      x86_make_reg( file_REG32, reg_SP ),
-      (index + 1) * 4 );
+      get_immediate_base(),
+      (vec * 4 + chan) * 4 );
 }
 
 static struct x86_reg
@@ -289,200 +193,6 @@ get_coef(
       ((vec * 3 + member) * 4 + chan) * 4 );
 }
 
-/**
- * X86 rtasm wrappers.
- */
-
-static void
-emit_addps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ADDPS", dst, src );
-   sse_addps( func, dst, src );
-}
-
-static void
-emit_andnps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ANDNPS", dst, src );
-   sse_andnps( func, dst, src );
-}
-
-static void
-emit_andps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ANDPS", dst, src );
-   sse_andps( func, dst, src );
-}
-
-static void
-emit_call(
-   struct x86_function  *func,
-   void                 (* addr)() )
-{
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-   DUMP_I( "CALL", addr );
-   x86_mov_reg_imm( func, ecx, (unsigned long) addr );
-   x86_call( func, ecx );
-}
-
-static void
-emit_cmpps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src,
-   enum sse_cc          cc )
-{
-   DUMP_RRI( "CMPPS", dst, src, cc );
-   sse_cmpps( func, dst, src, cc );
-}
-
-static void
-emit_cvttps2dq(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "CVTTPS2DQ", dst, src );
-   sse2_cvttps2dq( func, dst, src );
-}
-
-static void
-emit_maxps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MAXPS", dst, src );
-   sse_maxps( func, dst, src );
-}
-
-static void
-emit_minps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MINPS", dst, src );
-   sse_minps( func, dst, src );
-}
-
-static void
-emit_mov(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOV", dst, src );
-   x86_mov( func, dst, src );
-}
-
-static void
-emit_movaps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVAPS", dst, src );
-   sse_movaps( func, dst, src );
-}
-
-static void
-emit_movss(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVSS", dst, src );
-   sse_movss( func, dst, src );
-}
-
-static void
-emit_movups(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVUPS", dst, src );
-   sse_movups( func, dst, src );
-}
-
-static void
-emit_mulps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MULPS", dst, src );
-   sse_mulps( func, dst, src );
-}
-
-static void
-emit_or(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "OR", dst, src );
-   x86_or( func, dst, src );
-}
-
-static void
-emit_orps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ORPS", dst, src );
-   sse_orps( func, dst, src );
-}
-
-static void
-emit_pmovmskb(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "PMOVMSKB", dst, src );
-   sse_pmovmskb( func, dst, src );
-}
-
-static void
-emit_pop(
-   struct x86_function  *func,
-   struct x86_reg       dst )
-{
-   DUMP_R( "POP", dst );
-   x86_pop( func, dst );
-}
-
-static void
-emit_push(
-   struct x86_function  *func,
-   struct x86_reg       dst )
-{
-   DUMP_R( "PUSH", dst );
-   x86_push( func, dst );
-}
-
-static void
-emit_rcpps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "RCPPS", dst, src );
-   sse2_rcpps( func, dst, src );
-}
 
 #ifdef WIN32
 static void
@@ -490,7 +200,6 @@ emit_retw(
    struct x86_function  *func,
    unsigned             size )
 {
-   DUMP_I( "RET", size );
    x86_retw( func, size );
 }
 #else
@@ -498,56 +207,21 @@ static void
 emit_ret(
    struct x86_function  *func )
 {
-   DUMP( "RET" );
    x86_ret( func );
 }
 #endif
 
-static void
-emit_rsqrtps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "RSQRTPS", dst, src );
-   sse_rsqrtps( func, dst, src );
-}
-
-static void
-emit_shufps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src,
-   unsigned char        shuf )
-{
-   DUMP_RRI( "SHUFPS", dst, src, shuf );
-   sse_shufps( func, dst, src, shuf );
-}
-
-static void
-emit_subps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "SUBPS", dst, src );
-   sse_subps( func, dst, src );
-}
-
-static void
-emit_xorps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "XORPS", dst, src );
-   sse_xorps( func, dst, src );
-}
 
 /**
  * Data fetch helpers.
  */
 
+/**
+ * Copy a shader constant to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src const buffer index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
 static void
 emit_const(
    struct x86_function *func,
@@ -555,11 +229,11 @@ emit_const(
    unsigned vec,
    unsigned chan )
 {
-   emit_movss(
+   sse_movss(
       func,
       make_xmm( xmm ),
       get_const( vec, chan ) );
-   emit_shufps(
+   sse_shufps(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ),
@@ -567,18 +241,49 @@ emit_const(
 }
 
 static void
+emit_immediate(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_immediate( vec, chan ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+
+/**
+ * Copy a shader input to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src input attrib
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
 emit_inputf(
    struct x86_function *func,
    unsigned xmm,
    unsigned vec,
    unsigned chan )
 {
-   emit_movups(
+   sse_movups(
       func,
       make_xmm( xmm ),
       get_input( vec, chan ) );
 }
 
+/**
+ * Store an xmm register to a shader output
+ * \param xmm  the source xmm register
+ * \param vec  the dest output attrib
+ * \param chan  dest channel to store (X, Y, Z or W)
+ */
 static void
 emit_output(
    struct x86_function *func,
@@ -586,12 +291,18 @@ emit_output(
    unsigned vec,
    unsigned chan )
 {
-   emit_movups(
+   sse_movups(
       func,
       get_output( vec, chan ),
       make_xmm( xmm ) );
 }
 
+/**
+ * Copy a shader temporary to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src temp register
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
 static void
 emit_tempf(
    struct x86_function *func,
@@ -599,12 +310,19 @@ emit_tempf(
    unsigned vec,
    unsigned chan )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       make_xmm( xmm ),
       get_temp( vec, chan ) );
 }
 
+/**
+ * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
+ * \param xmm  the destination xmm register
+ * \param vec  the src input/attribute coefficient index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ * \param member  0=a0, 1=dadx, 2=dady
+ */
 static void
 emit_coef(
    struct x86_function *func,
@@ -613,11 +331,11 @@ emit_coef(
    unsigned chan,
    unsigned member )
 {
-   emit_movss(
+   sse_movss(
       func,
       make_xmm( xmm ),
       get_coef( vec, chan, member ) );
-   emit_shufps(
+   sse_shufps(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ),
@@ -635,7 +353,7 @@ emit_inputs(
    unsigned vec,
    unsigned chan )
 {
-   emit_movups(
+   sse_movups(
       func,
       get_input( vec, chan ),
       make_xmm( xmm ) );
@@ -648,7 +366,7 @@ emit_temps(
    unsigned vec,
    unsigned chan )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       get_temp( vec, chan ),
       make_xmm( xmm ) );
@@ -725,41 +443,32 @@ static void
 emit_push_gp(
    struct x86_function *func )
 {
-   emit_push(
+   x86_push(
       func,
-      get_const_base() );
-   emit_push(
+      x86_make_reg( file_REG32, reg_AX) );
+   x86_push(
       func,
-      get_input_base() );
-   emit_push(
+      x86_make_reg( file_REG32, reg_CX) );
+   x86_push(
       func,
-      get_output_base() );
-
-   /* It is important on non-win32 platforms that temp base is pushed last.
-    */
-   emit_push(
-      func,
-      get_temp_base() );
+      x86_make_reg( file_REG32, reg_DX) );
 }
 
 static void
-emit_pop_gp(
+x86_pop_gp(
    struct x86_function *func )
 {
    /* Restore GP registers in a reverse order.
     */
-   emit_pop(
-      func,
-      get_temp_base() );
-   emit_pop(
+   x86_pop(
       func,
-      get_output_base() );
-   emit_pop(
+      x86_make_reg( file_REG32, reg_DX) );
+   x86_pop(
       func,
-      get_input_base() );
-   emit_pop(
+      x86_make_reg( file_REG32, reg_CX) );
+   x86_pop(
       func,
-      get_const_base() );
+      x86_make_reg( file_REG32, reg_AX) );
 }
 
 static void
@@ -768,7 +477,7 @@ emit_func_call_dst(
    unsigned xmm_dst,
    void (*code)() )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       get_temp( TEMP_R0, 0 ),
       make_xmm( xmm_dst ) );
@@ -776,20 +485,27 @@ emit_func_call_dst(
    emit_push_gp(
       func );
 
-#ifdef WIN32
-   emit_push(
-      func,
-      get_temp( TEMP_R0, 0 ) );
+   {
+      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+
+      x86_lea(
+         func,
+         ecx,
+         get_temp( TEMP_R0, 0 ) );
+
+      x86_push( func, ecx );
+      x86_mov_reg_imm( func, ecx, (unsigned long) code );
+      x86_call( func, ecx );
+#ifndef WIN32
+      x86_pop(func, ecx ); 
 #endif
+   }
 
-   emit_call(
-      func,
-      code );
 
-   emit_pop_gp(
+   x86_pop_gp(
       func );
 
-   emit_movaps(
+   sse_movaps(
       func,
       make_xmm( xmm_dst ),
       get_temp( TEMP_R0, 0 ) );
@@ -802,7 +518,7 @@ emit_func_call_dst_src(
    unsigned xmm_src,
    void (*code)() )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       get_temp( TEMP_R0, 1 ),
       make_xmm( xmm_src ) );
@@ -822,7 +538,7 @@ emit_abs(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_andps(
+   sse_andps(
       func,
       make_xmm( xmm ),
       get_temp(
@@ -836,7 +552,7 @@ emit_add(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_addps(
+   sse_addps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -846,18 +562,12 @@ static void XSTDCALL
 cos4f(
    float *store )
 {
-#ifdef WIN32
-   store[0] = (float) cos( (double) store[0] );
-   store[1] = (float) cos( (double) store[1] );
-   store[2] = (float) cos( (double) store[2] );
-   store[3] = (float) cos( (double) store[3] );
-#else
-   const unsigned X = TEMP_R0 * 16;
+   const unsigned X = 0;
+
    store[X + 0] = cosf( store[X + 0] );
    store[X + 1] = cosf( store[X + 1] );
    store[X + 2] = cosf( store[X + 2] );
    store[X + 3] = cosf( store[X + 3] );
-#endif
 }
 
 static void
@@ -875,18 +585,12 @@ static void XSTDCALL
 ex24f(
    float *store )
 {
-#ifdef WIN32
-   store[0] = (float) pow( 2.0, (double) store[0] );
-   store[1] = (float) pow( 2.0, (double) store[1] );
-   store[2] = (float) pow( 2.0, (double) store[2] );
-   store[3] = (float) pow( 2.0, (double) store[3] );
-#else
-   const unsigned X = TEMP_R0 * 16;
+   const unsigned X = 0;
+
    store[X + 0] = powf( 2.0f, store[X + 0] );
    store[X + 1] = powf( 2.0f, store[X + 1] );
    store[X + 2] = powf( 2.0f, store[X + 2] );
    store[X + 3] = powf( 2.0f, store[X + 3] );
-#endif
 }
 
 static void
@@ -905,7 +609,7 @@ emit_f2it(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_cvttps2dq(
+   sse2_cvttps2dq(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ) );
@@ -915,15 +619,12 @@ static void XSTDCALL
 flr4f(
    float *store )
 {
-#ifdef WIN32
    const unsigned X = 0;
-#else
-   const unsigned X = TEMP_R0 * 16;
-#endif
-   store[X + 0] = (float) floor( (double) store[X + 0] );
-   store[X + 1] = (float) floor( (double) store[X + 1] );
-   store[X + 2] = (float) floor( (double) store[X + 2] );
-   store[X + 3] = (float) floor( (double) store[X + 3] );
+
+   store[X + 0] = floorf( store[X + 0] );
+   store[X + 1] = floorf( store[X + 1] );
+   store[X + 2] = floorf( store[X + 2] );
+   store[X + 3] = floorf( store[X + 3] );
 }
 
 static void
@@ -941,15 +642,12 @@ static void XSTDCALL
 frc4f(
    float *store )
 {
-#ifdef WIN32
    const unsigned X = 0;
-#else
-   const unsigned X = TEMP_R0 * 16;
-#endif
-   store[X + 0] -= (float) floor( (double) store[X + 0] );
-   store[X + 1] -= (float) floor( (double) store[X + 1] );
-   store[X + 2] -= (float) floor( (double) store[X + 2] );
-   store[X + 3] -= (float) floor( (double) store[X + 3] );
+
+   store[X + 0] -= floorf( store[X + 0] );
+   store[X + 1] -= floorf( store[X + 1] );
+   store[X + 2] -= floorf( store[X + 2] );
+   store[X + 3] -= floorf( store[X + 3] );
 }
 
 static void
@@ -967,11 +665,8 @@ static void XSTDCALL
 lg24f(
    float *store )
 {
-#ifdef WIN32
    const unsigned X = 0;
-#else
-   const unsigned X = TEMP_R0 * 16;
-#endif
+
    store[X + 0] = LOG2( store[X + 0] );
    store[X + 1] = LOG2( store[X + 1] );
    store[X + 2] = LOG2( store[X + 2] );
@@ -995,7 +690,7 @@ emit_MOV(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_movups(
+   sse_movups(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1006,7 +701,7 @@ emit_mul (struct x86_function *func,
           unsigned xmm_dst,
           unsigned xmm_src)
 {
-   emit_mulps(
+   sse_mulps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1017,7 +712,7 @@ emit_neg(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_xorps(
+   sse_xorps(
       func,
       make_xmm( xmm ),
       get_temp(
@@ -1029,18 +724,12 @@ static void XSTDCALL
 pow4f(
    float *store )
 {
-#ifdef WIN32
-   store[0] = (float) pow( (double) store[0], (double) store[4] );
-   store[1] = (float) pow( (double) store[1], (double) store[5] );
-   store[2] = (float) pow( (double) store[2], (double) store[6] );
-   store[3] = (float) pow( (double) store[3], (double) store[7] );
-#else
-   const unsigned X = TEMP_R0 * 16;
+   const unsigned X = 0;
+
    store[X + 0] = powf( store[X + 0], store[X + 4] );
    store[X + 1] = powf( store[X + 1], store[X + 5] );
    store[X + 2] = powf( store[X + 2], store[X + 6] );
    store[X + 3] = powf( store[X + 3], store[X + 7] );
-#endif
 }
 
 static void
@@ -1062,7 +751,11 @@ emit_rcp (
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_rcpps(
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.  Need to either emit a proper divide or use the
+    * iterative technique described below in emit_rsqrt().
+    */
+   sse2_rcpps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1074,10 +767,44 @@ emit_rsqrt(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_rsqrtps(
+#if HIGH_PRECISION
+   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
+    * implementations, it is possible to improve their precision at
+    * fairly low cost, using a Newton-Raphson step, as below:
+    * 
+    * rcp(a):   x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+    * rsqrt(a): x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
+    *
+    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+    */
+   {
+      struct x86_reg dst = make_xmm( xmm_dst );
+      struct x86_reg src = make_xmm( xmm_src );
+      struct x86_reg tmp0 = make_xmm( 2 );
+      struct x86_reg tmp1 = make_xmm( 3 );
+
+      assert( xmm_dst != xmm_src );
+      assert( xmm_dst != 2 && xmm_dst != 3 );
+      assert( xmm_src != 2 && xmm_src != 3 );
+
+      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
+      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
+      sse_rsqrtps( func, tmp1, src  );
+      sse_mulps(   func, src,  tmp1 );
+      sse_mulps(   func, dst,  tmp1 );
+      sse_mulps(   func, src,  tmp1 );
+      sse_subps(   func, tmp0, src  );
+      sse_mulps(   func, dst,  tmp0 );
+   }
+#else
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.
+    */
+   sse_rsqrtps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
+#endif
 }
 
 static void
@@ -1085,7 +812,7 @@ emit_setsign(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_orps(
+   sse_orps(
       func,
       make_xmm( xmm ),
       get_temp(
@@ -1097,18 +824,12 @@ static void XSTDCALL
 sin4f(
    float *store )
 {
-#ifdef WIN32
-   store[0] = (float) sin( (double) store[0] );
-   store[1] = (float) sin( (double) store[1] );
-   store[2] = (float) sin( (double) store[2] );
-   store[3] = (float) sin( (double) store[3] );
-#else
-   const unsigned X = TEMP_R0 * 16;
+   const unsigned X = 0;
+
    store[X + 0] = sinf( store[X + 0] );
    store[X + 1] = sinf( store[X + 1] );
    store[X + 2] = sinf( store[X + 2] );
    store[X + 3] = sinf( store[X + 3] );
-#endif
 }
 
 static void
@@ -1127,7 +848,7 @@ emit_sub(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_subps(
+   sse_subps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1160,6 +881,14 @@ emit_fetch(
             swizzle );
          break;
 
+      case TGSI_FILE_IMMEDIATE:
+         emit_immediate(
+            func,
+            xmm,
+            reg->SrcRegister.Index,
+            swizzle );
+         break;
+
       case TGSI_FILE_INPUT:
          emit_inputf(
             func,
@@ -1328,16 +1057,16 @@ emit_kil(
       }
    }
 
-   emit_push(
+   x86_push(
       func,
       x86_make_reg( file_REG32, reg_AX ) );
-   emit_push(
+   x86_push(
       func,
       x86_make_reg( file_REG32, reg_DX ) );
 
    FOR_EACH_CHANNEL( chan_index ) {
       if( uniquemask & (1 << chan_index) ) {
-         emit_cmpps(
+         sse_cmpps(
             func,
             make_xmm( registers[chan_index] ),
             get_temp(
@@ -1346,17 +1075,17 @@ emit_kil(
             cc_LessThan );
 
          if( chan_index == firstchan ) {
-            emit_pmovmskb(
+            sse_pmovmskb(
                func,
                x86_make_reg( file_REG32, reg_AX ),
                make_xmm( registers[chan_index] ) );
          }
          else {
-            emit_pmovmskb(
+            sse_pmovmskb(
                func,
                x86_make_reg( file_REG32, reg_DX ),
                make_xmm( registers[chan_index] ) );
-            emit_or(
+            x86_or(
                func,
                x86_make_reg( file_REG32, reg_AX ),
                x86_make_reg( file_REG32, reg_DX ) );
@@ -1364,17 +1093,17 @@ emit_kil(
       }
    }
 
-   emit_or(
+   x86_or(
       func,
       get_temp(
          TGSI_EXEC_TEMP_KILMASK_I,
          TGSI_EXEC_TEMP_KILMASK_C ),
       x86_make_reg( file_REG32, reg_AX ) );
 
-   emit_pop(
+   x86_pop(
       func,
       x86_make_reg( file_REG32, reg_DX ) );
-   emit_pop(
+   x86_pop(
       func,
       x86_make_reg( file_REG32, reg_AX ) );
 }
@@ -1390,12 +1119,12 @@ emit_setcc(
    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
       FETCH( func, *inst, 0, 0, chan_index );
       FETCH( func, *inst, 1, 1, chan_index );
-      emit_cmpps(
+      sse_cmpps(
          func,
          make_xmm( 0 ),
          make_xmm( 1 ),
          cc );
-      emit_andps(
+      sse_andps(
          func,
          make_xmm( 0 ),
          get_temp(
@@ -1416,22 +1145,22 @@ emit_cmp(
       FETCH( func, *inst, 0, 0, chan_index );
       FETCH( func, *inst, 1, 1, chan_index );
       FETCH( func, *inst, 2, 2, chan_index );
-      emit_cmpps(
+      sse_cmpps(
          func,
          make_xmm( 0 ),
          get_temp(
             TGSI_EXEC_TEMP_00000000_I,
             TGSI_EXEC_TEMP_00000000_C ),
          cc_LessThan );
-      emit_andps(
+      sse_andps(
          func,
          make_xmm( 1 ),
          make_xmm( 0 ) );
-      emit_andnps(
+      sse_andnps(
          func,
          make_xmm( 0 ),
          make_xmm( 2 ) );
-      emit_orps(
+      sse_orps(
          func,
          make_xmm( 0 ),
          make_xmm( 1 ) );
@@ -1448,11 +1177,16 @@ emit_instruction(
 
    switch( inst->Instruction.Opcode ) {
    case TGSI_OPCODE_ARL:
+#if 0
+      /* XXX this isn't working properly (see glean vertProg1 test) */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          emit_f2it( func, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
+#else
+      return 0;
+#endif
       break;
 
    case TGSI_OPCODE_MOV:
@@ -1482,7 +1216,7 @@ emit_instruction(
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
             FETCH( func, *inst, 0, 0, CHAN_X );
-            emit_maxps(
+            sse_maxps(
                func,
                make_xmm( 0 ),
                get_temp(
@@ -1491,21 +1225,26 @@ emit_instruction(
             STORE( func, *inst, 0, 0, CHAN_Y );
          }
          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+            /* XMM[1] = SrcReg[0].yyyy */
             FETCH( func, *inst, 1, 0, CHAN_Y );
-            emit_maxps(
+            /* XMM[1] = max(XMM[1], 0) */
+            sse_maxps(
                func,
                make_xmm( 1 ),
                get_temp(
                   TGSI_EXEC_TEMP_00000000_I,
                   TGSI_EXEC_TEMP_00000000_C ) );
+            /* XMM[2] = SrcReg[0].wwww */
             FETCH( func, *inst, 2, 0, CHAN_W );
-            emit_minps(
+            /* XMM[2] = min(XMM[2], 128.0) */
+            sse_minps(
                func,
                make_xmm( 2 ),
                get_temp(
                   TGSI_EXEC_TEMP_128_I,
                   TGSI_EXEC_TEMP_128_C ) );
-            emit_maxps(
+            /* XMM[2] = max(XMM[2], -128.0) */
+            sse_maxps(
                func,
                make_xmm( 2 ),
                get_temp(
@@ -1513,16 +1252,16 @@ emit_instruction(
                   TGSI_EXEC_TEMP_MINUS_128_C ) );
             emit_pow( func, 1, 2 );
             FETCH( func, *inst, 0, 0, CHAN_X );
-            emit_xorps(
+            sse_xorps(
                func,
                make_xmm( 2 ),
                make_xmm( 2 ) );
-            emit_cmpps(
+            sse_cmpps(
                func,
                make_xmm( 2 ),
                make_xmm( 0 ),
                cc_LessThanEqual );
-            emit_andps(
+            sse_andps(
                func,
                make_xmm( 2 ),
                make_xmm( 1 ) );
@@ -1543,9 +1282,9 @@ emit_instruction(
    case TGSI_OPCODE_RSQ:
    /* TGSI_OPCODE_RECIPSQRT */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rsqrt( func, 0, 0 );
+      emit_rsqrt( func, 1, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
+         STORE( func, *inst, 1, 0, chan_index );
       }
       break;
 
@@ -1644,7 +1383,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          FETCH( func, *inst, 1, 1, chan_index );
-         emit_minps(
+         sse_minps(
             func,
             make_xmm( 0 ),
             make_xmm( 1 ) );
@@ -1656,7 +1395,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          FETCH( func, *inst, 1, 1, chan_index );
-         emit_maxps(
+         sse_maxps(
             func,
             make_xmm( 0 ),
             make_xmm( 1 ) );
@@ -1997,7 +1736,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_RET:
-   case TGSI_OPCODE_END:
 #ifdef WIN32
       emit_retw( func, 16 );
 #else
@@ -2005,6 +1743,9 @@ emit_instruction(
 #endif
       break;
 
+   case TGSI_OPCODE_END:
+      break;
+
    case TGSI_OPCODE_SSG:
       return 0;
       break;
@@ -2020,7 +1761,7 @@ emit_instruction(
          STORE( func, *inst, 0, 0, CHAN_X );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 0, 0, CHAN_X );
          emit_sin( func, 0 );
          STORE( func, *inst, 0, 0, CHAN_Y );
       }
@@ -2236,149 +1977,289 @@ emit_declaration(
    }
 }
 
-unsigned
-tgsi_emit_sse2(
-   struct tgsi_token *tokens,
-   struct x86_function *func )
-{
-   struct tgsi_parse_context parse;
-   unsigned ok = 1;
-
-   DUMP_START();
-
-   func->csr = func->store;
-
-   emit_mov(
-      func,
-      get_input_base(),
-      get_argument( 0 ) );
-   emit_mov(
-      func,
-      get_output_base(),
-      get_argument( 1 ) );
-   emit_mov(
-      func,
-      get_const_base(),
-      get_argument( 2 ) );
-   emit_mov(
-      func,
-      get_temp_base(),
-      get_argument( 3 ) );
-
-   tgsi_parse_init( &parse, tokens );
-
-   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         ok = emit_instruction(
-	    func,
-	    &parse.FullToken.FullInstruction );
-
-	 if (!ok) {
-	    debug_printf("failed to translate tgsi opcode %d to SSE\n", 
-			 parse.FullToken.FullInstruction.Instruction.Opcode );
-	 }
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* XXX implement this */
-	 ok = 0;
-	 debug_printf("failed to emit immediate value to SSE\n");
-	 break;
-
-      default:
-         assert( 0 );
-	 ok = 0;
-	 break;
-      }
+static void aos_to_soa( struct x86_function *func, 
+                        uint arg_aos,
+                        uint arg_soa, 
+                        uint arg_num, 
+                        uint arg_stride )
+{
+   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
+   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
+   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
+   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
+   int inner_loop;
+
+
+   /* Save EBX */
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
+   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
+   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
+   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
+
+   /* do */
+   inner_loop = x86_get_label( func );
+   {
+      x86_push( func, aos_input );
+      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_pop( func, aos_input );
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
+      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
+      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
+      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
+
+      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
+      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
+      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
+      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
+
+      /* Advance to next input */
+      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
+      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
    }
+   /* while --num_inputs */
+   x86_dec( func, num_inputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, aos_input );
+}
+
+static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+   struct x86_reg soa_output;
+   struct x86_reg aos_output;
+   struct x86_reg num_outputs;
+   struct x86_reg temp;
+   int inner_loop;
+
+   soa_output = x86_make_reg( file_REG32, reg_AX );
+   aos_output = x86_make_reg( file_REG32, reg_BX );
+   num_outputs = x86_make_reg( file_REG32, reg_CX );
+   temp = x86_make_reg( file_REG32, reg_DX );
+
+   /* Save EBX */
+   x86_push( func, aos_output );
+
+   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
+   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
+   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+
+   /* do */
+   inner_loop = x86_get_label( func );
+   {
+      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
+      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
+      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
+      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
+      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
+      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
+      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
+
+      x86_mov( func, temp, x86_fn_arg( func, stride ) );
+      x86_push( func, aos_output );
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_pop( func, aos_output );
+
+      /* Advance to next output */
+      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
+      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
+   }
+   /* while --num_outputs */
+   x86_dec( func, num_outputs );
+   x86_jcc( func, cc_NE, inner_loop );
 
-   tgsi_parse_free( &parse );
-
-   DUMP_END();
-
-   return ok;
+   /* Restore EBX */
+   x86_pop( func, aos_output );
 }
 
 /**
- * Fragment shaders are responsible for interpolating shader inputs. Because on
- * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
- * output, const, temp and coef), the code is split into two phases --
- * DECLARATION and INSTRUCTION phase.
- * GP register holding the output argument is aliased with the coeff argument,
- * as outputs are not needed in the DECLARATION phase.
+ * Translate a TGSI vertex/fragment shader to SSE2 code.
+ * Slightly different things are done for vertex vs. fragment shaders.
+ *
+ * Note that fragment shaders are responsible for interpolating shader
+ * inputs. Because on x86 we have only 4 GP registers, and here we
+ * have 5 shader arguments (input, output, const, temp and coef), the
+ * code is split into two phases -- DECLARATION and INSTRUCTION phase.
+ * GP register holding the output argument is aliased with the coeff
+ * argument, as outputs are not needed in the DECLARATION phase.
+ *
+ * \param tokens  the TGSI input shader
+ * \param func  the output SSE code/function
+ * \param immediates  buffer to place immediates, later passed to SSE func
+ * \return  1 for success, 0 if translation failed
  */
 unsigned
-tgsi_emit_sse2_fs(
-   struct tgsi_token *tokens,
-   struct x86_function *func )
+tgsi_emit_sse2(
+   const struct tgsi_token *tokens,
+   struct x86_function *func,
+   float (*immediates)[4],
+   boolean do_swizzles )
 {
    struct tgsi_parse_context parse;
    boolean instruction_phase = FALSE;
    unsigned ok = 1;
-
-   DUMP_START();
+   uint num_immediates = 0;
 
    func->csr = func->store;
 
-   /* DECLARATION phase, do not load output argument. */
-   emit_mov(
-      func,
-      get_input_base(),
-      get_argument( 0 ) );
-   emit_mov(
-      func,
-      get_const_base(),
-      get_argument( 2 ) );
-   emit_mov(
+   tgsi_parse_init( &parse, tokens );
+
+   /* Can't just use EDI, EBX without saving/restoring them:
+    */
+   x86_push(
       func,
-      get_temp_base(),
-      get_argument( 3 ) );
-   emit_mov(
+      get_immediate_base() );
+
+   x86_push(
       func,
-      get_coef_base(),
-      get_argument( 4 ) );
+      get_temp_base() );
 
-   tgsi_parse_init( &parse, tokens );
+
+   /*
+    * Different function args for vertex/fragment shaders:
+    */
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /* DECLARATION phase, do not load output argument. */
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      /* skipping outputs argument here */
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_coef_base(),
+         x86_fn_arg( func, 5 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 6 ) );
+   }
+   else {
+      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
+
+      if (do_swizzles)
+         aos_to_soa( func, 
+                     6,         /* aos_input */
+                     1,         /* machine->input */
+                     7,         /* num_inputs */
+                     8 );       /* input_stride */
+
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      x86_mov(
+         func,
+         get_output_base(),
+         x86_fn_arg( func, 2 ) );
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 5 ) );
+   }
 
    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
       tgsi_parse_token( &parse );
 
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
-         emit_declaration(
-            func,
-            &parse.FullToken.FullDeclaration );
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            emit_declaration(
+               func,
+               &parse.FullToken.FullDeclaration );
+         }
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if( !instruction_phase ) {
-            /* INSTRUCTION phase, overwrite coeff with output. */
-            instruction_phase = TRUE;
-            emit_mov(
-               func,
-               get_output_base(),
-               get_argument( 1 ) );
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            if( !instruction_phase ) {
+               /* INSTRUCTION phase, overwrite coeff with output. */
+               instruction_phase = TRUE;
+               x86_mov(
+                  func,
+                  get_output_base(),
+                  x86_fn_arg( func, 2 ) );
+            }
          }
+
          ok = emit_instruction(
             func,
             &parse.FullToken.FullInstruction );
 
 	 if (!ok) {
-	    debug_printf("failed to translate tgsi opcode %d to SSE\n", 
-			 parse.FullToken.FullInstruction.Instruction.Opcode );
+	    debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
 	 }
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* XXX implement this */
-	 ok = 0;
-	 debug_printf("failed to emit immediate value to SSE\n");
+         /* simply copy the immediate values into the next immediates[] slot */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            uint i;
+            assert(size <= 4);
+            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
+            for( i = 0; i < size; i++ ) {
+               immediates[num_immediates][i] =
+		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+            }
+#if 0
+            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
+                   num_immediates,
+                   immediates[num_immediates][0],
+                   immediates[num_immediates][1],
+                   immediates[num_immediates][2],
+                   immediates[num_immediates][3]);
+#endif
+            num_immediates++;
+         }
          break;
 
       default:
@@ -2387,11 +2268,30 @@ tgsi_emit_sse2_fs(
       }
    }
 
-   tgsi_parse_free( &parse );
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
+      if (do_swizzles)
+         soa_to_aos( func, 9, 2, 10, 11 );
+   }
 
-   DUMP_END();
+   /* Can't just use EBX, EDI without saving/restoring them:
+    */
+   x86_pop(
+      func,
+      get_temp_base() );
+
+   x86_pop(
+      func,
+      get_immediate_base() );
+
+#ifdef WIN32
+   emit_retw( func, 16 );
+#else
+   emit_ret( func );
+#endif
+
+   tgsi_parse_free( &parse );
 
    return ok;
 }
 
-#endif /* i386 */
+#endif /* PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
index 63b8ef3911..e66d115283 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
@@ -10,13 +10,10 @@ struct x86_function;
 
 unsigned
 tgsi_emit_sse2(
-   struct tgsi_token *tokens,
-   struct x86_function *function );
-
-unsigned
-tgsi_emit_sse2_fs(
-   struct tgsi_token *tokens,
-   struct x86_function *function );
+   const struct tgsi_token *tokens,
+   struct x86_function *function,
+   float (*immediates)[4],
+   boolean do_swizzles );
 
 #if defined __cplusplus
 }