1 files changed, 183 insertions, 108 deletions
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index 4e80597b3f..c37e201b2b 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -225,6 +225,15 @@ get_coef_base( void )
    return get_output_base();
 }
 
+static struct x86_reg
+get_immediate_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DI );
+}
+
+
 /**
  * Data access helpers.
  */
@@ -239,6 +248,16 @@ get_argument(
 }
 
 static struct x86_reg
+get_immediate(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_immediate_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
 get_const(
    unsigned vec,
    unsigned chan )
@@ -548,6 +567,12 @@ emit_xorps(
  * Data fetch helpers.
  */
 
+/**
+ * Copy a shader constant to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src const buffer index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
 static void
 emit_const(
    struct x86_function *func,
@@ -567,6 +592,31 @@ emit_const(
 }
 
 static void
+emit_immediate(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_movss(
+      func,
+      make_xmm( xmm ),
+      get_immediate( vec, chan ) );
+   emit_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+
+/**
+ * Copy a shader input to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src input attrib
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
 emit_inputf(
    struct x86_function *func,
    unsigned xmm,
@@ -579,6 +629,12 @@ emit_inputf(
       get_input( vec, chan ) );
 }
 
+/**
+ * Store an xmm register to a shader output
+ * \param xmm  the source xmm register
+ * \param vec  the dest output attrib
+ * \param chan  src dest channel to store (X, Y, Z or W)
+ */
 static void
 emit_output(
    struct x86_function *func,
@@ -592,6 +648,12 @@ emit_output(
       make_xmm( xmm ) );
 }
 
+/**
+ * Copy a shader temporary to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src temp register
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
 static void
 emit_tempf(
    struct x86_function *func,
@@ -605,6 +667,13 @@ emit_tempf(
       get_temp( vec, chan ) );
 }
 
+/**
+ * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
+ * \param xmm  the destination xmm register
+ * \param vec  the src input/attribute coefficient index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ * \param member  0=a0, 1=dadx, 2=dady
+ */
 static void
 emit_coef(
    struct x86_function *func,
@@ -1160,6 +1229,14 @@ emit_fetch(
             swizzle );
          break;
 
+      case TGSI_FILE_IMMEDIATE:
+         emit_immediate(
+            func,
+            xmm,
+            reg->SrcRegister.Index,
+            swizzle );
+         break;
+
       case TGSI_FILE_INPUT:
          emit_inputf(
             func,
@@ -2020,7 +2097,7 @@ emit_instruction(
          STORE( func, *inst, 0, 0, CHAN_X );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 0, 0, CHAN_X );
          emit_sin( func, 0 );
          STORE( func, *inst, 0, 0, CHAN_Y );
       }
@@ -2236,135 +2313,116 @@ emit_declaration(
    }
 }
 
-unsigned
-tgsi_emit_sse2(
-   struct tgsi_token *tokens,
-   struct x86_function *func )
-{
-   struct tgsi_parse_context parse;
-   unsigned ok = 1;
-
-   DUMP_START();
-
-   func->csr = func->store;
-
-   emit_mov(
-      func,
-      get_input_base(),
-      get_argument( 0 ) );
-   emit_mov(
-      func,
-      get_output_base(),
-      get_argument( 1 ) );
-   emit_mov(
-      func,
-      get_const_base(),
-      get_argument( 2 ) );
-   emit_mov(
-      func,
-      get_temp_base(),
-      get_argument( 3 ) );
-
-   tgsi_parse_init( &parse, tokens );
-
-   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         ok = emit_instruction(
-	    func,
-	    &parse.FullToken.FullInstruction );
-
-	 if (!ok) {
-	    debug_printf("failed to translate tgsi opcode %d to SSE\n", 
-			 parse.FullToken.FullInstruction.Instruction.Opcode );
-	 }
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* XXX implement this */
-	 ok = 0;
-	 debug_printf("failed to emit immediate value to SSE\n");
-	 break;
-
-      default:
-         assert( 0 );
-	 ok = 0;
-	 break;
-      }
-   }
-
-   tgsi_parse_free( &parse );
-
-   DUMP_END();
-
-   return ok;
-}
 
 /**
- * Fragment shaders are responsible for interpolating shader inputs. Because on
- * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
- * output, const, temp and coef), the code is split into two phases --
- * DECLARATION and INSTRUCTION phase.
- * GP register holding the output argument is aliased with the coeff argument,
- * as outputs are not needed in the DECLARATION phase.
+ * Translate a TGSI vertex/fragment shader to SSE2 code.
+ * Slightly different things are done for vertex vs. fragment shaders.
+ *
+ * Note that fragment shaders are responsible for interpolating shader
+ * inputs. Because on x86 we have only 4 GP registers, and here we
+ * have 5 shader arguments (input, output, const, temp and coef), the
+ * code is split into two phases -- DECLARATION and INSTRUCTION phase.
+ * GP register holding the output argument is aliased with the coeff
+ * argument, as outputs are not needed in the DECLARATION phase.
+ *
+ * \param tokens  the TGSI input shader
+ * \param func  the output SSE code/function
+ * \param immediates  buffer to place immediates, later passed to SSE func
+ * \param return  1 for success, 0 if translation failed
  */
 unsigned
-tgsi_emit_sse2_fs(
+tgsi_emit_sse2(
    struct tgsi_token *tokens,
-   struct x86_function *func )
+   struct x86_function *func,
+   float (*immediates)[4])
 {
    struct tgsi_parse_context parse;
    boolean instruction_phase = FALSE;
    unsigned ok = 1;
+   uint num_immediates = 0;
 
    DUMP_START();
 
    func->csr = func->store;
 
-   /* DECLARATION phase, do not load output argument. */
-   emit_mov(
-      func,
-      get_input_base(),
-      get_argument( 0 ) );
-   emit_mov(
-      func,
-      get_const_base(),
-      get_argument( 2 ) );
-   emit_mov(
-      func,
-      get_temp_base(),
-      get_argument( 3 ) );
-   emit_mov(
-      func,
-      get_coef_base(),
-      get_argument( 4 ) );
-
    tgsi_parse_init( &parse, tokens );
 
+   /*
+    * Different function args for vertex/fragment shaders:
+    */
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /* DECLARATION phase, do not load output argument. */
+      emit_mov(
+         func,
+         get_input_base(),
+         get_argument( 0 ) );
+      /* skipping outputs argument here */
+      emit_mov(
+         func,
+         get_const_base(),
+         get_argument( 2 ) );
+      emit_mov(
+         func,
+         get_temp_base(),
+         get_argument( 3 ) );
+      emit_mov(
+         func,
+         get_coef_base(),
+         get_argument( 4 ) );
+      emit_mov(
+         func,
+         get_immediate_base(),
+         get_argument( 5 ) );
+   }
+   else {
+      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
+
+      emit_mov(
+         func,
+         get_input_base(),
+         get_argument( 0 ) );
+      emit_mov(
+         func,
+         get_output_base(),
+         get_argument( 1 ) );
+      emit_mov(
+         func,
+         get_const_base(),
+         get_argument( 2 ) );
+      emit_mov(
+         func,
+         get_temp_base(),
+         get_argument( 3 ) );
+      emit_mov(
+         func,
+         get_immediate_base(),
+         get_argument( 4 ) );
+   }
+
    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
       tgsi_parse_token( &parse );
 
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
-         emit_declaration(
-            func,
-            &parse.FullToken.FullDeclaration );
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            emit_declaration(
+               func,
+               &parse.FullToken.FullDeclaration );
+         }
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if( !instruction_phase ) {
-            /* INSTRUCTION phase, overwrite coeff with output. */
-            instruction_phase = TRUE;
-            emit_mov(
-               func,
-               get_output_base(),
-               get_argument( 1 ) );
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            if( !instruction_phase ) {
+               /* INSTRUCTION phase, overwrite coeff with output. */
+               instruction_phase = TRUE;
+               emit_mov(
+                  func,
+                  get_output_base(),
+                  get_argument( 1 ) );
+            }
          }
+
          ok = emit_instruction(
             func,
             &parse.FullToken.FullInstruction );
@@ -2376,9 +2434,26 @@ tgsi_emit_sse2_fs(
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* XXX implement this */
-	 ok = 0;
-	 debug_printf("failed to emit immediate value to SSE\n");
+         /* simply copy the immediate values into the next immediates[] slot */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            uint i;
+            assert(size <= 4);
+            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
+            for( i = 0; i < size; i++ ) {
+               immediates[num_immediates][i] =
+		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+            }
+#if 0
+            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
+                   num_immediates,
+                   immediates[num_immediates][0],
+                   immediates[num_immediates][1],
+                   immediates[num_immediates][2],
+                   immediates[num_immediates][3]);
+#endif
+            num_immediates++;
+         }
          break;
 
       default: