5 files changed, 179 insertions, 132 deletions
diff --git a/src/mesa/tnl/t_vb_arbprogram.c b/src/mesa/tnl/t_vb_arbprogram.c
index 88d8fe9546..d034929fe0 100644
--- a/src/mesa/tnl/t_vb_arbprogram.c
+++ b/src/mesa/tnl/t_vb_arbprogram.c
@@ -115,8 +115,6 @@ static GLfloat rough_approx_log2_0_1(GLfloat x)
 }
 
 
-
-
 /**
  * Perform a reduced swizzle:
  */
@@ -131,12 +129,42 @@ static void do_RSW( struct arb_vp_machine *m, union instruction op )
    /* Need a temporary to be correct in the case where result == arg0.
     */
    COPY_4V(tmp, arg0);
-   
-   result[0] = tmp[GET_RSW(swz, 0)];
-   result[1] = tmp[GET_RSW(swz, 1)];
-   result[2] = tmp[GET_RSW(swz, 2)];
-   result[3] = tmp[GET_RSW(swz, 3)];
-   
+
+   result[0] = tmp[GET_SWZ(swz, 0)];
+   result[1] = tmp[GET_SWZ(swz, 1)];
+   result[2] = tmp[GET_SWZ(swz, 2)];
+   result[3] = tmp[GET_SWZ(swz, 3)];
+
+   if (neg) {
+      if (neg & 0x1) result[0] = -result[0];
+      if (neg & 0x2) result[1] = -result[1];
+      if (neg & 0x4) result[2] = -result[2];
+      if (neg & 0x8) result[3] = -result[3];
+   }
+}
+
+/**
+ * Perform a full swizzle
+ */
+static void do_SWZ( struct arb_vp_machine *m, union instruction op ) 
+{
+   GLfloat *result = m->File[0][op.rsw.dst];
+   const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0];
+   GLuint swz = op.rsw.swz;
+   GLuint neg = op.rsw.neg;
+   GLfloat tmp[6];
+   tmp[4] = 0.0;
+   tmp[5] = 1.0;
+
+   /* Need a temporary to be correct in the case where result == arg0.
+    */
+   COPY_4V(tmp, arg0);
+
+   result[0] = tmp[GET_SWZ(swz, 0)];
+   result[1] = tmp[GET_SWZ(swz, 1)];
+   result[2] = tmp[GET_SWZ(swz, 2)];
+   result[3] = tmp[GET_SWZ(swz, 3)];
+
    if (neg) {
       if (neg & 0x1) result[0] = -result[0];
       if (neg & 0x2) result[1] = -result[1];
@@ -570,11 +598,31 @@ static void print_RSW( union instruction op )
    _mesa_printf(", ");
    print_reg(op.rsw.file0, op.rsw.idx0);
    _mesa_printf(".");
-   for (i = 0; i < 4; i++, swz >>= 2) {
-      const char *cswz = "xyzw";
+   for (i = 0; i < 4; i++, swz >>= 3) {
+      const char *cswz = "xyzw01";
       if (neg & (1<<i))   
 	 _mesa_printf("-");
-      _mesa_printf("%c", cswz[swz&0x3]);
+      _mesa_printf("%c", cswz[swz&0x7]);
+   }
+   _mesa_printf("\n");
+}
+
+static void print_SWZ( union instruction op )	/* disassemble one SWZ instruction via _mesa_printf */
+{
+   GLuint swz = op.rsw.swz;
+   GLuint neg = op.rsw.neg;
+   GLuint i;
+
+   _mesa_printf("SWZ ");
+   print_reg(0, op.rsw.dst);	/* destination is always in file 0 */
+   _mesa_printf(", ");
+   print_reg(op.rsw.file0, op.rsw.idx0);
+   _mesa_printf(".");
+   for (i = 0; i < 4; i++, swz >>= 3) {	/* one 3-bit swizzle code per component */
+      const char *cswz = "xyzw01";	/* codes 0..5 print as x,y,z,w,0,1 */
+      if (neg & (1<<i))   
+	 _mesa_printf("-");
+      _mesa_printf("%c", cswz[swz&0x7]);	/* NOTE(review): codes 6 and 7 index past the six letters */
    }
    _mesa_printf("\n");
 }
@@ -651,9 +699,11 @@ _tnl_disassem_vba_insn( union instruction op )
    case OPCODE_RCC:
    case OPCODE_RET:
    case OPCODE_SSG:
-   case OPCODE_SWZ:
       print_NOP(op);
       break;
+   case OPCODE_SWZ:
+      print_SWZ(op);
+      break;
    case RSW:
       print_RSW(op);
       break;
@@ -728,7 +778,7 @@ static void (* const opcode_func[MAX_OPCODE+3])(struct arb_vp_machine *, union i
    do_NOP,/*SSG*/
    do_NOP,/*STR*/
    do_SUB,
-   do_RSW,/*SWZ*/
+   do_SWZ,/*SWZ*/
    do_NOP,/*TEX*/
    do_NOP,/*TXB*/
    do_NOP,/*TXD*/
@@ -833,7 +883,7 @@ static struct reg cvp_emit_arg( struct compilation *cp,
 {
    struct reg reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr, arg );
    union instruction rsw, noop;
-   
+
    /* Emit any necessary swizzling.  
     */
    _mesa_bzero(&rsw, sizeof(rsw));
@@ -841,19 +891,17 @@ static struct reg cvp_emit_arg( struct compilation *cp,
 
    /* we're expecting 2-bit swizzles below... */
 #if 1 /* XXX THESE ASSERTIONS CURRENTLY FAIL DURING GLEAN TESTS! */
+/* Hopefully these no longer fail now that OPCODE_SWZ loads its source with cvp_load_reg instead of cvp_emit_arg. */
    ASSERT(GET_SWZ(src->Swizzle, 0) < 4);
    ASSERT(GET_SWZ(src->Swizzle, 1) < 4);
    ASSERT(GET_SWZ(src->Swizzle, 2) < 4);
    ASSERT(GET_SWZ(src->Swizzle, 3) < 4);
 #endif
-   rsw.rsw.swz = ((GET_SWZ(src->Swizzle, 0) << 0) |
-		  (GET_SWZ(src->Swizzle, 1) << 2) |
-		  (GET_SWZ(src->Swizzle, 2) << 4) |
-		  (GET_SWZ(src->Swizzle, 3) << 6));
+   rsw.rsw.swz = src->Swizzle;
 
    _mesa_bzero(&noop, sizeof(noop));
    noop.rsw.neg = 0;
-   noop.rsw.swz = RSW_NOOP;
+   noop.rsw.swz = SWIZZLE_NOOP;
 
    if (_mesa_memcmp(&rsw, &noop, sizeof(rsw)) !=0) {
       union instruction *op = cvp_next_instruction(cp);
@@ -907,46 +955,6 @@ static GLuint cvp_choose_result( struct compilation *cp,
    }
 }
 
-static struct reg cvp_emit_rsw( struct compilation *cp, 
-				GLuint dst,
-				struct reg src,
-				GLuint neg, 
-				GLuint swz,
-				GLboolean force)
-{
-   struct reg retval;
-
-   if (swz != RSW_NOOP || neg != 0) {
-      union instruction *op = cvp_next_instruction(cp);
-      op->rsw.opcode = RSW;
-      op->rsw.dst = dst;
-      op->rsw.file0 = src.file;
-      op->rsw.idx0 = src.idx;
-      op->rsw.neg = neg;
-      op->rsw.swz = swz;
-	    
-      retval.file = FILE_REG;
-      retval.idx = dst;
-      return retval;
-   }
-   else if (force) {
-      /* Oops.  Degenerate case:
-       */
-      union instruction *op = cvp_next_instruction(cp);
-      op->alu.opcode = OPCODE_MOV;
-      op->alu.dst = dst;
-      op->alu.file0 = src.file;
-      op->alu.idx0 = src.idx;
-      
-      retval.file = FILE_REG;
-      retval.idx = dst;
-      return retval;
-   }
-   else {
-      return src;
-   }
-}
-
 
 static void cvp_emit_inst( struct compilation *cp,
 			   const struct prog_instruction *inst )
@@ -998,64 +1006,26 @@ static void cvp_emit_inst( struct compilation *cp,
       op->alu.idx0 = reg[0].idx;
       break;
 
-   case OPCODE_SWZ: {
-      GLuint swz0 = 0, swz1 = 0;
-      GLuint neg0 = 0, neg1 = 0;
-      GLuint mask = 0;
-
-      /* Translate 3-bit-per-element swizzle into two 2-bit swizzles,
-       * one from the source register the other from a constant
-       * {0,0,0,1}.
-       */
-      for (i = 0; i < 4; i++) {
-	 GLuint swzelt = GET_SWZ(inst->SrcReg[0].Swizzle, i);
-	 if (swzelt >= SWIZZLE_ZERO) {
-	    neg0 |= inst->SrcReg[0].NegateBase & (1<<i);
-	    if (swzelt == SWIZZLE_ONE)
-	       swz0 |= SWIZZLE_W << (i*2);
-	    else if (i < SWIZZLE_W)
-	       swz0 |= i << (i*2);
-	 }
-	 else {
-	    mask |= 1<<i;
-	    neg1 |= inst->SrcReg[0].NegateBase & (1<<i);
-	    swz1 |= swzelt << (i*2);
-	 }
-      }
+   case OPCODE_END:
+      break;
 
+   case OPCODE_SWZ:
       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
-      reg[0].file = FILE_REG;
-      reg[0].idx = REG_ID;
-      reg[1] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
-
-      if (mask == WRITEMASK_XYZW) {
-	 cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
-	 
-      }
-      else if (mask == 0) {
-	 cvp_emit_rsw(cp, result, reg[1], neg1, swz1, GL_TRUE);
-      }
-      else {
-	 cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
-	 reg[1] = cvp_emit_rsw(cp, REG_ARG0, reg[1], neg1, swz1, GL_FALSE);
-
-	 op = cvp_next_instruction(cp);
-	 op->msk.opcode = MSK;
-	 op->msk.dst = result;
-	 op->msk.file = reg[1].file;
-	 op->msk.idx = reg[1].idx;
-	 op->msk.mask = mask;
-      }
+      reg[0] = cvp_load_reg( cp, inst->SrcReg[0].File,
+			inst->SrcReg[0].Index, inst->SrcReg[0].RelAddr, REG_ARG0 );
+      op = cvp_next_instruction(cp);
+      op->rsw.opcode = inst->Opcode;
+      op->rsw.file0 = reg[0].file;
+      op->rsw.idx0 = reg[0].idx;
+      op->rsw.dst = result;
+      op->rsw.swz = inst->SrcReg[0].Swizzle;
+      op->rsw.neg = inst->SrcReg[0].NegateBase;
 
       if (result == REG_RES) {
 	 op = cvp_next_instruction(cp);
 	 *op = fixup;
       }
       break;
-   }
-
-   case OPCODE_END:
-      break;
 
    default:
       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
@@ -1074,7 +1044,7 @@ static void cvp_emit_inst( struct compilation *cp,
       if (result == REG_RES) {
 	 op = cvp_next_instruction(cp);
 	 *op = fixup;
-      }      	 
+      }
       break;
    }
 }
@@ -1485,7 +1455,7 @@ static GLboolean init_vertex_program( GLcontext *ctx,
     */
    ASSIGN_4V(m->File[0][REG_ID], 0, 0, 0, 1);
    ASSIGN_4V(m->File[0][REG_ONES], 1, 1, 1, 1);
-   ASSIGN_4V(m->File[0][REG_SWZ], -1, 1, 0, 0);
+   ASSIGN_4V(m->File[0][REG_SWZ], 1, -1, 0, 0);
    ASSIGN_4V(m->File[0][REG_NEG], -1, -1, -1, -1);
    ASSIGN_4V(m->File[0][REG_LIT], 1, 0, 0, 1);
    ASSIGN_4V(m->File[0][REG_LIT2], 1, .5, .2, 1); /* debug value */
diff --git a/src/mesa/tnl/t_vb_arbprogram.h b/src/mesa/tnl/t_vb_arbprogram.h
index 60786d6a01..dab725d7f7 100644
--- a/src/mesa/tnl/t_vb_arbprogram.h
+++ b/src/mesa/tnl/t_vb_arbprogram.h
@@ -61,7 +61,7 @@
 #define REG_IN31   63
 #define REG_ID     64		/* 0,0,0,1 */
 #define REG_ONES   65		/* 1,1,1,1 */
-#define REG_SWZ    66		/* -1,1,0,0 */
+#define REG_SWZ    66		/* 1,-1,0,0 */
 #define REG_NEG    67		/* -1,-1,-1,-1 */
 #define REG_LIT    68           /* 1,0,0,1 */
 #define REG_LIT2    69           /* 1,0,0,1 */
@@ -98,7 +98,7 @@ union instruction {
       GLuint file0:2;
       GLuint idx0:7;
       GLuint neg:4;
-      GLuint swz:8;		/* xyzw only */
+      GLuint swz:12;		/* xyzw01 */
    } rsw;
 
    struct {
@@ -114,11 +114,8 @@ union instruction {
 
 
 /**
- * Reduced swizzle is a 2-bit field; only X/Y/Z/W are allowed, not 0/1.
+ * A reduced swizzle uses 3 bits per component; for simplicity it has the same encoding as a normal swizzle (X/Y/Z/W/0/1).
  */
-#define RSW_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
-#define GET_RSW(swz, idx)      (((swz) >> ((idx)*2)) & 0x3)
-
 
 struct input {
    GLuint idx;
diff --git a/src/mesa/tnl/t_vb_arbprogram_sse.c b/src/mesa/tnl/t_vb_arbprogram_sse.c
index 19061c0d8d..b9126d6d88 100644
--- a/src/mesa/tnl/t_vb_arbprogram_sse.c
+++ b/src/mesa/tnl/t_vb_arbprogram_sse.c
@@ -294,11 +294,12 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
 {
    struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
-   GLuint swz = op.rsw.swz;
+   GLuint swz = GET_SWZ(op.rsw.swz, 0) | (GET_SWZ(op.rsw.swz, 1) << 2) |
+		(GET_SWZ(op.rsw.swz, 2) << 4| (GET_SWZ(op.rsw.swz, 3) << 6));
    GLuint neg = op.rsw.neg;
 
    emit_pshufd(cp, dst, arg0, swz);
-   
+
    if (neg) {
       struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
       struct x86_reg tmp = get_xmm_reg(cp);
@@ -306,6 +307,7 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
        * Use neg as arg to pshufd
        * Multiply
        */
+      /* is the emit_pshufd necessary? only SWZ can negate individual components */
       emit_pshufd(cp, tmp, negs, 
 		  SHUF((neg & 1) ? 1 : 0,
 		       (neg & 2) ? 1 : 0,
@@ -317,6 +319,64 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
    return GL_TRUE;
 }
 
+/* Perform a full swizzle: every result component is x/y/z/w of arg0 or the
+ * constant 0 or 1 (swizzle codes 4 and 5), negated where op.rsw.neg has its bit set. */
+static GLboolean emit_SWZ( struct compilation *cp, union instruction op ) 
+{
+   struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
+   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
+   struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
+   struct x86_reg tmp = get_xmm_reg(cp);
+   GLubyte neg = op.rsw.neg;
+   GLubyte shuf2, swz, savepos, savemask, swizzle[4];
+
+   swizzle[0] = GET_SWZ(op.rsw.swz, 0);
+   swizzle[1] = GET_SWZ(op.rsw.swz, 1);
+   swizzle[2] = GET_SWZ(op.rsw.swz, 2);
+   swizzle[3] = GET_SWZ(op.rsw.swz, 3);
+
+   swz = SHUF((swizzle[0] & 3), (swizzle[1] & 3),
+	      (swizzle[2] & 3), (swizzle[3] & 3));
+
+   emit_pshufd(cp, dst, arg0, swz);
+
+   /* can handle negation and replace with zero with the same shuffle/mul: REG_SWZ lane 0 = 1, lane 1 = -1, lane 2 = 0 */
+   shuf2 = SHUF(swizzle[0] == 4 ? 2 : (neg & 1),
+	        swizzle[1] == 4 ? 2 : ((neg & 2) >> 1),
+	        swizzle[2] == 4 ? 2 : ((neg & 4) >> 2),
+	        swizzle[3] == 4 ? 2 : ((neg & 8) >> 3));
+
+   /* now the hard part is getting those 1's in there...  The FIRST slot swizzled to ONE wins, so a ONE in slot 0 keeps selecting the loaded 1. */
+   savepos = 0;
+   savemask = 0;
+   if (swizzle[0] == 5) savepos = 1;
+   if (swizzle[1] == 5) { if (!savepos) savepos = 2; }
+   else savemask |= 1 << 2;
+   if (swizzle[2] == 5) { if (!savepos) savepos = 3; }
+   else savemask |= 2 << 4;
+   if (swizzle[3] == 5) { if (!savepos) savepos = 4; }
+   else savemask |= 3 << 6;
+   if (savepos) {
+      /* need a mov first as movss from memory will overwrite high bits of xmm reg */
+      sse_movups(&cp->func, tmp, negs);
+      /* can only replace lowest 32bits, thus move away that part first (every ONE slot receives a copy of lane 0) */
+      emit_pshufd(cp, dst, dst, savemask);
+      sse_movss(&cp->func, dst, tmp);
+      emit_pshufd(cp, dst, dst, (savepos - 1) | (savemask & 0xfc));
+   }
+
+   if (shuf2) {
+      /* Load 1,-1,0,0
+       * Use neg as arg to pshufd
+       * Multiply
+       */
+      emit_pshufd(cp, tmp, negs, shuf2);
+      sse_mulps(&cp->func, dst, tmp);
+   }
+
+   return GL_TRUE;
+}
+
 /* Helper for writemask:
  */
 static GLboolean emit_shuf_copy1( struct compilation *cp,
@@ -595,20 +655,19 @@ static GLboolean emit_DPH( struct compilation *cp, union instruction op )
    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); 
    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); 
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
-   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
-   struct x86_reg tmp = get_xmm_reg(cp);      
+   struct x86_reg tmp = get_xmm_reg(cp);
 
-   emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z));
-   sse_movss(&cp->func, dst, ones);
-   emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z));
+   sse_movups(&cp->func, dst, arg0);
    sse_mulps(&cp->func, dst, arg1);
-   
-   /* Now the hard bit: sum the values (from DP4):
+
+   /* Now the hard bit: sum the values (from DP3):
     */ 
    sse_movhlps(&cp->func, tmp, dst);
-   sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
+   sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
    sse_addss(&cp->func, dst, tmp);
+   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
+   sse_addss(&cp->func, dst, tmp);
    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
    return GL_TRUE;
 }
@@ -985,15 +1044,18 @@ static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
 {
    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
-
-   /* TODO: Calculate absolute value
-    */
 #if 0
+   struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
+
+/* get abs value first. This STILL doesn't work.
+   Looks like we get bogus neg values ?
+*/
    sse_movss(&cp->func, dst, arg0);
    sse_mulss(&cp->func, dst, neg);
    sse_maxss(&cp->func, dst, arg0);
-#endif
 
+   sse_rsqrtss(&cp->func, dst, dst);
+#endif
    sse_rsqrtss(&cp->func, dst, arg0);
    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
    return GL_TRUE;
@@ -1132,7 +1194,7 @@ static GLboolean (* const emit_func[])(struct compilation *, union instruction)
    emit_NOP, /* SSG */
    emit_NOP, /* STR */
    emit_SUB,
-   emit_RSW, /* SWZ */
+   emit_SWZ, /* SWZ */
    emit_NOP, /* TEX */
    emit_NOP, /* TXB */
    emit_NOP, /* TXD */
diff --git a/src/mesa/x86/rtasm/x86sse.c b/src/mesa/x86/rtasm/x86sse.c
index 9f34004ba0..6137aef8ec 100644
--- a/src/mesa/x86/rtasm/x86sse.c
+++ b/src/mesa/x86/rtasm/x86sse.c
@@ -424,6 +424,14 @@ void sse_maxps( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse_maxss( struct x86_function *p,	/* emit MAXSS dst, src: bytes 0xF3, X86_TWOB, 0x5F, then modrm */
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x5F);
+   emit_modrm( p, dst, src );
+}
+
 void sse_divss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
@@ -456,6 +464,14 @@ void sse_mulps( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse_mulss( struct x86_function *p,	/* emit MULSS dst, src: bytes 0xF3, X86_TWOB, 0x59, then modrm */
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x59);
+   emit_modrm( p, dst, src );
+}
+
 void sse_addps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
diff --git a/src/mesa/x86/rtasm/x86sse.h b/src/mesa/x86/rtasm/x86sse.h
index 430cf2f939..5ec5489431 100644
--- a/src/mesa/x86/rtasm/x86sse.h
+++ b/src/mesa/x86/rtasm/x86sse.h
@@ -156,6 +156,7 @@ void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src, GLubyte cc );
 void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -165,6 +166,7 @@ void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src
 void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, GLubyte shuf );