Fix extended swizzling in vertex programs by introducing a special swizzle instruction, and extend the 2-bit rsw swizzle fields to 3 bits, as used in other places. While here, also fix up rsw (negation) and dph, and try to fix up rsq with negative values (doesn't work; the bug seems to be elsewhere) in the SSE codegen code.

author: Roland Scheidegger <rscheidegger@gmx.ch> 2006-06-01 22:56:40 +0000
committer: Roland Scheidegger <rscheidegger@gmx.ch> 2006-06-01 22:56:40 +0000
commit: fe57ed4f2566e30384d0c786998842405d8e8990 (patch)
tree: dda940ef6088ccb5ffa89d760f15aebceaaacfa6 /src/mesa/tnl/t_vb_arbprogram_sse.c
parent: 4d3ab19abfc262070ff82443bf28fb8b8a616e18 (diff)
1 files changed, 77 insertions, 15 deletions
diff --git a/src/mesa/tnl/t_vb_arbprogram_sse.c b/src/mesa/tnl/t_vb_arbprogram_sse.c
index 19061c0d8d..b9126d6d88 100644
--- a/src/mesa/tnl/t_vb_arbprogram_sse.c
+++ b/src/mesa/tnl/t_vb_arbprogram_sse.c
@@ -294,11 +294,12 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
 {
    struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
-   GLuint swz = op.rsw.swz;
+   GLuint swz = GET_SWZ(op.rsw.swz, 0) | (GET_SWZ(op.rsw.swz, 1) << 2) |
+		(GET_SWZ(op.rsw.swz, 2) << 4| (GET_SWZ(op.rsw.swz, 3) << 6));
    GLuint neg = op.rsw.neg;
 
    emit_pshufd(cp, dst, arg0, swz);
-   
+
    if (neg) {
       struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
       struct x86_reg tmp = get_xmm_reg(cp);
@@ -306,6 +307,7 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
        * Use neg as arg to pshufd
        * Multiply
        */
+      /* is the emit_pshufd necessary? only SWZ can negate individual components */
       emit_pshufd(cp, tmp, negs, 
 		  SHUF((neg & 1) ? 1 : 0,
 		       (neg & 2) ? 1 : 0,
@@ -317,6 +319,64 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
    return GL_TRUE;
 }
 
+/* Perform a full swizzle: every destination component is either a source
+ * component (selector 0-3), zero (selector 4) or one (selector 5), and may
+ * be negated individually through the bits of op.rsw.neg.
+ */
+static GLboolean emit_SWZ( struct compilation *cp, union instruction op ) 
+{
+   struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
+   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
+   struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
+   struct x86_reg tmp = get_xmm_reg(cp);
+   GLubyte neg = op.rsw.neg;
+   GLubyte shuf2, swz, savepos, savemask, swizzle[4];
+
+   swizzle[0] = GET_SWZ(op.rsw.swz, 0);
+   swizzle[1] = GET_SWZ(op.rsw.swz, 1);
+   swizzle[2] = GET_SWZ(op.rsw.swz, 2);
+   swizzle[3] = GET_SWZ(op.rsw.swz, 3);
+
+   /* plain shuffle first; lanes that want 0 or 1 are fixed up below */
+   swz = SHUF((swizzle[0] & 3), (swizzle[1] & 3),
+	      (swizzle[2] & 3), (swizzle[3] & 3));
+   emit_pshufd(cp, dst, arg0, swz);
+
+   /* can handle negation and replace with zero with the same shuffle/mul */
+   shuf2 = SHUF(swizzle[0] == 4 ? 2 : (neg & 1),
+	        swizzle[1] == 4 ? 2 : ((neg & 2) >> 1),
+	        swizzle[2] == 4 ? 2 : ((neg & 4) >> 2),
+	        swizzle[3] == 4 ? 2 : ((neg & 8) >> 3));
+
+   /* now the hard part is getting those 1's in there: lanes wanting a 1 read
+    * lane 0 via savemask, savepos - 1 is the lane shuffled back into lane 0.
+    */
+   savepos = 0;
+   savemask = 0;
+   if (swizzle[1] == 5) savepos = 2;
+   else savemask |= 1 << 2;
+   if (swizzle[2] == 5) savepos = 3;
+   else savemask |= 2 << 4;
+   if (swizzle[3] == 5) savepos = 4;
+   else savemask |= 3 << 6;
+   /* checked last: if lane 0 wants a 1 itself it must keep the loaded 1 */
+   if (swizzle[0] == 5) savepos = 1;
+   if (savepos) {
+      /* need a mov first as movss from memory will overwrite high bits of xmm reg */
+      sse_movups(&cp->func, tmp, negs);
+      /* can only replace lowest 32bits, thus move away that part first */
+      emit_pshufd(cp, dst, dst, savemask);
+      sse_movss(&cp->func, dst, tmp);
+      emit_pshufd(cp, dst, dst, (savepos - 1) | (savemask & 0xfc));
+   }
+   if (shuf2) {
+      /* multiply by 1, -1 or 0 picked per lane from the 1,-1,0,0 constant */
+      emit_pshufd(cp, tmp, negs, shuf2);
+      sse_mulps(&cp->func, dst, tmp);
+   }
+   return GL_TRUE;
+}
+
 /* Helper for writemask:
  */
 static GLboolean emit_shuf_copy1( struct compilation *cp,
@@ -595,20 +655,19 @@ static GLboolean emit_DPH( struct compilation *cp, union instruction op )
    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); 
    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); 
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
-   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
-   struct x86_reg tmp = get_xmm_reg(cp);      
+   struct x86_reg tmp = get_xmm_reg(cp);
 
-   emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z));
-   sse_movss(&cp->func, dst, ones);
-   emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z));
+   sse_movups(&cp->func, dst, arg0);
    sse_mulps(&cp->func, dst, arg1);
-   
-   /* Now the hard bit: sum the values (from DP4):
+
+   /* Now the hard bit: sum the values (from DP3):
     */ 
    sse_movhlps(&cp->func, tmp, dst);
-   sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
+   sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
    sse_addss(&cp->func, dst, tmp);
+   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
+   sse_addss(&cp->func, dst, tmp);
    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
    return GL_TRUE;
 }
@@ -985,15 +1044,18 @@ static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
 {
    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
-
-   /* TODO: Calculate absolute value
-    */
 #if 0
+   struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
+
+/* get abs value first. This STILL doesn't work.
+   Looks like we get bogus neg values ?
+*/
    sse_movss(&cp->func, dst, arg0);
    sse_mulss(&cp->func, dst, neg);
    sse_maxss(&cp->func, dst, arg0);
-#endif
 
+   sse_rsqrtss(&cp->func, dst, dst);
+#endif
    sse_rsqrtss(&cp->func, dst, arg0);
    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
    return GL_TRUE;
@@ -1132,7 +1194,7 @@ static GLboolean (* const emit_func[])(struct compilation *, union instruction)
    emit_NOP, /* SSG */
    emit_NOP, /* STR */
    emit_SUB,
-   emit_RSW, /* SWZ */
+   emit_SWZ, /* SWZ */
    emit_NOP, /* TEX */
    emit_NOP, /* TXB */
    emit_NOP, /* TXD */
author	Roland Scheidegger <rscheidegger@gmx.ch>	2006-06-01 22:56:40 +0000
committer	Roland Scheidegger <rscheidegger@gmx.ch>	2006-06-01 22:56:40 +0000
commit	fe57ed4f2566e30384d0c786998842405d8e8990 (patch)
tree	dda940ef6088ccb5ffa89d760f15aebceaaacfa6 /src/mesa/tnl/t_vb_arbprogram_sse.c
parent	4d3ab19abfc262070ff82443bf28fb8b8a616e18 (diff)