From d6213e6023ca96660db5ddc909acad9f21c3f2e9 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <rscheidegger@gmx.ch>
Date: Thu, 8 Jun 2006 17:46:21 +0000
Subject: Improve slightly wrong CMP_SRCS test to avoid unencessary
 instructions. Clean up the r200 vertex program code a bit.

---
 src/mesa/drivers/dri/r200/r200_vertprog.c | 148 +++++++++++-------------------
 src/mesa/drivers/dri/r200/r200_vertprog.h |   3 +-
 2 files changed, 56 insertions(+), 95 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
index cd008df7aa..dfacdc88cc 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -44,44 +44,42 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define SCALAR_FLAG (1<<31)
 #define FLAG_MASK (1<<31)
 #define OP_MASK (0xf)  /* we are unlikely to have more than 15 */
-#define OPN(operator, ip, op) {#operator, OPCODE_##operator, ip, op}
+#define OPN(operator, ip) {#operator, OPCODE_##operator, ip}
 
 static struct{
    char *name;
    int opcode;
    unsigned long ip; /* number of input operands and flags */
-   unsigned long op;
 }op_names[]={
-   OPN(ABS, 1, 1),
-   OPN(ADD, 2, 1),
-   OPN(ARL, 1, 1|SCALAR_FLAG),
-   OPN(DP3, 2, 3|SCALAR_FLAG),
-   OPN(DP4, 2, 3|SCALAR_FLAG),
-   OPN(DPH, 2, 3|SCALAR_FLAG),
-   OPN(DST, 2, 1),
-   OPN(EX2, 1|SCALAR_FLAG, 4|SCALAR_FLAG),
-   OPN(EXP, 1|SCALAR_FLAG, 1),
-   OPN(FLR, 1, 1),
-   OPN(FRC, 1, 1),
-   OPN(LG2, 1|SCALAR_FLAG, 4|SCALAR_FLAG),
-   OPN(LIT, 1, 1),
-   OPN(LOG, 1|SCALAR_FLAG, 1),
-   OPN(MAD, 3, 1),
-   OPN(MAX, 2, 1),
-   OPN(MIN, 2, 1),
-   OPN(MOV, 1, 1),
-   OPN(MUL, 2, 1),
-   OPN(POW, 2|SCALAR_FLAG, 4|SCALAR_FLAG),
-   OPN(RCP, 1|SCALAR_FLAG, 4|SCALAR_FLAG),
-   OPN(RSQ, 1|SCALAR_FLAG, 4|SCALAR_FLAG),
-   OPN(SGE, 2, 1),
-   OPN(SLT, 2, 1),
-   OPN(SUB, 2, 1),
-   OPN(SWZ, 1, 1),
-   OPN(XPD, 2, 1),
-   OPN(RCC, 0, 0), //extra
-   OPN(PRINT, 0, 0),
-   OPN(END, 0, 0),
+   OPN(ABS, 1),
+   OPN(ADD, 2),
+   OPN(ARL, 1|SCALAR_FLAG),
+   OPN(DP3, 2),
+   OPN(DP4, 2),
+   OPN(DPH, 2),
+   OPN(DST, 2),
+   OPN(EX2, 1|SCALAR_FLAG),
+   OPN(EXP, 1|SCALAR_FLAG),
+   OPN(FLR, 1),
+   OPN(FRC, 1),
+   OPN(LG2, 1|SCALAR_FLAG),
+   OPN(LIT, 1),
+   OPN(LOG, 1|SCALAR_FLAG),
+   OPN(MAD, 3),
+   OPN(MAX, 2),
+   OPN(MIN, 2),
+   OPN(MOV, 1),
+   OPN(MUL, 2),
+   OPN(POW, 2|SCALAR_FLAG),
+   OPN(RCP, 1|SCALAR_FLAG),
+   OPN(RSQ, 1|SCALAR_FLAG),
+   OPN(SGE, 2),
+   OPN(SLT, 2),
+   OPN(SUB, 2),
+   OPN(SWZ, 1),
+   OPN(XPD, 2),
+   OPN(PRINT, 0),
+   OPN(END, 0),
 };
 #undef OPN
 
@@ -312,11 +310,14 @@ static unsigned long t_opcode(enum prog_opcode opcode)
 {
 
    switch(opcode){
+   case OPCODE_ADD: return R200_VPI_OUT_OP_ADD;
+   case OPCODE_DP4: return R200_VPI_OUT_OP_DOT;
    case OPCODE_DST: return R200_VPI_OUT_OP_DST;
    case OPCODE_EX2: return R200_VPI_OUT_OP_EX2;
    case OPCODE_EXP: return R200_VPI_OUT_OP_EXP;
    case OPCODE_FRC: return R200_VPI_OUT_OP_FRC;
    case OPCODE_LG2: return R200_VPI_OUT_OP_LG2;
+   case OPCODE_LIT: return R200_VPI_OUT_OP_LIT;
    case OPCODE_LOG: return R200_VPI_OUT_OP_LOG;
    case OPCODE_MAX: return R200_VPI_OUT_OP_MAX;
    case OPCODE_MIN: return R200_VPI_OUT_OP_MIN;
@@ -325,8 +326,6 @@ static unsigned long t_opcode(enum prog_opcode opcode)
    case OPCODE_RSQ: return R200_VPI_OUT_OP_RSQ;
    case OPCODE_SGE: return R200_VPI_OUT_OP_SGE;
    case OPCODE_SLT: return R200_VPI_OUT_OP_SLT;
-   case OPCODE_DP4: return R200_VPI_OUT_OP_DOT;
-   case OPCODE_ADD: return R200_VPI_OUT_OP_ADD;
 
    default: 
       fprintf(stderr, "%s: Should not be called with opcode %d!", __FUNCTION__, opcode);
@@ -350,11 +349,11 @@ static unsigned long op_operands(enum prog_opcode opcode)
 }
 
 /* TODO: Get rid of t_src_class call */
-#define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \
+#define CMP_SRCS(a, b) (((a.RelAddr != b.RelAddr) || (a.Index != b.Index)) && \
 		       ((t_src_class(a.File) == VSF_IN_CLASS_PARAM && \
 			 t_src_class(b.File) == VSF_IN_CLASS_PARAM) || \
 			(t_src_class(a.File) == VSF_IN_CLASS_ATTR && \
-			 t_src_class(b.File) == VSF_IN_CLASS_ATTR)))) \
+			 t_src_class(b.File) == VSF_IN_CLASS_ATTR))) \
 
 /* fglrx on rv250 codes up unused sources as follows:
    unused but necessary sources are same as previous source, zero-ed out.
@@ -599,10 +598,10 @@ static GLboolean r200_translate_vertex_program(struct r200_vertex_program *vp)
       are_srcs_scalar = operands & SCALAR_FLAG;
       operands &= OP_MASK;
 
-      for(i=0; i < operands; i++)
+      for(i = 0; i < operands; i++)
 	 src[i] = vpi->SrcReg[i];
 
-      if(operands == 3){ /* TODO: scalars */
+      if(operands == 3){
 	 if( CMP_SRCS(src[1], src[2]) || CMP_SRCS(src[0], src[2]) ){
 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
 		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
@@ -661,7 +660,6 @@ static GLboolean r200_translate_vertex_program(struct r200_vertex_program *vp)
       case OPCODE_POW:
 /* pow takes only one argument, first scalar is in slot x, 2nd in slot z (other slots don't matter).
    So may need to insert additional instruction */
-/* this appears to be different to r300 */
 	 if ((src[0].File == src[1].File) &&
 	     (src[0].Index == src[1].Index)) {
 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&vpi->DstReg),
@@ -709,12 +707,14 @@ static GLboolean r200_translate_vertex_program(struct r200_vertex_program *vp)
 	 goto next;
 
       case OPCODE_MOV://ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO} 
+      case OPCODE_SWZ:
 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg),
 		t_dst_mask(vpi->DstReg.WriteMask));
 	 o_inst->src0 = t_src(vp, &src[0]);
 	 o_inst->src1 = ZERO_SRC_0;
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
+      
       case OPCODE_MAD:
 	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
 	    src[1].File == PROGRAM_TEMPORARY &&
@@ -768,6 +768,21 @@ else {
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
 
+      case OPCODE_DPH://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W} 
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&vpi->DstReg),
+		t_dst_mask(vpi->DstReg.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+		VSF_IN_COMPONENT_ONE,
+		t_src_class(src[0].File),
+		src[0].NegateBase) | (src[0].RelAddr << 4);
+	 o_inst->src1 = t_src(vp, &src[1]);
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
       case OPCODE_SUB://ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg),
 		t_dst_mask(vpi->DstReg.WriteMask));
@@ -828,46 +843,6 @@ else {
 	 u_temp_i--;
 	 goto next;
 
-      case OPCODE_LG2:// LG2 RESULT 1.X Y Z W PARAM 0{} {X X X X}
-	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_LG2, t_dst(&vpi->DstReg),
-		t_dst_mask(vpi->DstReg.WriteMask));
-
-	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
-		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-		t_src_class(src[0].File),
-		src[0].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
-	 o_inst->src1 = UNUSED_SRC_0;
-	 o_inst->src2 = UNUSED_SRC_0;
-	 goto next;
-
-      case OPCODE_LIT://LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W} 
-	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_LIT, t_dst(&vpi->DstReg),
-		t_dst_mask(vpi->DstReg.WriteMask));
-/* r200 in contrast to r300 does not seem to need any complicated setup,
-   its LIT instruction is "more native" */
-	 o_inst->src0 = t_src(vp, &src[0]);
-	 o_inst->src1 = UNUSED_SRC_0;
-	 o_inst->src2 = UNUSED_SRC_0;
-	 goto next;
-
-      case OPCODE_DPH://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W} 
-	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&vpi->DstReg),
-		t_dst_mask(vpi->DstReg.WriteMask));
-
-	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
-		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
-		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
-		VSF_IN_COMPONENT_ONE,
-		t_src_class(src[0].File),
-		src[0].NegateBase) | (src[0].RelAddr << 4);
-	 o_inst->src1 = t_src(vp, &src[1]);
-	 o_inst->src2 = UNUSED_SRC_1;
-	 goto next;
-
       case OPCODE_XPD:
 	 /* mul r0, r1.yzxw, r2.zxyw
 	    mad r0, -r2.yzxw, r1.zxyw, r0
@@ -924,23 +899,8 @@ else {
 		VSF_IN_COMPONENT_W,
 		VSF_IN_CLASS_TMP,
 		VSF_FLAG_NONE);
-
-	 goto next;
-
-      case OPCODE_SWZ:
-	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&vpi->DstReg),
-		t_dst_mask(vpi->DstReg.WriteMask));
-	 o_inst->src0 = t_src(vp, &src[0]);
-	 o_inst->src1 = ZERO_SRC_0;
-	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
 
-      case OPCODE_RCC:
-	 if (R200_DEBUG & DEBUG_FALLBACKS) {
-	    fprintf(stderr, "Don't know how to handle op %d yet\n", vpi->Opcode);
-	 }
-	 return GL_FALSE;
-      break;
       case OPCODE_END:
 	 break;
       default:
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.h b/src/mesa/drivers/dri/r200/r200_vertprog.h
index 00ad2dd1b3..0fbe5eec7d 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.h
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.h
@@ -72,7 +72,8 @@ extern void r200SetupVertexProg( GLcontext *ctx );
 #define R200_VSF_OUT_CLASS_RESULT_POINTSIZE	(8 << 8)
 #define R200_VSF_OUT_CLASS_MASK			(31 << 8)
 
-/* opcodes - they all are the same as on r300 it seems */
+/* opcodes - they all are the same as on r300 it seems, however
+   LIT and POW require different setup */
 #define R200_VPI_OUT_OP_DOT                     (1 << 0)
 #define R200_VPI_OUT_OP_MUL                     (2 << 0)
 #define R200_VPI_OUT_OP_ADD                     (3 << 0)
-- 
cgit v1.2.3