r300: Fix fragment program instruction pairing and register allocation

There were a number of bugs related to the pairing of vector and scalar operations where swizzles ended up using the wrong source register, or an instruction was moved forward and ended up overwriting an aliased register. The new algorithm for register allocation is quite conservative and may run out of registers before necessary. On the plus side, It Just Works. Pairing is done whenever possible, and in more cases than before, so in practice this change should be a net win.
author: Nicolai Haehnle <prefect@upb.de> 2007-03-18 02:15:56 +0100
committer: Nicolai Haehnle <nhaehnle@gmail.com> 2007-03-19 18:38:07 +0100
commit: 7b430acd71f04dce3e21bdcfe70115a23d751f30 (patch)
tree: a50e3628283ba79336b6eb74ca9da061f8776917 /src/mesa/drivers/dri/r300/r300_fragprog.c
parent: 07db8c9115c0b07d79be778976e25f8eb18d42a2 (diff)
1 files changed, 509 insertions, 265 deletions
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 251fd26082..b2c89ccb36 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -94,8 +94,9 @@
 #define REG_NEGV_SHIFT		18
 #define REG_NEGS_SHIFT		19
 #define REG_ABS_SHIFT		20
-#define REG_NO_USE_SHIFT	21
-#define REG_VALID_SHIFT		22
+#define REG_NO_USE_SHIFT	21 // Hack for refcounting
+#define REG_VALID_SHIFT		22 // Does the register contain a defined value?
+#define REG_BUILTIN_SHIFT   23 // Is it a builtin (like all zero/all one)?
 
 #define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
 #define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
@@ -106,12 +107,14 @@
 #define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
 #define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
 #define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
+#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)
 
-#define REG(type, index, vswz, sswz, nouse, valid)			\
+#define REG(type, index, vswz, sswz, nouse, valid, builtin)	\
 	(((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
 	 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
 	 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
 	 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
+	 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
 	 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
 	 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 #define REG_GET_TYPE(reg)						\
@@ -126,6 +129,8 @@
 	((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
 #define REG_GET_VALID(reg)						\
 	((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
+#define REG_GET_BUILTIN(reg)						\
+	((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
 #define REG_SET_TYPE(reg, type)						\
 	reg = ((reg & ~REG_TYPE_MASK) |					\
 	       ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
@@ -144,6 +149,9 @@
 #define REG_SET_VALID(reg, valid)					\
 	reg = ((reg & ~REG_VALID_MASK) |				\
 	       ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
+#define REG_SET_BUILTIN(reg, builtin)					\
+	reg = ((reg & ~REG_BUILTIN_MASK) |				\
+	       ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
 #define REG_ABS(reg)							\
 	reg = (reg | REG_ABS_MASK)
 #define REG_NEGV(reg)							\
@@ -184,9 +192,6 @@ static const struct {
  *
  * REG_VSWZ/REG_SSWZ is an index into this table
  */
-#define SLOT_VECTOR	(1<<0)
-#define SLOT_SCALAR	(1<<3)
-#define SLOT_BOTH	(SLOT_VECTOR | SLOT_SCALAR)
 
 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
 #define SWIZZLE_HALF 6
@@ -202,14 +207,14 @@ static const struct r300_pfs_swizzle {
 	GLuint flags;
 } v_swiz[] = {
 /* native swizzles */
-	{ MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_VECTOR },
-	{ MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_VECTOR },
-	{ MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_VECTOR },
-	{ MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_VECTOR },
-	{ MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A,     1, SLOT_SCALAR },
-	{ MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_VECTOR },
-	{ MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_VECTOR },
-	{ MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_BOTH },
+	{ MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A,     1, SLOT_SRC_SCALAR },
+	{ MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH },
 	{ MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
 	{ MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
 	{ MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
@@ -241,10 +246,10 @@ static const struct {
 	int stride;	/* difference between SRC0/1/2 */
 	GLuint flags;
 } s_swiz[] = {
-	{ R300_FPI2_ARGA_SRC0C_X, 3, SLOT_VECTOR },
-	{ R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_VECTOR },
-	{ R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_VECTOR },
-	{ R300_FPI2_ARGA_SRC0A  , 1, SLOT_SCALAR },
+	{ R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR },
+	{ R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR },
+	{ R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR },
+	{ R300_FPI2_ARGA_SRC0A  , 1, SLOT_SRC_SCALAR },
 	{ R300_FPI2_ARGA_ZERO   , 0, 0 },
 	{ R300_FPI2_ARGA_ONE    , 0, 0 },
 	{ R300_FPI2_ARGA_HALF   , 0, 0 }
@@ -256,6 +261,7 @@ static const GLuint undef = REG(REG_TYPE_TEMP,
 				SWIZZLE_XYZ,
 				SWIZZLE_W,
 				GL_FALSE,
+				GL_FALSE,
 				GL_FALSE);
 
 /* constant one source */
@@ -264,6 +270,7 @@ static const GLuint pfs_one = REG(REG_TYPE_CONST,
 				  SWIZZLE_111,
 				  SWIZZLE_ONE,
 				  GL_FALSE,
+				  GL_TRUE,
 				  GL_TRUE);
 
 /* constant half source */
@@ -272,6 +279,7 @@ static const GLuint pfs_half = REG(REG_TYPE_CONST,
 				   SWIZZLE_HHH,
 				   SWIZZLE_HALF,
 				   GL_FALSE,
+				   GL_TRUE,
 				   GL_TRUE);
 
 /* constant zero source */
@@ -280,6 +288,7 @@ static const GLuint pfs_zero = REG(REG_TYPE_CONST,
 				   SWIZZLE_000,
 				   SWIZZLE_ZERO,
 				   GL_FALSE,
+				   GL_TRUE,
 				   GL_TRUE);
 
 /*
@@ -291,47 +300,105 @@ static void emit_arith(struct r300_fragment_program *rp, int op,
 				GLuint src0, GLuint src1, GLuint src2,
 				int flags);
 
-/*
- * Helper functions prototypes
+/**
+ * Get an R300 temporary that can be written to in the given slot.
  */
-static int get_hw_temp(struct r300_fragment_program *rp)
+static int get_hw_temp(struct r300_fragment_program *rp, int slot)
 {
 	COMPILE_STATE;
-	int r = ffs(~cs->hwreg_in_use);
-	if (!r) {
+	int r;
+	
+	for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
+		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
+			break;
+	}
+	
+	if (r >= PFS_NUM_TEMP_REGS) {
 		ERROR("Out of hardware temps\n");
 		return 0;
 	}
-
-	cs->hwreg_in_use |= (1 << --r);
+	
+	// Reserved is used to avoid the following scenario:
+	//  R300 temporary X is first assigned to Mesa temporary Y during vector ops
+	//  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
+	//  Then scalar ops on Mesa temporary Z are emitted and move back in time
+	//  to overwrite the value of temporary Y.
+	// End scenario.
+	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
+	cs->hwtemps[r].free = -1;
+	
+	// Reset to some value that won't mess things up when the user
+	// tries to read from a temporary that hasn't been assigned a value yet.
+	// In the normal case, vector_valid and scalar_valid should be set to
+	// a sane value by the first emit that writes to this temporary.
+	cs->hwtemps[r].vector_valid = 0;
+	cs->hwtemps[r].scalar_valid = 0;
+	
 	if (r > rp->max_temp_idx)
 		rp->max_temp_idx = r;
-
+	
 	return r;
 }
 
+/**
+ * Get an R300 temporary that will act as a TEX destination register.
+ */
 static int get_hw_temp_tex(struct r300_fragment_program *rp)
 {
 	COMPILE_STATE;
 	int r;
 
-	r = ffs(~(cs->hwreg_in_use | cs->used_in_node));
-	if (!r)
-		return get_hw_temp(rp); /* Will cause an indirection */
+	for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
+		if (cs->used_in_node & (1 << r))
+			continue;
+		
+		// Note: Be very careful here
+		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
+			break;
+	}
+	
+	if (r >= PFS_NUM_TEMP_REGS)
+		return get_hw_temp(rp, 0); /* Will cause an indirection */
 
-	cs->hwreg_in_use |= (1 << --r);
+	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
+	cs->hwtemps[r].free = -1;
+	
+	// Reset to some value that won't mess things up when the user
+	// tries to read from a temporary that hasn't been assigned a value yet.
+	// In the normal case, vector_valid and scalar_valid should be set to
+	// a sane value by the first emit that writes to this temporary.
+	cs->hwtemps[r].vector_valid = cs->nrslots;
+	cs->hwtemps[r].scalar_valid = cs->nrslots;
+	
 	if (r > rp->max_temp_idx)
 		rp->max_temp_idx = r;
 
 	return r;
 }
 
+/**
+ * Mark the given hardware register as free.
+ */
 static void free_hw_temp(struct r300_fragment_program *rp, int idx)
 {
 	COMPILE_STATE;
-	cs->hwreg_in_use &= ~(1<<idx);
+	
+	// Be very careful here. Consider sequences like
+	//  MAD r0, r1,r2,r3
+	//  TEX r4, ...
+	// The TEX instruction may be moved in front of the MAD instruction
+	// due to the way nodes work. We don't want to alias r1 and r4 in
+	// this case.
+	// I'm certain the register allocation could be further sanitized,
+	// but it's tricky because of stuff that can happen inside emit_tex
+	// and emit_arith.
+	cs->hwtemps[idx].free = cs->nrslots+1;
 }
 
+
+/**
+ * Create a new Mesa temporary register.
+ */
 static GLuint get_temp_reg(struct r300_fragment_program *rp)
 {
 	COMPILE_STATE;
@@ -354,6 +421,10 @@ static GLuint get_temp_reg(struct r300_fragment_program *rp)
 	return r;
 }
 
+/**
+ * Create a new Mesa temporary register that will act as the destination
+ * register for a texture read.
+ */
 static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
 {
 	COMPILE_STATE;
@@ -376,6 +447,9 @@ static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
 	return r;
 }
 
+/**
+ * Free a Mesa temporary and the associated R300 temporary.
+ */
 static void free_temp(struct r300_fragment_program *rp, GLuint r)
 {
 	COMPILE_STATE;
@@ -762,10 +836,10 @@ static int t_hw_src(struct r300_fragment_program *rp,
 	switch(REG_GET_TYPE(src)) {
 	case REG_TYPE_TEMP:
 		/* NOTE: if reg==-1 here, a source is being read that
-		 * 	 hasn't been written to. Undefined results
+		 * 	 hasn't been written to. Undefined results.
 		 */
 		if (cs->temps[index].reg == -1)
-			cs->temps[index].reg = get_hw_temp(rp);
+			cs->temps[index].reg = get_hw_temp(rp, cs->nrslots);
 
 		idx = cs->temps[index].reg;
 
@@ -795,7 +869,8 @@ static int t_hw_src(struct r300_fragment_program *rp,
 
 static int t_hw_dst(struct r300_fragment_program *rp,
 		    GLuint dest,
-		    GLboolean tex)
+		    GLboolean tex,
+		    int slot)
 {
 	COMPILE_STATE;
 	int idx;
@@ -806,7 +881,7 @@ static int t_hw_dst(struct r300_fragment_program *rp,
 	case REG_TYPE_TEMP:
 		if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
 			if (!tex) {
-				cs->temps[index].reg = get_hw_temp(rp);
+				cs->temps[index].reg = get_hw_temp(rp, slot);
 			} else {
 				cs->temps[index].reg = get_hw_temp_tex(rp);
 			}
@@ -839,26 +914,20 @@ static int t_hw_dst(struct r300_fragment_program *rp,
 	return idx;
 }
 
-static void emit_nop(struct r300_fragment_program *rp,
-		     GLuint mask,
-		     GLboolean sync)
+static void emit_nop(struct r300_fragment_program *rp)
 {
 	COMPILE_STATE;
 	
-	if (sync)
-		cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
-
-	if (mask & WRITEMASK_XYZ) {
-		rp->alu.inst[cs->v_pos].inst0 = NOP_INST0;
-		rp->alu.inst[cs->v_pos].inst1 = NOP_INST1;
-		cs->v_pos++;
-	}
-
-	if (mask & WRITEMASK_W) {
-		rp->alu.inst[cs->s_pos].inst2 = NOP_INST2;
-		rp->alu.inst[cs->s_pos].inst3 = NOP_INST3;
-		cs->s_pos++;
+	if (cs->nrslots >= PFS_MAX_ALU_INST) {
+		ERROR("Out of ALU instruction slots\n");
+		return;
 	}
+	
+	rp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
+	rp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
+	rp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
+	rp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
+	cs->nrslots++;
 }
 
 static void emit_tex(struct r300_fragment_program *rp,
@@ -882,7 +951,7 @@ static void emit_tex(struct r300_fragment_program *rp,
 			rdest = dest;
 			dest = get_temp_reg_tex(rp);
 		}
-		hwdest = t_hw_dst(rp, dest, GL_TRUE);
+		hwdest = t_hw_dst(rp, dest, GL_TRUE, rp->node[rp->cur_node].alu_offset);
 		
 		/* Use a temp that hasn't been used in this node, rather
 		 * than causing an indirection
@@ -904,15 +973,11 @@ static void emit_tex(struct r300_fragment_program *rp,
 	     (din & (1<<hwsrc))) || (uin & (1<<hwdest))) {
 			
 		/* Finish off current node */
-		cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
-		if (rp->node[rp->cur_node].alu_offset == cs->v_pos) {
-			/* No alu instructions in the node? Emit a NOP. */
-			emit_nop(rp, WRITEMASK_XYZW, GL_TRUE);
-			cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
-		}
+		if (rp->node[rp->cur_node].alu_offset == cs->nrslots)
+			emit_nop(rp);
 				
 		rp->node[rp->cur_node].alu_end =
-				cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
+				cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
 		assert(rp->node[rp->cur_node].alu_end >= 0);
 
 		if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) {
@@ -922,7 +987,7 @@ static void emit_tex(struct r300_fragment_program *rp,
 
 		/* Start new node */
 		rp->node[rp->cur_node].tex_offset = rp->tex.length;
-		rp->node[rp->cur_node].alu_offset = cs->v_pos;
+		rp->node[rp->cur_node].alu_offset = cs->nrslots;
 		rp->node[rp->cur_node].tex_end = -1;
 		rp->node[rp->cur_node].alu_end = -1;	
 		rp->node[rp->cur_node].flags = 0;
@@ -954,84 +1019,243 @@ static void emit_tex(struct r300_fragment_program *rp,
 	}
 }
 
-/* Add sources to FPI1/FPI3 lists.  If source is already on list,
- * reuse the index instead of wasting a source.
+
+/**
+ * Returns the first slot where we could possibly allow writing to dest,
+ * according to register allocation.
  */
-static int add_src(struct r300_fragment_program *rp,
-		   int reg,
-		   int pos,
-		   int srcmask)
+static int get_earliest_allowed_write(
+		struct r300_fragment_program* rp,
+		GLuint dest)
 {
 	COMPILE_STATE;
-	int csm, i;
-	
-	/* Look for matches */
-	for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {	
-		/* If sources have been allocated in this position(s)... */
-		if ((cs->slot[pos].umask & csm) == csm) {
-			/* ... and the register number(s) match, re-use the
-			   source */
-			if (srcmask == SLOT_VECTOR &&
-			    cs->slot[pos].vsrc[i] == reg)
-				return i;
-			if (srcmask == SLOT_SCALAR &&
-			    cs->slot[pos].ssrc[i] == reg)
-				return i;
-			if (srcmask == SLOT_BOTH &&
-			    cs->slot[pos].vsrc[i] == reg &&
-			    cs->slot[pos].ssrc[i] == reg)
-				return i;
-		}
-	}
+	int idx;
+	GLuint index = REG_GET_INDEX(dest);
+	assert(REG_GET_VALID(dest));
 
-	/* Look for free spaces */
-	for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {
-		/* If the position(s) haven't been allocated */
-		if ((cs->slot[pos].umask & csm) == 0) {
-			cs->slot[pos].umask |= csm;
-
-			if (srcmask & SLOT_VECTOR)
-				cs->slot[pos].vsrc[i] = reg;
-			if (srcmask & SLOT_SCALAR)
-				cs->slot[pos].ssrc[i] = reg;
-			return i;
-		}	
+	switch(REG_GET_TYPE(dest)) {
+		case REG_TYPE_TEMP:
+			if (cs->temps[index].reg == -1)
+				return 0;
+			
+			idx = cs->temps[index].reg;
+			break;
+		case REG_TYPE_OUTPUT:
+			return 0;
+		default:
+			ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+			return 0;
 	}
 	
-	//ERROR("Failed to allocate sources in FPI1/FPI3!\n");
-	return 0;
+	return cs->hwtemps[idx].reserved;
 }
 
-/* Determine whether or not to position opcode in the same ALU slot for both
- * vector and scalar portions of an instruction.
+
+/**
+ * Allocates a slot for an ALU instruction that can consist of
+ * a vertex part or a scalar part or both.
+ *
+ * Sources from src (src[0] to src[argc-1]) are added to the slot in the
+ * appropriate position (vector and/or scalar), and their positions are
+ * recorded in the srcpos array.
+ *
+ * This function emits instruction code for the source fetch and the
+ * argument selection. It does not emit instruction code for the
+ * opcode or the destination selection.
  *
- * It's not necessary to force the first case, but it makes disassembled
- * shaders easier to read.
+ * @return the index of the slot
  */
-static GLboolean force_same_slot(int vop,
-				 int sop,
-				 GLboolean emit_vop,
-				 GLboolean emit_sop,
-				 int argc,
-				 GLuint *src)
+static int find_and_prepare_slot(struct r300_fragment_program* rp,
+		GLboolean emit_vop,
+		GLboolean emit_sop,
+		int argc,
+		GLuint* src,
+		GLuint dest)
 {
-	int i;
-
-	if (emit_vop && emit_sop)
-		return GL_TRUE;
+	COMPILE_STATE;
+	int hwsrc[3];
+	int srcpos[3];
+	unsigned int used;
+	int tempused;
+	int tempvsrc[3];
+	int tempssrc[3];
+	int pos;
+	int regnr;
+	int i,j;
+	
+	// Determine instruction slots, whether sources are required on
+	// vector or scalar side, and the smallest slot number where
+	// all source registers are available
+	used = 0;
+	if (emit_vop)
+		used |= SLOT_OP_VECTOR;
+	if (emit_sop)
+		used |= SLOT_OP_SCALAR;
+	
+	pos = get_earliest_allowed_write(rp, dest);
+	
+	if (rp->node[rp->cur_node].alu_offset > pos)
+		pos = rp->node[rp->cur_node].alu_offset;
+	for(i = 0; i < argc; ++i) {
+		if (!REG_GET_BUILTIN(src[i])) {
+			if (emit_vop)
+				used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
+			if (emit_sop)
+				used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
+		}
+		
+		hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
+		regnr = hwsrc[i] & 31;
+		
+		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+			if (used & (SLOT_SRC_VECTOR << i)) {
+				if (cs->hwtemps[regnr].vector_valid > pos)
+					pos = cs->hwtemps[regnr].vector_valid;
+			}
+			if (used & (SLOT_SRC_SCALAR << i)) {
+				if (cs->hwtemps[regnr].scalar_valid > pos)
+					pos = cs->hwtemps[regnr].scalar_valid;
+			}
+		}
+	}
+	
+	// Find a slot that fits
+	for(; ; ++pos) {
+		if (cs->slot[pos].used & used & SLOT_OP_BOTH)
+			continue;
+		
+		if (pos >= cs->nrslots) {
+			if (cs->nrslots >= PFS_MAX_ALU_INST) {
+				ERROR("Out of ALU instruction slots\n");
+				return -1;
+			}
 
-	if (emit_vop && vop == R300_FPI0_OUTC_REPL_ALPHA)
-		return GL_TRUE;
+			rp->alu.inst[pos].inst0 = NOP_INST0;
+			rp->alu.inst[pos].inst2 = NOP_INST2;
 
+			cs->nrslots++;
+		}
+		
+		// Note: When we need both parts (vector and scalar) of a source,
+		// we always try to put them into the same position. This makes the
+		// code easier to read, and it is optimal (i.e. one doesn't gain
+		// anything by splitting the parts).
+		// It also avoids headaches with swizzles that access both parts (i.e WXY)
+		tempused = cs->slot[pos].used;
+		for(i = 0; i < 3; ++i) {
+			tempvsrc[i] = cs->slot[pos].vsrc[i];
+			tempssrc[i] = cs->slot[pos].ssrc[i];
+		}
+		
+		for(i = 0; i < argc; ++i) {
+			int flags = (used >> i) & SLOT_SRC_BOTH;
+			
+			if (!flags) {
+				srcpos[i] = 0;
+				continue;
+			}
+			
+			for(j = 0; j < 3; ++j) {
+				if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
+					if (tempvsrc[j] != hwsrc[i])
+						continue;
+				}
+			
+				if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
+					if (tempssrc[j] != hwsrc[i])
+						continue;
+				}
+				
+				break;
+			}
+			
+			if (j == 3)
+				break;
+			
+			srcpos[i] = j;
+			tempused |= flags << j;
+			if (flags & SLOT_SRC_VECTOR)
+				tempvsrc[j] = hwsrc[i];
+			if (flags & SLOT_SRC_SCALAR)
+				tempssrc[j] = hwsrc[i];
+		}
+		
+		if (i == argc)
+			break;
+	}
+	
+	// Found a slot, reserve it
+	cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
+	for(i = 0; i < 3; ++i) {
+		cs->slot[pos].vsrc[i] = tempvsrc[i];
+		cs->slot[pos].ssrc[i] = tempssrc[i];
+	}
+	
+	// Emit the source fetch code
+	rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
+	rp->alu.inst[pos].inst1 |=
+			((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
+			 (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
+			 (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
+	
+	rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
+	rp->alu.inst[pos].inst3 |=
+			((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
+			 (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
+			 (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
+	
+	// Emit the argument selection code
 	if (emit_vop) {
-		for (i=0;i<argc;i++)
-			if (REG_GET_VSWZ(src[i]) == SWIZZLE_WZY)
-				return GL_TRUE;
+		int swz[3];
+		
+		for(i = 0; i < 3; ++i) {
+			if (i < argc) {
+				swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
+				            (srcpos[i] * v_swiz[REG_GET_VSWZ(src[i])].stride)) |
+					((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
+					((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
+			} else {
+				swz[i] = R300_FPI0_ARGC_ZERO;
+			}
+		}
+		
+		rp->alu.inst[pos].inst0 &=
+				~(R300_FPI0_ARG0C_MASK|R300_FPI0_ARG1C_MASK|R300_FPI0_ARG2C_MASK);
+		rp->alu.inst[pos].inst0 |=
+				(swz[0] << R300_FPI0_ARG0C_SHIFT) |
+				(swz[1] << R300_FPI0_ARG1C_SHIFT) |
+				(swz[2] << R300_FPI0_ARG2C_SHIFT);
+	}
+	
+	if (emit_sop) {
+		int swz[3];
+		
+		for(i = 0; i < 3; ++i) {
+			if (i < argc) {
+				swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
+						(srcpos[i] * s_swiz[REG_GET_SSWZ(src[i])].stride)) |
+						((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
+						((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
+			} else {
+				swz[i] = R300_FPI2_ARGA_ZERO;
+			}
+		}
+		
+		rp->alu.inst[pos].inst2 &=
+				~(R300_FPI2_ARG0A_MASK|R300_FPI2_ARG1A_MASK|R300_FPI2_ARG2A_MASK);
+		rp->alu.inst[pos].inst2 |=
+				(swz[0] << R300_FPI2_ARG0A_SHIFT) |
+				(swz[1] << R300_FPI2_ARG1A_SHIFT) |
+				(swz[2] << R300_FPI2_ARG2A_SHIFT);
 	}
 
-	return GL_FALSE;
+	return pos;
 }
 
+
+/**
+ * Append an ALU instruction to the instruction list.
+ */
 static void emit_arith(struct r300_fragment_program *rp,
 		       int op,
 		       GLuint dest,
@@ -1043,87 +1267,31 @@ static void emit_arith(struct r300_fragment_program *rp,
 {
 	COMPILE_STATE;
 	GLuint src[3] = { src0, src1, src2 };
-	int hwsrc[3], sswz[3], vswz[3];
 	int hwdest;
-	GLboolean emit_vop = GL_FALSE, emit_sop = GL_FALSE;
+	GLboolean emit_vop, emit_sop;
 	int vop, sop, argc;
-	int vpos, spos;
-	int i;
+	int pos;
 
 	vop = r300_fpop[op].v_op;
 	sop = r300_fpop[op].s_op;
 	argc = r300_fpop[op].argc;
 
+	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
+	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR)
+		mask &= ~WRITEMASK_XYZ;
+	
+	emit_vop = GL_FALSE;
+	emit_sop = GL_FALSE;
 	if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
 		emit_vop = GL_TRUE;
 	if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
 		emit_sop = GL_TRUE;
 
-	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
-	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR)
-		emit_vop = GL_FALSE;
-					
-	if (force_same_slot(vop, sop, emit_vop, emit_sop, argc, src)) {
-		vpos = spos = MAX2(cs->v_pos, cs->s_pos);
-	} else {
-		vpos = cs->v_pos;
-		spos = cs->s_pos;
-		/* Here is where we'd decide on where a safe place is to
-		 * combine this instruction with a previous one.
-		 *
-		 * This is extremely simple for now.. if a source depends
-		 * on the opposite stream, force the same instruction.
-		 */
-		for (i=0;i<3;i++) {
-			if (emit_vop &&
-			    (v_swiz[REG_GET_VSWZ(src[i])].flags & SLOT_SCALAR)) {
-				vpos = spos = MAX2(vpos, spos);
-				break;
-			}
-			if (emit_sop &&
-			    (s_swiz[REG_GET_SSWZ(src[i])].flags & SLOT_VECTOR)) {
-				vpos = spos = MAX2(vpos, spos);
-				break;
-			}
-		}
-	}
+	pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest);
+	if (pos < 0)
+		return;
 	
-	/* - Convert src->hwsrc, record for FPI1/FPI3
-	 * - Determine ARG parts of FPI0/FPI2, unused args are filled
-	 *   with ARG_ZERO.
-	 */	
-	for (i=0;i<3;i++) {
-		int srcpos;
-		
-		if (i >= argc) {
-			vswz[i] = R300_FPI0_ARGC_ZERO;
-			sswz[i] = R300_FPI2_ARGA_ZERO;
-			continue;
-		}
-		
-		hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE);	
-
-		if (emit_vop && vop != R300_FPI0_OUTC_REPL_ALPHA) {
-			srcpos = add_src(rp, hwsrc[i], vpos,
-					 v_swiz[REG_GET_VSWZ(src[i])].flags);
-			vswz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
-				   (srcpos *
-				    v_swiz[REG_GET_VSWZ(src[i])].stride)) |
-				((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
-				((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
-		} else vswz[i] = R300_FPI0_ARGC_ZERO;
-		
-		if (emit_sop) {
-			srcpos = add_src(rp, hwsrc[i], spos,
-					 s_swiz[REG_GET_SSWZ(src[i])].flags);
-			sswz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
-				   (srcpos *
-				    s_swiz[REG_GET_SSWZ(src[i])].stride)) |
-				((src[i] & REG_NEGS_MASK) ? ARG_NEG : 0) |
-				((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
-		} else sswz[i] = R300_FPI2_ARGA_ZERO;
-	}
-	hwdest = t_hw_dst(rp, dest, GL_FALSE);
+	hwdest = t_hw_dst(rp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
 	
 	if (flags & PFS_FLAG_SAT) {
 		vop |= R300_FPI0_OUTC_SAT;
@@ -1131,58 +1299,45 @@ static void emit_arith(struct r300_fragment_program *rp,
 	}
 
 	/* Throw the pieces together and get FPI0/1 */
-	rp->alu.inst[vpos].inst1 =
-			((cs->slot[vpos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
-			 (cs->slot[vpos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
-			 (cs->slot[vpos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
 	if (emit_vop) {
-		rp->alu.inst[vpos].inst0 = vop |
-				(vswz[0] << R300_FPI0_ARG0C_SHIFT) |
-				(vswz[1] << R300_FPI0_ARG1C_SHIFT) |
-				(vswz[2] << R300_FPI0_ARG2C_SHIFT);
+		rp->alu.inst[pos].inst0 |= vop;
 
-		rp->alu.inst[vpos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
+		rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
+		
 		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 			if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-				rp->alu.inst[vpos].inst1 |=
+				rp->alu.inst[pos].inst1 |=
 					(mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
 			} else assert(0);
 		} else {
-			rp->alu.inst[vpos].inst1 |=
+			rp->alu.inst[pos].inst1 |=
 					(mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT;
+			
+			cs->hwtemps[hwdest].vector_valid = pos+1;
 		}
-		cs->v_pos = vpos+1;
-	} else if (spos >= vpos)
-		rp->alu.inst[spos].inst0 = NOP_INST0;
+	}
 
 	/* And now FPI2/3 */
-	rp->alu.inst[spos].inst3 =
-			((cs->slot[spos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
-			 (cs->slot[spos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
-			 (cs->slot[spos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
 	if (emit_sop) {
-		rp->alu.inst[spos].inst2 = sop |
-				sswz[0] << R300_FPI2_ARG0A_SHIFT |
-				sswz[1] << R300_FPI2_ARG1A_SHIFT |
-				sswz[2] << R300_FPI2_ARG2A_SHIFT;
+		rp->alu.inst[pos].inst2 |= sop;
 
 		if (mask & WRITEMASK_W) {
 			if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 				if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-					rp->alu.inst[spos].inst3 |= 
+					rp->alu.inst[pos].inst3 |= 
 							(hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT;
 				} else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
-					rp->alu.inst[spos].inst3 |= R300_FPI3_DSTA_DEPTH;
+					rp->alu.inst[pos].inst3 |= R300_FPI3_DSTA_DEPTH;
 				} else assert(0);
 			} else {
-				rp->alu.inst[spos].inst3 |=
+				rp->alu.inst[pos].inst3 |=
 						(hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG;
+			
+				cs->hwtemps[hwdest].scalar_valid = pos+1;
 			}
 		}
-		cs->s_pos = spos+1;
-	} else if (vpos >= spos)
-		rp->alu.inst[vpos].inst2 = NOP_INST2;
-
+	}
+	
 	return;
 }
 
@@ -1922,7 +2077,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 	for (i=0;i<rp->ctx->Const.MaxTextureUnits;i++) {
 		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
 			cs->inputs[FRAG_ATTRIB_TEX0+i].refcount = 0;
-			cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp);
+			cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp, 0);
 		}
 	}
 	InputsRead &= ~FRAG_BITS_TEX_ANY;
@@ -1930,7 +2085,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 	/* fragment position treated as a texcoord */
 	if (InputsRead & FRAG_BIT_WPOS) {
 		cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp);
+		cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp, 0);
 		insert_wpos(&mp->Base);
 	}
 	InputsRead &= ~FRAG_BIT_WPOS;
@@ -1938,14 +2093,14 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 	/* Then primary colour */
 	if (InputsRead & FRAG_BIT_COL0) {
 		cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp);
+		cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0);
 	}
 	InputsRead &= ~FRAG_BIT_COL0;
 	
 	/* Secondary color */
 	if (InputsRead & FRAG_BIT_COL1) {
 		cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp);
+		cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp, 0);
 	}
 	InputsRead &= ~FRAG_BIT_COL1;
 
@@ -2030,13 +2185,12 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
 		}
 		
 		/* Finish off */
-		cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
 		rp->node[rp->cur_node].alu_end =
-				cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
+				cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
 		if (rp->node[rp->cur_node].tex_end < 0)
 			rp->node[rp->cur_node].tex_end = 0;
 		rp->alu_offset = 0;
-		rp->alu_end    = cs->v_pos - 1;
+		rp->alu_end    = cs->nrslots - 1;
 		rp->tex_offset = 0;
 		rp->tex_end    = rp->tex.length ? rp->tex.length - 1 : 0;
 		assert(rp->node[rp->cur_node].alu_end >= 0);
@@ -2053,7 +2207,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
 /* just some random things... */
 static void dump_program(struct r300_fragment_program *rp)
 {
-	int i;
+	int n, i, j;
 	static int pc = 0;
 
 	fprintf(stderr, "pc=%d*************************************\n", pc++);
@@ -2066,46 +2220,136 @@ static void dump_program(struct r300_fragment_program *rp)
 	fprintf(stderr, "Hardware program\n");
 	fprintf(stderr, "----------------\n");
 	
-	fprintf(stderr, "tex:\n");
-	
-	for(i=0;i<rp->tex.length;i++) {
-		fprintf(stderr, "%08x\n", rp->tex.inst[i]);
-	}
-	
-	for (i=0;i<(rp->cur_node+1);i++) {
+	for (n = 0; n < (rp->cur_node+1); n++) {
 		fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\
-			"alu_end: %d, tex_end: %d\n", i,
-			rp->node[i].alu_offset,
-			rp->node[i].tex_offset,
-			rp->node[i].alu_end,
-			rp->node[i].tex_end);
+			"alu_end: %d, tex_end: %d\n", n,
+			rp->node[n].alu_offset,
+			rp->node[n].tex_offset,
+			rp->node[n].alu_end,
+			rp->node[n].tex_end);
+		
+		if (rp->tex.length) {
+			fprintf(stderr, "  TEX:\n");
+			for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_end; ++i)
+				fprintf(stderr, "    %08x\n", rp->tex.inst[i]);
+		}
+		
+		for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_end; ++i) {
+			char srcc[3][10], dstc[20];
+			char srca[3][10], dsta[20];
+			char argc[3][20];
+			char arga[3][20];
+			
+			for(j = 0; j < 3; ++j) {
+				int regc = rp->alu.inst[i].inst1 >> (j*6);
+				int rega = rp->alu.inst[i].inst3 >> (j*6);
+				
+				sprintf(srcc[j], "%c%i", (regc & 32) ? 'c' : 't', regc & 31);
+				sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31);
+			}
+			
+			sprintf(dstc, "t%i.%c%c%c o%i.%c%c%c",
+					(rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? 'x' : ' ',
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? 'y' : ' ',
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? 'z' : ' ',
+					(rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? 'x' : ' ',
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? 'y' : ' ',
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? 'z' : ' ');
+			
+			sprintf(dsta, "t%i.%c o%i.%c %c",
+					(rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31,
+					(rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) ? 'w' : ' ',
+					(rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31,
+					(rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) ? 'w' : ' ',
+					(rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) ? 'Z' : ' ');
+			
+			fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
+			                "       w: %3s %3s %3s -> %-20s (%08x)\n",
+					i,
+					srcc[0], srcc[1], srcc[2], dstc, rp->alu.inst[i].inst1,
+					srca[0], srca[1], srca[2], dsta, rp->alu.inst[i].inst3);
+			
+			for(j = 0; j < 3; ++j) {
+				int regc = rp->alu.inst[i].inst0 >> (j*7);
+				int rega = rp->alu.inst[i].inst2 >> (j*7);
+				int d;
+				char buf[20];
+
+				d = regc & 31;
+				if (d < 12) {
+					switch(d % 4) {
+						case R300_FPI0_ARGC_SRC0C_XYZ:
+							sprintf(buf, "%s.xyz", srcc[d / 4]);
+							break;
+						case R300_FPI0_ARGC_SRC0C_XXX:
+							sprintf(buf, "%s.xxx", srcc[d / 4]);
+							break;
+						case R300_FPI0_ARGC_SRC0C_YYY:
+							sprintf(buf, "%s.yyy", srcc[d / 4]);
+							break;
+						case R300_FPI0_ARGC_SRC0C_ZZZ:
+							sprintf(buf, "%s.zzz", srcc[d / 4]);
+							break;
+					}
+				} else if (d < 15) {
+					sprintf(buf, "%s.www", srca[d-12]);
+				} else if (d == 20) {
+					sprintf(buf, "0.0");
+				} else if (d == 21) {
+					sprintf(buf, "1.0");
+				} else if (d == 22) {
+					sprintf(buf, "0.5");
+				} else if (d >= 23 && d < 32) {
+					d -= 23;
+					switch(d/3) {
+						case 0:
+							sprintf(buf, "%s.yzx", srcc[d % 3]);
+							break;
+						case 1:
+							sprintf(buf, "%s.zxy", srcc[d % 3]);
+							break;
+						case 2:
+							sprintf(buf, "%s.Wzy", srcc[d % 3]);
+							break;
+					}
+				} else {
+					sprintf(buf, "%i", d);
+				}
+				
+				sprintf(argc[j], "%s%s%s%s",
+						(regc & 32) ? "-" : "",
+						(regc & 64) ? "|" : "",
+						buf,
+						(regc & 64) ? "|" : "");
+			
+				d = rega & 31;
+				if (d < 9) {
+					sprintf(buf, "%s.%c", srcc[d / 3], 'x' + (char)(d%3));
+				} else if (d < 12) {
+					sprintf(buf, "%s.w", srca[d-9]);
+				} else if (d == 16) {
+					sprintf(buf, "0.0");
+				} else if (d == 17) {
+					sprintf(buf, "1.0");
+				} else if (d == 18) {
+					sprintf(buf, "0.5");
+				} else {
+					sprintf(buf, "%i", d);
+				}
+				
+				sprintf(arga[j], "%s%s%s%s",
+						(rega & 32) ? "-" : "",
+						(rega & 64) ? "|" : "",
+						buf,
+						(rega & 64) ? "|" : "");
+			}
+			
+			fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
+			                "       w: %8s %8s %8s    op: %08x\n",
+					argc[0], argc[1], argc[2], rp->alu.inst[i].inst0,
+					arga[0], arga[1], arga[2], rp->alu.inst[i].inst2);
+		}
 	}
-	
-	fprintf(stderr, "%08x\n",
-		((rp->tex_end << 16) | (R300_PFS_TEXI_0 >> 2)));
-	for (i=0;i<=rp->tex_end;i++)
-		fprintf(stderr, "%08x\n", rp->tex.inst[i]);
-
-	/* dump program in pretty_print_command_stream.tcl-readable format */
-	fprintf(stderr, "%08x\n",
-		((rp->alu_end << 16) | (R300_PFS_INSTR0_0 >> 2)));
-	for (i=0;i<=rp->alu_end;i++)
-		fprintf(stderr, "%08x\n", rp->alu.inst[i].inst0);
-
-	fprintf(stderr, "%08x\n",
-		((rp->alu_end << 16) | (R300_PFS_INSTR1_0 >> 2)));
-	for (i=0;i<=rp->alu_end;i++)
-		fprintf(stderr, "%08x\n", rp->alu.inst[i].inst1);
-
-	fprintf(stderr, "%08x\n",
-		((rp->alu_end << 16) | (R300_PFS_INSTR2_0 >> 2)));
-	for (i=0;i<=rp->alu_end;i++)
-		fprintf(stderr, "%08x\n", rp->alu.inst[i].inst2);
-
-	fprintf(stderr, "%08x\n",
-		((rp->alu_end << 16) | (R300_PFS_INSTR3_0 >> 2)));
-	for (i=0;i<=rp->alu_end;i++)
-		fprintf(stderr, "%08x\n", rp->alu.inst[i].inst3);
-
-	fprintf(stderr, "00000000\n");
 }
author	Nicolai Haehnle <prefect@upb.de>	2007-03-18 02:15:56 +0100
committer	Nicolai Haehnle <nhaehnle@gmail.com>	2007-03-19 18:38:07 +0100
commit	7b430acd71f04dce3e21bdcfe70115a23d751f30 (patch)
tree	a50e3628283ba79336b6eb74ca9da061f8776917 /src/mesa/drivers/dri/r300/r300_fragprog.c
parent	07db8c9115c0b07d79be778976e25f8eb18d42a2 (diff)