From 9b42100c04f14b4f2c1e5fe9748bb0519ed6c516 Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Mon, 19 Mar 2007 17:23:44 +0100
Subject: i915tex: Fix triple buffering after recent Mesa core changes.

Remove superfluous _mesa_resize_framebuffer call which is now harmful because
it causes the third renderbuffer to have width/height 0, so Mesa refuses to
render to it.

In the long term, it would be nice to remove the hack in
intel_alloc_window_storage in favour of a proper Mesa interface for flipping
between more than two colour buffers.
---
 src/mesa/drivers/dri/i915tex/intel_buffers.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/i915tex/intel_buffers.c b/src/mesa/drivers/dri/i915tex/intel_buffers.c
index 62ff54b007..c0b4f438be 100644
--- a/src/mesa/drivers/dri/i915tex/intel_buffers.c
+++ b/src/mesa/drivers/dri/i915tex/intel_buffers.c
@@ -349,6 +349,28 @@ intelWindowMoved(struct intel_context *intel)
 
    /* Update Mesa's notion of window size */
    driUpdateFramebufferSize(ctx, dPriv);
+
+   /* Update size of third renderbuffer */
+   if (intel_fb->pf_num_pages == 3) {
+      struct gl_renderbuffer *rb = &intel_fb->color_rb[(intel_fb->pf_current_page
+						        + 2) % 3]->Base;
+
+      /* only resize if size is changing */
+         if (rb->Width != intel_fb->Base.Width ||
+	     rb->Height != intel_fb->Base.Height) {
+            /* could just as well pass rb->_ActualFormat here */
+            if (rb->AllocStorage(ctx, rb, rb->InternalFormat,
+				 intel_fb->Base.Width, intel_fb->Base.Height)) {
+               ASSERT(rb->Width == intel_fb->Base.Width);
+               ASSERT(rb->Height == intel_fb->Base.Height);
+            }
+            else {
+               _mesa_error(ctx, GL_OUT_OF_MEMORY, "Resizing framebuffer");
+               /* no return */
+            }
+         }
+   }
+
    intel_fb->Base.Initialized = GL_TRUE; /* XXX remove someday */
 
    /* Update hardware scissor */
-- 
cgit v1.2.3


From 07db8c9115c0b07d79be778976e25f8eb18d42a2 Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Mon, 19 Mar 2007 18:34:27 +0100
Subject: i915tex: The intended triple buffering fix.

Making modifications while the editor spawned by git-commit was suspended
didn't have the intended effect.
---
 src/mesa/drivers/dri/i915tex/intel_buffers.c | 22 ----------------------
 src/mesa/drivers/dri/i915tex/intel_context.c |  6 +-----
 2 files changed, 1 insertion(+), 27 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/i915tex/intel_buffers.c b/src/mesa/drivers/dri/i915tex/intel_buffers.c
index c0b4f438be..62ff54b007 100644
--- a/src/mesa/drivers/dri/i915tex/intel_buffers.c
+++ b/src/mesa/drivers/dri/i915tex/intel_buffers.c
@@ -349,28 +349,6 @@ intelWindowMoved(struct intel_context *intel)
 
    /* Update Mesa's notion of window size */
    driUpdateFramebufferSize(ctx, dPriv);
-
-   /* Update size of third renderbuffer */
-   if (intel_fb->pf_num_pages == 3) {
-      struct gl_renderbuffer *rb = &intel_fb->color_rb[(intel_fb->pf_current_page
-						        + 2) % 3]->Base;
-
-      /* only resize if size is changing */
-         if (rb->Width != intel_fb->Base.Width ||
-	     rb->Height != intel_fb->Base.Height) {
-            /* could just as well pass rb->_ActualFormat here */
-            if (rb->AllocStorage(ctx, rb, rb->InternalFormat,
-				 intel_fb->Base.Width, intel_fb->Base.Height)) {
-               ASSERT(rb->Width == intel_fb->Base.Width);
-               ASSERT(rb->Height == intel_fb->Base.Height);
-            }
-            else {
-               _mesa_error(ctx, GL_OUT_OF_MEMORY, "Resizing framebuffer");
-               /* no return */
-            }
-         }
-   }
-
    intel_fb->Base.Initialized = GL_TRUE; /* XXX remove someday */
 
    /* Update hardware scissor */
diff --git a/src/mesa/drivers/dri/i915tex/intel_context.c b/src/mesa/drivers/dri/i915tex/intel_context.c
index 5c2cdf0c7d..acda7b1c16 100644
--- a/src/mesa/drivers/dri/i915tex/intel_context.c
+++ b/src/mesa/drivers/dri/i915tex/intel_context.c
@@ -581,11 +581,7 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
       }
 
       /* set GLframebuffer size to match window, if needed */
-      if (intel_fb->Base.Width != driDrawPriv->w) {
-         _mesa_resize_framebuffer(&intel->ctx, &intel_fb->Base,
-                                  driDrawPriv->w, driDrawPriv->h);
-      }         
-      if (readFb->Width != driReadPriv->w) {
+      if (driReadPriv != driDrawPriv && readFb->Width != driReadPriv->w) {
          _mesa_resize_framebuffer(&intel->ctx, readFb,
                                   driReadPriv->w, driReadPriv->h);
       }         
-- 
cgit v1.2.3


From 7b430acd71f04dce3e21bdcfe70115a23d751f30 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <prefect@upb.de>
Date: Sun, 18 Mar 2007 02:15:56 +0100
Subject: r300: Fix fragment program instruction pairing and register
 allocation

There were a number of bugs related to the pairing of vector and scalar
operations where swizzles ended up using the wrong source register,
or an instruction was moved forward and ended up overwriting an aliased
register.

The new algorithm for register allocation is quite conservative and may
run out of registers before necessary. On the plus side, It Just Works.

Pairing is done whenever possible, and in more cases than before, so
in practice this change should be a net win.
---
 src/mesa/drivers/dri/r300/r300_context.h  |  94 +++-
 src/mesa/drivers/dri/r300/r300_fragprog.c | 774 ++++++++++++++++++++----------
 src/mesa/drivers/dri/r300/r300_reg.h      |   4 +-
 3 files changed, 582 insertions(+), 290 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index bd9ed6f170..bc43953ff3 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -647,38 +647,84 @@ struct r300_vertex_program_cont {
 #define PFS_NUM_TEMP_REGS	32
 #define PFS_NUM_CONST_REGS	16
 
-/* Tracking data for Mesa registers */
+/* Mapping Mesa registers to R300 temporaries */
 struct reg_acc {
        int reg;        /* Assigned hw temp */
        unsigned int refcount; /* Number of uses by mesa program */
 };
 
+/**
+ * Describe the current lifetime information for an R300 temporary
+ */
+struct reg_lifetime {
+	/* Index of the first slot where this register is free in the sense
+	   that it can be used as a new destination register.
+	   This is -1 if the register has been assigned to a Mesa register
+	   and the last access to the register has not yet been emitted */
+	int free;
+	
+	/* Index of the first slot where this register is currently reserved.
+	   This is used to stop e.g. a scalar operation from being moved
+	   before the allocation time of a register that was first allocated
+	   for a vector operation. */
+	int reserved;
+	
+	/* Index of the first slot in which the register can be used as a
+	   source without losing the value that is written by the last
+	   emitted instruction that writes to the register */
+	int vector_valid;
+	int scalar_valid;
+};
+
+
+/**
+ * Store usage information about an ALU instruction slot during the
+ * compilation of a fragment program.
+ */
+#define SLOT_SRC_VECTOR  (1<<0)
+#define SLOT_SRC_SCALAR  (1<<3)
+#define SLOT_SRC_BOTH    (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
+#define SLOT_OP_VECTOR   (1<<16)
+#define SLOT_OP_SCALAR   (1<<17)
+#define SLOT_OP_BOTH     (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
+
+struct r300_pfs_compile_slot {
+	/* Bitmask indicating which parts of the slot are used, using SLOT_ constants 
+	   defined above */
+	unsigned int used;
+
+	/* Selected sources */
+	int vsrc[3];
+	int ssrc[3];
+};
+
+/**
+ * Store information during compilation of fragment programs.
+ */
 struct r300_pfs_compile_state {
-       int v_pos, s_pos;       /* highest ALU slots used */
-
-       /* Track some information gathered during opcode
-        * construction.
-        * 
-        * NOTE: Data is only set by the code, and isn't used yet.
-        */
-       struct {
-               int vsrc[3];
-               int ssrc[3];
-               int umask;
-       } slot[PFS_MAX_ALU_INST];
-
-       /* Used to map Mesa's inputs/temps onto hardware temps */
-       int temp_in_use;
-       struct reg_acc temps[PFS_NUM_TEMP_REGS];
-       struct reg_acc inputs[32]; /* don't actually need 32... */
-
-       /* Track usage of hardware temps, for register allocation,
-        * indirection detection, etc. */
-       int hwreg_in_use;
-       GLuint used_in_node;
-       GLuint dest_in_node;
+	int nrslots;       /* number of ALU slots used so far */
+	
+	/* Track which (parts of) slots are already filled with instructions */
+	struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
+	
+	/* Track the validity of R300 temporaries */
+	struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
+	
+	/* Used to map Mesa's inputs/temps onto hardware temps */
+	int temp_in_use;
+	struct reg_acc temps[PFS_NUM_TEMP_REGS];
+	struct reg_acc inputs[32]; /* don't actually need 32... */
+	
+	/* Track usage of hardware temps, for register allocation,
+	 * indirection detection, etc. */
+	GLuint used_in_node;
+	GLuint dest_in_node;
 };
 
+/**
+ * Store everything about a fragment program that is needed
+ * to render with that program.
+ */
 struct r300_fragment_program {
 	struct gl_fragment_program mesa_program;
 
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 251fd26082..b2c89ccb36 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -94,8 +94,9 @@
 #define REG_NEGV_SHIFT		18
 #define REG_NEGS_SHIFT		19
 #define REG_ABS_SHIFT		20
-#define REG_NO_USE_SHIFT	21
-#define REG_VALID_SHIFT		22
+#define REG_NO_USE_SHIFT	21 // Hack for refcounting
+#define REG_VALID_SHIFT		22 // Does the register contain a defined value?
+#define REG_BUILTIN_SHIFT   23 // Is it a builtin (like all zero/all one)?
 
 #define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
 #define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
@@ -106,12 +107,14 @@
 #define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
 #define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
 #define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
+#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)
 
-#define REG(type, index, vswz, sswz, nouse, valid)			\
+#define REG(type, index, vswz, sswz, nouse, valid, builtin)	\
 	(((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
 	 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
 	 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
 	 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
+	 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
 	 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
 	 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 #define REG_GET_TYPE(reg)						\
@@ -126,6 +129,8 @@
 	((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
 #define REG_GET_VALID(reg)						\
 	((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
+#define REG_GET_BUILTIN(reg)						\
+	((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
 #define REG_SET_TYPE(reg, type)						\
 	reg = ((reg & ~REG_TYPE_MASK) |					\
 	       ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
@@ -144,6 +149,9 @@
 #define REG_SET_VALID(reg, valid)					\
 	reg = ((reg & ~REG_VALID_MASK) |				\
 	       ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
+#define REG_SET_BUILTIN(reg, builtin)					\
+	reg = ((reg & ~REG_BUILTIN_MASK) |				\
+	       ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
 #define REG_ABS(reg)							\
 	reg = (reg | REG_ABS_MASK)
 #define REG_NEGV(reg)							\
@@ -184,9 +192,6 @@ static const struct {
  *
  * REG_VSWZ/REG_SSWZ is an index into this table
  */
-#define SLOT_VECTOR	(1<<0)
-#define SLOT_SCALAR	(1<<3)
-#define SLOT_BOTH	(SLOT_VECTOR | SLOT_SCALAR)
 
 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
 #define SWIZZLE_HALF 6
@@ -202,14 +207,14 @@ static const struct r300_pfs_swizzle {
 	GLuint flags;
 } v_swiz[] = {
 /* native swizzles */
-	{ MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_VECTOR },
-	{ MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_VECTOR },
-	{ MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_VECTOR },
-	{ MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_VECTOR },
-	{ MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A,     1, SLOT_SCALAR },
-	{ MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_VECTOR },
-	{ MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_VECTOR },
-	{ MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_BOTH },
+	{ MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A,     1, SLOT_SRC_SCALAR },
+	{ MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR },
+	{ MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH },
 	{ MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
 	{ MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
 	{ MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
@@ -241,10 +246,10 @@ static const struct {
 	int stride;	/* difference between SRC0/1/2 */
 	GLuint flags;
 } s_swiz[] = {
-	{ R300_FPI2_ARGA_SRC0C_X, 3, SLOT_VECTOR },
-	{ R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_VECTOR },
-	{ R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_VECTOR },
-	{ R300_FPI2_ARGA_SRC0A  , 1, SLOT_SCALAR },
+	{ R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR },
+	{ R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR },
+	{ R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR },
+	{ R300_FPI2_ARGA_SRC0A  , 1, SLOT_SRC_SCALAR },
 	{ R300_FPI2_ARGA_ZERO   , 0, 0 },
 	{ R300_FPI2_ARGA_ONE    , 0, 0 },
 	{ R300_FPI2_ARGA_HALF   , 0, 0 }
@@ -256,6 +261,7 @@ static const GLuint undef = REG(REG_TYPE_TEMP,
 				SWIZZLE_XYZ,
 				SWIZZLE_W,
 				GL_FALSE,
+				GL_FALSE,
 				GL_FALSE);
 
 /* constant one source */
@@ -264,6 +270,7 @@ static const GLuint pfs_one = REG(REG_TYPE_CONST,
 				  SWIZZLE_111,
 				  SWIZZLE_ONE,
 				  GL_FALSE,
+				  GL_TRUE,
 				  GL_TRUE);
 
 /* constant half source */
@@ -272,6 +279,7 @@ static const GLuint pfs_half = REG(REG_TYPE_CONST,
 				   SWIZZLE_HHH,
 				   SWIZZLE_HALF,
 				   GL_FALSE,
+				   GL_TRUE,
 				   GL_TRUE);
 
 /* constant zero source */
@@ -280,6 +288,7 @@ static const GLuint pfs_zero = REG(REG_TYPE_CONST,
 				   SWIZZLE_000,
 				   SWIZZLE_ZERO,
 				   GL_FALSE,
+				   GL_TRUE,
 				   GL_TRUE);
 
 /*
@@ -291,47 +300,105 @@ static void emit_arith(struct r300_fragment_program *rp, int op,
 				GLuint src0, GLuint src1, GLuint src2,
 				int flags);
 
-/*
- * Helper functions prototypes
+/**
+ * Get an R300 temporary that can be written to in the given slot.
  */
-static int get_hw_temp(struct r300_fragment_program *rp)
+static int get_hw_temp(struct r300_fragment_program *rp, int slot)
 {
 	COMPILE_STATE;
-	int r = ffs(~cs->hwreg_in_use);
-	if (!r) {
+	int r;
+	
+	for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
+		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
+			break;
+	}
+	
+	if (r >= PFS_NUM_TEMP_REGS) {
 		ERROR("Out of hardware temps\n");
 		return 0;
 	}
-
-	cs->hwreg_in_use |= (1 << --r);
+	
+	// Reserved is used to avoid the following scenario:
+	//  R300 temporary X is first assigned to Mesa temporary Y during vector ops
+	//  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
+	//  Then scalar ops on Mesa temporary Z are emitted and move back in time
+	//  to overwrite the value of temporary Y.
+	// End scenario.
+	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
+	cs->hwtemps[r].free = -1;
+	
+	// Reset to some value that won't mess things up when the user
+	// tries to read from a temporary that hasn't been assigned a value yet.
+	// In the normal case, vector_valid and scalar_valid should be set to
+	// a sane value by the first emit that writes to this temporary.
+	cs->hwtemps[r].vector_valid = 0;
+	cs->hwtemps[r].scalar_valid = 0;
+	
 	if (r > rp->max_temp_idx)
 		rp->max_temp_idx = r;
-
+	
 	return r;
 }
 
+/**
+ * Get an R300 temporary that will act as a TEX destination register.
+ */
 static int get_hw_temp_tex(struct r300_fragment_program *rp)
 {
 	COMPILE_STATE;
 	int r;
 
-	r = ffs(~(cs->hwreg_in_use | cs->used_in_node));
-	if (!r)
-		return get_hw_temp(rp); /* Will cause an indirection */
+	for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
+		if (cs->used_in_node & (1 << r))
+			continue;
+		
+		// Note: Be very careful here
+		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
+			break;
+	}
+	
+	if (r >= PFS_NUM_TEMP_REGS)
+		return get_hw_temp(rp, 0); /* Will cause an indirection */
 
-	cs->hwreg_in_use |= (1 << --r);
+	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
+	cs->hwtemps[r].free = -1;
+	
+	// Reset to some value that won't mess things up when the user
+	// tries to read from a temporary that hasn't been assigned a value yet.
+	// In the normal case, vector_valid and scalar_valid should be set to
+	// a sane value by the first emit that writes to this temporary.
+	cs->hwtemps[r].vector_valid = cs->nrslots;
+	cs->hwtemps[r].scalar_valid = cs->nrslots;
+	
 	if (r > rp->max_temp_idx)
 		rp->max_temp_idx = r;
 
 	return r;
 }
 
+/**
+ * Mark the given hardware register as free.
+ */
 static void free_hw_temp(struct r300_fragment_program *rp, int idx)
 {
 	COMPILE_STATE;
-	cs->hwreg_in_use &= ~(1<<idx);
+	
+	// Be very careful here. Consider sequences like
+	//  MAD r0, r1,r2,r3
+	//  TEX r4, ...
+	// The TEX instruction may be moved in front of the MAD instruction
+	// due to the way nodes work. We don't want to alias r1 and r4 in
+	// this case.
+	// I'm certain the register allocation could be further sanitized,
+	// but it's tricky because of stuff that can happen inside emit_tex
+	// and emit_arith.
+	cs->hwtemps[idx].free = cs->nrslots+1;
 }
 
+
+/**
+ * Create a new Mesa temporary register.
+ */
 static GLuint get_temp_reg(struct r300_fragment_program *rp)
 {
 	COMPILE_STATE;
@@ -354,6 +421,10 @@ static GLuint get_temp_reg(struct r300_fragment_program *rp)
 	return r;
 }
 
+/**
+ * Create a new Mesa temporary register that will act as the destination
+ * register for a texture read.
+ */
 static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
 {
 	COMPILE_STATE;
@@ -376,6 +447,9 @@ static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
 	return r;
 }
 
+/**
+ * Free a Mesa temporary and the associated R300 temporary.
+ */
 static void free_temp(struct r300_fragment_program *rp, GLuint r)
 {
 	COMPILE_STATE;
@@ -762,10 +836,10 @@ static int t_hw_src(struct r300_fragment_program *rp,
 	switch(REG_GET_TYPE(src)) {
 	case REG_TYPE_TEMP:
 		/* NOTE: if reg==-1 here, a source is being read that
-		 * 	 hasn't been written to. Undefined results
+		 * 	 hasn't been written to. Undefined results.
 		 */
 		if (cs->temps[index].reg == -1)
-			cs->temps[index].reg = get_hw_temp(rp);
+			cs->temps[index].reg = get_hw_temp(rp, cs->nrslots);
 
 		idx = cs->temps[index].reg;
 
@@ -795,7 +869,8 @@ static int t_hw_src(struct r300_fragment_program *rp,
 
 static int t_hw_dst(struct r300_fragment_program *rp,
 		    GLuint dest,
-		    GLboolean tex)
+		    GLboolean tex,
+		    int slot)
 {
 	COMPILE_STATE;
 	int idx;
@@ -806,7 +881,7 @@ static int t_hw_dst(struct r300_fragment_program *rp,
 	case REG_TYPE_TEMP:
 		if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
 			if (!tex) {
-				cs->temps[index].reg = get_hw_temp(rp);
+				cs->temps[index].reg = get_hw_temp(rp, slot);
 			} else {
 				cs->temps[index].reg = get_hw_temp_tex(rp);
 			}
@@ -839,26 +914,20 @@ static int t_hw_dst(struct r300_fragment_program *rp,
 	return idx;
 }
 
-static void emit_nop(struct r300_fragment_program *rp,
-		     GLuint mask,
-		     GLboolean sync)
+static void emit_nop(struct r300_fragment_program *rp)
 {
 	COMPILE_STATE;
 	
-	if (sync)
-		cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
-
-	if (mask & WRITEMASK_XYZ) {
-		rp->alu.inst[cs->v_pos].inst0 = NOP_INST0;
-		rp->alu.inst[cs->v_pos].inst1 = NOP_INST1;
-		cs->v_pos++;
-	}
-
-	if (mask & WRITEMASK_W) {
-		rp->alu.inst[cs->s_pos].inst2 = NOP_INST2;
-		rp->alu.inst[cs->s_pos].inst3 = NOP_INST3;
-		cs->s_pos++;
+	if (cs->nrslots >= PFS_MAX_ALU_INST) {
+		ERROR("Out of ALU instruction slots\n");
+		return;
 	}
+	
+	rp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
+	rp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
+	rp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
+	rp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
+	cs->nrslots++;
 }
 
 static void emit_tex(struct r300_fragment_program *rp,
@@ -882,7 +951,7 @@ static void emit_tex(struct r300_fragment_program *rp,
 			rdest = dest;
 			dest = get_temp_reg_tex(rp);
 		}
-		hwdest = t_hw_dst(rp, dest, GL_TRUE);
+		hwdest = t_hw_dst(rp, dest, GL_TRUE, rp->node[rp->cur_node].alu_offset);
 		
 		/* Use a temp that hasn't been used in this node, rather
 		 * than causing an indirection
@@ -904,15 +973,11 @@ static void emit_tex(struct r300_fragment_program *rp,
 	     (din & (1<<hwsrc))) || (uin & (1<<hwdest))) {
 			
 		/* Finish off current node */
-		cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
-		if (rp->node[rp->cur_node].alu_offset == cs->v_pos) {
-			/* No alu instructions in the node? Emit a NOP. */
-			emit_nop(rp, WRITEMASK_XYZW, GL_TRUE);
-			cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
-		}
+		if (rp->node[rp->cur_node].alu_offset == cs->nrslots)
+			emit_nop(rp);
 				
 		rp->node[rp->cur_node].alu_end =
-				cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
+				cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
 		assert(rp->node[rp->cur_node].alu_end >= 0);
 
 		if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) {
@@ -922,7 +987,7 @@ static void emit_tex(struct r300_fragment_program *rp,
 
 		/* Start new node */
 		rp->node[rp->cur_node].tex_offset = rp->tex.length;
-		rp->node[rp->cur_node].alu_offset = cs->v_pos;
+		rp->node[rp->cur_node].alu_offset = cs->nrslots;
 		rp->node[rp->cur_node].tex_end = -1;
 		rp->node[rp->cur_node].alu_end = -1;	
 		rp->node[rp->cur_node].flags = 0;
@@ -954,84 +1019,243 @@ static void emit_tex(struct r300_fragment_program *rp,
 	}
 }
 
-/* Add sources to FPI1/FPI3 lists.  If source is already on list,
- * reuse the index instead of wasting a source.
+
+/**
+ * Returns the first slot where we could possibly allow writing to dest,
+ * according to register allocation.
  */
-static int add_src(struct r300_fragment_program *rp,
-		   int reg,
-		   int pos,
-		   int srcmask)
+static int get_earliest_allowed_write(
+		struct r300_fragment_program* rp,
+		GLuint dest)
 {
 	COMPILE_STATE;
-	int csm, i;
-	
-	/* Look for matches */
-	for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {	
-		/* If sources have been allocated in this position(s)... */
-		if ((cs->slot[pos].umask & csm) == csm) {
-			/* ... and the register number(s) match, re-use the
-			   source */
-			if (srcmask == SLOT_VECTOR &&
-			    cs->slot[pos].vsrc[i] == reg)
-				return i;
-			if (srcmask == SLOT_SCALAR &&
-			    cs->slot[pos].ssrc[i] == reg)
-				return i;
-			if (srcmask == SLOT_BOTH &&
-			    cs->slot[pos].vsrc[i] == reg &&
-			    cs->slot[pos].ssrc[i] == reg)
-				return i;
-		}
-	}
+	int idx;
+	GLuint index = REG_GET_INDEX(dest);
+	assert(REG_GET_VALID(dest));
 
-	/* Look for free spaces */
-	for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {
-		/* If the position(s) haven't been allocated */
-		if ((cs->slot[pos].umask & csm) == 0) {
-			cs->slot[pos].umask |= csm;
-
-			if (srcmask & SLOT_VECTOR)
-				cs->slot[pos].vsrc[i] = reg;
-			if (srcmask & SLOT_SCALAR)
-				cs->slot[pos].ssrc[i] = reg;
-			return i;
-		}	
+	switch(REG_GET_TYPE(dest)) {
+		case REG_TYPE_TEMP:
+			if (cs->temps[index].reg == -1)
+				return 0;
+			
+			idx = cs->temps[index].reg;
+			break;
+		case REG_TYPE_OUTPUT:
+			return 0;
+		default:
+			ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+			return 0;
 	}
 	
-	//ERROR("Failed to allocate sources in FPI1/FPI3!\n");
-	return 0;
+	return cs->hwtemps[idx].reserved;
 }
 
-/* Determine whether or not to position opcode in the same ALU slot for both
- * vector and scalar portions of an instruction.
+
+/**
+ * Allocates a slot for an ALU instruction that can consist of
+ * a vector part or a scalar part or both.
+ *
+ * Sources from src (src[0] to src[argc-1]) are added to the slot in the
+ * appropriate position (vector and/or scalar), and their positions are
+ * recorded in the srcpos array.
+ *
+ * This function emits instruction code for the source fetch and the
+ * argument selection. It does not emit instruction code for the
+ * opcode or the destination selection.
  *
- * It's not necessary to force the first case, but it makes disassembled
- * shaders easier to read.
+ * @return the index of the slot
  */
-static GLboolean force_same_slot(int vop,
-				 int sop,
-				 GLboolean emit_vop,
-				 GLboolean emit_sop,
-				 int argc,
-				 GLuint *src)
+static int find_and_prepare_slot(struct r300_fragment_program* rp,
+		GLboolean emit_vop,
+		GLboolean emit_sop,
+		int argc,
+		GLuint* src,
+		GLuint dest)
 {
-	int i;
-
-	if (emit_vop && emit_sop)
-		return GL_TRUE;
+	COMPILE_STATE;
+	int hwsrc[3];
+	int srcpos[3];
+	unsigned int used;
+	int tempused;
+	int tempvsrc[3];
+	int tempssrc[3];
+	int pos;
+	int regnr;
+	int i,j;
+	
+	// Determine instruction slots, whether sources are required on
+	// vector or scalar side, and the smallest slot number where
+	// all source registers are available
+	used = 0;
+	if (emit_vop)
+		used |= SLOT_OP_VECTOR;
+	if (emit_sop)
+		used |= SLOT_OP_SCALAR;
+	
+	pos = get_earliest_allowed_write(rp, dest);
+	
+	if (rp->node[rp->cur_node].alu_offset > pos)
+		pos = rp->node[rp->cur_node].alu_offset;
+	for(i = 0; i < argc; ++i) {
+		if (!REG_GET_BUILTIN(src[i])) {
+			if (emit_vop)
+				used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
+			if (emit_sop)
+				used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
+		}
+		
+		hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
+		regnr = hwsrc[i] & 31;
+		
+		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+			if (used & (SLOT_SRC_VECTOR << i)) {
+				if (cs->hwtemps[regnr].vector_valid > pos)
+					pos = cs->hwtemps[regnr].vector_valid;
+			}
+			if (used & (SLOT_SRC_SCALAR << i)) {
+				if (cs->hwtemps[regnr].scalar_valid > pos)
+					pos = cs->hwtemps[regnr].scalar_valid;
+			}
+		}
+	}
+	
+	// Find a slot that fits
+	for(; ; ++pos) {
+		if (cs->slot[pos].used & used & SLOT_OP_BOTH)
+			continue;
+		
+		if (pos >= cs->nrslots) {
+			if (cs->nrslots >= PFS_MAX_ALU_INST) {
+				ERROR("Out of ALU instruction slots\n");
+				return -1;
+			}
 
-	if (emit_vop && vop == R300_FPI0_OUTC_REPL_ALPHA)
-		return GL_TRUE;
+			rp->alu.inst[pos].inst0 = NOP_INST0;
+			rp->alu.inst[pos].inst2 = NOP_INST2;
 
+			cs->nrslots++;
+		}
+		
+		// Note: When we need both parts (vector and scalar) of a source,
+		// we always try to put them into the same position. This makes the
+		// code easier to read, and it is optimal (i.e. one doesn't gain
+		// anything by splitting the parts).
+		// It also avoids headaches with swizzles that access both parts (e.g. WZY)
+		tempused = cs->slot[pos].used;
+		for(i = 0; i < 3; ++i) {
+			tempvsrc[i] = cs->slot[pos].vsrc[i];
+			tempssrc[i] = cs->slot[pos].ssrc[i];
+		}
+		
+		for(i = 0; i < argc; ++i) {
+			int flags = (used >> i) & SLOT_SRC_BOTH;
+			
+			if (!flags) {
+				srcpos[i] = 0;
+				continue;
+			}
+			
+			for(j = 0; j < 3; ++j) {
+				if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
+					if (tempvsrc[j] != hwsrc[i])
+						continue;
+				}
+			
+				if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
+					if (tempssrc[j] != hwsrc[i])
+						continue;
+				}
+				
+				break;
+			}
+			
+			if (j == 3)
+				break;
+			
+			srcpos[i] = j;
+			tempused |= flags << j;
+			if (flags & SLOT_SRC_VECTOR)
+				tempvsrc[j] = hwsrc[i];
+			if (flags & SLOT_SRC_SCALAR)
+				tempssrc[j] = hwsrc[i];
+		}
+		
+		if (i == argc)
+			break;
+	}
+	
+	// Found a slot, reserve it
+	cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
+	for(i = 0; i < 3; ++i) {
+		cs->slot[pos].vsrc[i] = tempvsrc[i];
+		cs->slot[pos].ssrc[i] = tempssrc[i];
+	}
+	
+	// Emit the source fetch code
+	rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
+	rp->alu.inst[pos].inst1 |=
+			((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
+			 (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
+			 (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
+	
+	rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
+	rp->alu.inst[pos].inst3 |=
+			((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
+			 (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
+			 (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
+	
+	// Emit the argument selection code
 	if (emit_vop) {
-		for (i=0;i<argc;i++)
-			if (REG_GET_VSWZ(src[i]) == SWIZZLE_WZY)
-				return GL_TRUE;
+		int swz[3];
+		
+		for(i = 0; i < 3; ++i) {
+			if (i < argc) {
+				swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
+				            (srcpos[i] * v_swiz[REG_GET_VSWZ(src[i])].stride)) |
+					((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
+					((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
+			} else {
+				swz[i] = R300_FPI0_ARGC_ZERO;
+			}
+		}
+		
+		rp->alu.inst[pos].inst0 &=
+				~(R300_FPI0_ARG0C_MASK|R300_FPI0_ARG1C_MASK|R300_FPI0_ARG2C_MASK);
+		rp->alu.inst[pos].inst0 |=
+				(swz[0] << R300_FPI0_ARG0C_SHIFT) |
+				(swz[1] << R300_FPI0_ARG1C_SHIFT) |
+				(swz[2] << R300_FPI0_ARG2C_SHIFT);
+	}
+	
+	if (emit_sop) {
+		int swz[3];
+		
+		for(i = 0; i < 3; ++i) {
+			if (i < argc) {
+				swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
+						(srcpos[i] * s_swiz[REG_GET_SSWZ(src[i])].stride)) |
+						((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
+						((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
+			} else {
+				swz[i] = R300_FPI2_ARGA_ZERO;
+			}
+		}
+		
+		rp->alu.inst[pos].inst2 &=
+				~(R300_FPI2_ARG0A_MASK|R300_FPI2_ARG1A_MASK|R300_FPI2_ARG2A_MASK);
+		rp->alu.inst[pos].inst2 |=
+				(swz[0] << R300_FPI2_ARG0A_SHIFT) |
+				(swz[1] << R300_FPI2_ARG1A_SHIFT) |
+				(swz[2] << R300_FPI2_ARG2A_SHIFT);
 	}
 
-	return GL_FALSE;
+	return pos;
 }
 
+
+/**
+ * Append an ALU instruction to the instruction list.
+ */
 static void emit_arith(struct r300_fragment_program *rp,
 		       int op,
 		       GLuint dest,
@@ -1043,87 +1267,31 @@ static void emit_arith(struct r300_fragment_program *rp,
 {
 	COMPILE_STATE;
 	GLuint src[3] = { src0, src1, src2 };
-	int hwsrc[3], sswz[3], vswz[3];
 	int hwdest;
-	GLboolean emit_vop = GL_FALSE, emit_sop = GL_FALSE;
+	GLboolean emit_vop, emit_sop;
 	int vop, sop, argc;
-	int vpos, spos;
-	int i;
+	int pos;
 
 	vop = r300_fpop[op].v_op;
 	sop = r300_fpop[op].s_op;
 	argc = r300_fpop[op].argc;
 
+	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
+	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR)
+		mask &= ~WRITEMASK_XYZ;
+	
+	emit_vop = GL_FALSE;
+	emit_sop = GL_FALSE;
 	if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
 		emit_vop = GL_TRUE;
 	if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
 		emit_sop = GL_TRUE;
 
-	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
-	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR)
-		emit_vop = GL_FALSE;
-					
-	if (force_same_slot(vop, sop, emit_vop, emit_sop, argc, src)) {
-		vpos = spos = MAX2(cs->v_pos, cs->s_pos);
-	} else {
-		vpos = cs->v_pos;
-		spos = cs->s_pos;
-		/* Here is where we'd decide on where a safe place is to
-		 * combine this instruction with a previous one.
-		 *
-		 * This is extremely simple for now.. if a source depends
-		 * on the opposite stream, force the same instruction.
-		 */
-		for (i=0;i<3;i++) {
-			if (emit_vop &&
-			    (v_swiz[REG_GET_VSWZ(src[i])].flags & SLOT_SCALAR)) {
-				vpos = spos = MAX2(vpos, spos);
-				break;
-			}
-			if (emit_sop &&
-			    (s_swiz[REG_GET_SSWZ(src[i])].flags & SLOT_VECTOR)) {
-				vpos = spos = MAX2(vpos, spos);
-				break;
-			}
-		}
-	}
+	pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest);
+	if (pos < 0)
+		return;
 	
-	/* - Convert src->hwsrc, record for FPI1/FPI3
-	 * - Determine ARG parts of FPI0/FPI2, unused args are filled
-	 *   with ARG_ZERO.
-	 */	
-	for (i=0;i<3;i++) {
-		int srcpos;
-		
-		if (i >= argc) {
-			vswz[i] = R300_FPI0_ARGC_ZERO;
-			sswz[i] = R300_FPI2_ARGA_ZERO;
-			continue;
-		}
-		
-		hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE);	
-
-		if (emit_vop && vop != R300_FPI0_OUTC_REPL_ALPHA) {
-			srcpos = add_src(rp, hwsrc[i], vpos,
-					 v_swiz[REG_GET_VSWZ(src[i])].flags);
-			vswz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
-				   (srcpos *
-				    v_swiz[REG_GET_VSWZ(src[i])].stride)) |
-				((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
-				((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
-		} else vswz[i] = R300_FPI0_ARGC_ZERO;
-		
-		if (emit_sop) {
-			srcpos = add_src(rp, hwsrc[i], spos,
-					 s_swiz[REG_GET_SSWZ(src[i])].flags);
-			sswz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
-				   (srcpos *
-				    s_swiz[REG_GET_SSWZ(src[i])].stride)) |
-				((src[i] & REG_NEGS_MASK) ? ARG_NEG : 0) |
-				((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
-		} else sswz[i] = R300_FPI2_ARGA_ZERO;
-	}
-	hwdest = t_hw_dst(rp, dest, GL_FALSE);
+	hwdest = t_hw_dst(rp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
 	
 	if (flags & PFS_FLAG_SAT) {
 		vop |= R300_FPI0_OUTC_SAT;
@@ -1131,58 +1299,45 @@ static void emit_arith(struct r300_fragment_program *rp,
 	}
 
 	/* Throw the pieces together and get FPI0/1 */
-	rp->alu.inst[vpos].inst1 =
-			((cs->slot[vpos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
-			 (cs->slot[vpos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
-			 (cs->slot[vpos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
 	if (emit_vop) {
-		rp->alu.inst[vpos].inst0 = vop |
-				(vswz[0] << R300_FPI0_ARG0C_SHIFT) |
-				(vswz[1] << R300_FPI0_ARG1C_SHIFT) |
-				(vswz[2] << R300_FPI0_ARG2C_SHIFT);
+		rp->alu.inst[pos].inst0 |= vop;
 
-		rp->alu.inst[vpos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
+		rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
+		
 		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 			if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-				rp->alu.inst[vpos].inst1 |=
+				rp->alu.inst[pos].inst1 |=
 					(mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
 			} else assert(0);
 		} else {
-			rp->alu.inst[vpos].inst1 |=
+			rp->alu.inst[pos].inst1 |=
 					(mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT;
+			
+			cs->hwtemps[hwdest].vector_valid = pos+1;
 		}
-		cs->v_pos = vpos+1;
-	} else if (spos >= vpos)
-		rp->alu.inst[spos].inst0 = NOP_INST0;
+	}
 
 	/* And now FPI2/3 */
-	rp->alu.inst[spos].inst3 =
-			((cs->slot[spos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
-			 (cs->slot[spos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
-			 (cs->slot[spos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
 	if (emit_sop) {
-		rp->alu.inst[spos].inst2 = sop |
-				sswz[0] << R300_FPI2_ARG0A_SHIFT |
-				sswz[1] << R300_FPI2_ARG1A_SHIFT |
-				sswz[2] << R300_FPI2_ARG2A_SHIFT;
+		rp->alu.inst[pos].inst2 |= sop;
 
 		if (mask & WRITEMASK_W) {
 			if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 				if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-					rp->alu.inst[spos].inst3 |= 
+					rp->alu.inst[pos].inst3 |= 
 							(hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT;
 				} else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
-					rp->alu.inst[spos].inst3 |= R300_FPI3_DSTA_DEPTH;
+					rp->alu.inst[pos].inst3 |= R300_FPI3_DSTA_DEPTH;
 				} else assert(0);
 			} else {
-				rp->alu.inst[spos].inst3 |=
+				rp->alu.inst[pos].inst3 |=
 						(hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG;
+			
+				cs->hwtemps[hwdest].scalar_valid = pos+1;
 			}
 		}
-		cs->s_pos = spos+1;
-	} else if (vpos >= spos)
-		rp->alu.inst[vpos].inst2 = NOP_INST2;
-
+	}
+	
 	return;
 }
 
@@ -1922,7 +2077,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 	for (i=0;i<rp->ctx->Const.MaxTextureUnits;i++) {
 		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
 			cs->inputs[FRAG_ATTRIB_TEX0+i].refcount = 0;
-			cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp);
+			cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp, 0);
 		}
 	}
 	InputsRead &= ~FRAG_BITS_TEX_ANY;
@@ -1930,7 +2085,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 	/* fragment position treated as a texcoord */
 	if (InputsRead & FRAG_BIT_WPOS) {
 		cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp);
+		cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp, 0);
 		insert_wpos(&mp->Base);
 	}
 	InputsRead &= ~FRAG_BIT_WPOS;
@@ -1938,14 +2093,14 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 	/* Then primary colour */
 	if (InputsRead & FRAG_BIT_COL0) {
 		cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp);
+		cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0);
 	}
 	InputsRead &= ~FRAG_BIT_COL0;
 	
 	/* Secondary color */
 	if (InputsRead & FRAG_BIT_COL1) {
 		cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp);
+		cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp, 0);
 	}
 	InputsRead &= ~FRAG_BIT_COL1;
 
@@ -2030,13 +2185,12 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
 		}
 		
 		/* Finish off */
-		cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
 		rp->node[rp->cur_node].alu_end =
-				cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
+				cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
 		if (rp->node[rp->cur_node].tex_end < 0)
 			rp->node[rp->cur_node].tex_end = 0;
 		rp->alu_offset = 0;
-		rp->alu_end    = cs->v_pos - 1;
+		rp->alu_end    = cs->nrslots - 1;
 		rp->tex_offset = 0;
 		rp->tex_end    = rp->tex.length ? rp->tex.length - 1 : 0;
 		assert(rp->node[rp->cur_node].alu_end >= 0);
@@ -2053,7 +2207,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
 /* just some random things... */
 static void dump_program(struct r300_fragment_program *rp)
 {
-	int i;
+	int n, i, j;
 	static int pc = 0;
 
 	fprintf(stderr, "pc=%d*************************************\n", pc++);
@@ -2066,46 +2220,136 @@ static void dump_program(struct r300_fragment_program *rp)
 	fprintf(stderr, "Hardware program\n");
 	fprintf(stderr, "----------------\n");
 	
-	fprintf(stderr, "tex:\n");
-	
-	for(i=0;i<rp->tex.length;i++) {
-		fprintf(stderr, "%08x\n", rp->tex.inst[i]);
-	}
-	
-	for (i=0;i<(rp->cur_node+1);i++) {
+	for (n = 0; n < (rp->cur_node+1); n++) {
 		fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\
-			"alu_end: %d, tex_end: %d\n", i,
-			rp->node[i].alu_offset,
-			rp->node[i].tex_offset,
-			rp->node[i].alu_end,
-			rp->node[i].tex_end);
+			"alu_end: %d, tex_end: %d\n", n,
+			rp->node[n].alu_offset,
+			rp->node[n].tex_offset,
+			rp->node[n].alu_end,
+			rp->node[n].tex_end);
+		
+		if (rp->tex.length) {
+			fprintf(stderr, "  TEX:\n");
+			for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_end; ++i)
+				fprintf(stderr, "    %08x\n", rp->tex.inst[i]);
+		}
+		
+		for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_end; ++i) {
+			char srcc[3][10], dstc[20];
+			char srca[3][10], dsta[20];
+			char argc[3][20];
+			char arga[3][20];
+			
+			for(j = 0; j < 3; ++j) {
+				int regc = rp->alu.inst[i].inst1 >> (j*6);
+				int rega = rp->alu.inst[i].inst3 >> (j*6);
+				
+				sprintf(srcc[j], "%c%i", (regc & 32) ? 'c' : 't', regc & 31);
+				sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31);
+			}
+			
+			sprintf(dstc, "t%i.%c%c%c o%i.%c%c%c",
+					(rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? 'x' : ' ',
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? 'y' : ' ',
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? 'z' : ' ',
+					(rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? 'x' : ' ',
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? 'y' : ' ',
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? 'z' : ' ');
+			
+			sprintf(dsta, "t%i.%c o%i.%c %c",
+					(rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31,
+					(rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) ? 'w' : ' ',
+					(rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31,
+					(rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) ? 'w' : ' ',
+					(rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) ? 'Z' : ' ');
+			
+			fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
+			                "       w: %3s %3s %3s -> %-20s (%08x)\n",
+					i,
+					srcc[0], srcc[1], srcc[2], dstc, rp->alu.inst[i].inst1,
+					srca[0], srca[1], srca[2], dsta, rp->alu.inst[i].inst3);
+			
+			for(j = 0; j < 3; ++j) {
+				int regc = rp->alu.inst[i].inst0 >> (j*7);
+				int rega = rp->alu.inst[i].inst2 >> (j*7);
+				int d;
+				char buf[20];
+
+				d = regc & 31;
+				if (d < 12) {
+					switch(d % 4) {
+						case R300_FPI0_ARGC_SRC0C_XYZ:
+							sprintf(buf, "%s.xyz", srcc[d / 4]);
+							break;
+						case R300_FPI0_ARGC_SRC0C_XXX:
+							sprintf(buf, "%s.xxx", srcc[d / 4]);
+							break;
+						case R300_FPI0_ARGC_SRC0C_YYY:
+							sprintf(buf, "%s.yyy", srcc[d / 4]);
+							break;
+						case R300_FPI0_ARGC_SRC0C_ZZZ:
+							sprintf(buf, "%s.zzz", srcc[d / 4]);
+							break;
+					}
+				} else if (d < 15) {
+					sprintf(buf, "%s.www", srca[d-12]);
+				} else if (d == 20) {
+					sprintf(buf, "0.0");
+				} else if (d == 21) {
+					sprintf(buf, "1.0");
+				} else if (d == 22) {
+					sprintf(buf, "0.5");
+				} else if (d >= 23 && d < 32) {
+					d -= 23;
+					switch(d/3) {
+						case 0:
+							sprintf(buf, "%s.yzx", srcc[d % 3]);
+							break;
+						case 1:
+							sprintf(buf, "%s.zxy", srcc[d % 3]);
+							break;
+						case 2:
+							sprintf(buf, "%s.Wzy", srcc[d % 3]);
+							break;
+					}
+				} else {
+					sprintf(buf, "%i", d);
+				}
+				
+				sprintf(argc[j], "%s%s%s%s",
+						(regc & 32) ? "-" : "",
+						(regc & 64) ? "|" : "",
+						buf,
+						(regc & 64) ? "|" : "");
+			
+				d = rega & 31;
+				if (d < 9) {
+					sprintf(buf, "%s.%c", srcc[d / 3], 'x' + (char)(d%3));
+				} else if (d < 12) {
+					sprintf(buf, "%s.w", srca[d-9]);
+				} else if (d == 16) {
+					sprintf(buf, "0.0");
+				} else if (d == 17) {
+					sprintf(buf, "1.0");
+				} else if (d == 18) {
+					sprintf(buf, "0.5");
+				} else {
+					sprintf(buf, "%i", d);
+				}
+				
+				sprintf(arga[j], "%s%s%s%s",
+						(rega & 32) ? "-" : "",
+						(rega & 64) ? "|" : "",
+						buf,
+						(rega & 64) ? "|" : "");
+			}
+			
+			fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
+			                "       w: %8s %8s %8s    op: %08x\n",
+					argc[0], argc[1], argc[2], rp->alu.inst[i].inst0,
+					arga[0], arga[1], arga[2], rp->alu.inst[i].inst2);
+		}
 	}
-	
-	fprintf(stderr, "%08x\n",
-		((rp->tex_end << 16) | (R300_PFS_TEXI_0 >> 2)));
-	for (i=0;i<=rp->tex_end;i++)
-		fprintf(stderr, "%08x\n", rp->tex.inst[i]);
-
-	/* dump program in pretty_print_command_stream.tcl-readable format */
-	fprintf(stderr, "%08x\n",
-		((rp->alu_end << 16) | (R300_PFS_INSTR0_0 >> 2)));
-	for (i=0;i<=rp->alu_end;i++)
-		fprintf(stderr, "%08x\n", rp->alu.inst[i].inst0);
-
-	fprintf(stderr, "%08x\n",
-		((rp->alu_end << 16) | (R300_PFS_INSTR1_0 >> 2)));
-	for (i=0;i<=rp->alu_end;i++)
-		fprintf(stderr, "%08x\n", rp->alu.inst[i].inst1);
-
-	fprintf(stderr, "%08x\n",
-		((rp->alu_end << 16) | (R300_PFS_INSTR2_0 >> 2)));
-	for (i=0;i<=rp->alu_end;i++)
-		fprintf(stderr, "%08x\n", rp->alu.inst[i].inst2);
-
-	fprintf(stderr, "%08x\n",
-		((rp->alu_end << 16) | (R300_PFS_INSTR3_0 >> 2)));
-	for (i=0;i<=rp->alu_end;i++)
-		fprintf(stderr, "%08x\n", rp->alu.inst[i].inst3);
-
-	fprintf(stderr, "00000000\n");
 }
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index 3de15752b1..1f4a2d2e64 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -1047,7 +1047,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
  * WRT swizzling. If, for example, you want to load an R component into an
  * Alpha operand, this R component is taken from a *color* source, not from
  * an alpha source. The corresponding register doesn't even have to appear in
- * the alpha sources list. (I hope this alll makes sense to you)
+ * the alpha sources list. (I hope this all makes sense to you)
  *
  * Destination selection
  * The destination register index is in FPI1 (color) and FPI3 (alpha)
@@ -1074,6 +1074,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_FPI1_SRC2C_SHIFT             12
 #       define R300_FPI1_SRC2C_MASK              (31 << 12)
 #       define R300_FPI1_SRC2C_CONST             (1 << 17)
+#       define R300_FPI1_SRC_MASK                0x0003ffff
 #       define R300_FPI1_DSTC_SHIFT              18
 #       define R300_FPI1_DSTC_MASK               (31 << 18)
 #		define R300_FPI1_DSTC_REG_MASK_SHIFT     23
@@ -1095,6 +1096,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_FPI3_SRC2A_SHIFT             12
 #       define R300_FPI3_SRC2A_MASK              (31 << 12)
 #       define R300_FPI3_SRC2A_CONST             (1 << 17)
+#       define R300_FPI3_SRC_MASK                0x0003ffff
 #       define R300_FPI3_DSTA_SHIFT              18
 #       define R300_FPI3_DSTA_MASK               (31 << 18)
 #       define R300_FPI3_DSTA_REG                (1 << 23)
-- 
cgit v1.2.3


From a8e65a010c17444c63859c17786ecb4010bd49c1 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Sun, 18 Mar 2007 12:46:53 +0100
Subject: r300: Fix hw fragment program dump

Dumps of fragment programs were incorrect when the program consisted of multiple
nodes.

Also, improved the formatting a bit.
---
 src/mesa/drivers/dri/r300/r300_fragprog.c | 51 ++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 17 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index b2c89ccb36..c3d902a4aa 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -2230,15 +2230,16 @@ static void dump_program(struct r300_fragment_program *rp)
 		
 		if (rp->tex.length) {
 			fprintf(stderr, "  TEX:\n");
-			for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_end; ++i)
+			for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i)
 				fprintf(stderr, "    %08x\n", rp->tex.inst[i]);
 		}
 		
-		for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_end; ++i) {
+		for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_offset+rp->node[n].alu_end; ++i) {
 			char srcc[3][10], dstc[20];
 			char srca[3][10], dsta[20];
 			char argc[3][20];
 			char arga[3][20];
+			char flags[5], tmp[10];
 			
 			for(j = 0; j < 3; ++j) {
 				int regc = rp->alu.inst[i].inst1 >> (j*6);
@@ -2248,22 +2249,38 @@ static void dump_program(struct r300_fragment_program *rp)
 				sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31);
 			}
 			
-			sprintf(dstc, "t%i.%c%c%c o%i.%c%c%c",
-					(rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
-					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? 'x' : ' ',
-					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? 'y' : ' ',
-					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? 'z' : ' ',
-					(rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
-					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? 'x' : ' ',
-					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? 'y' : ' ',
-					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? 'z' : ' ');
+			dstc[0] = 0;
+			sprintf(flags, "%s%s%s",
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "",
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? "y" : "",
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? "z" : "");
+			if (flags[0] != 0) {
+				sprintf(dstc, "t%i.%s ",
+						(rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+						flags);
+			}
+			sprintf(flags, "%s%s%s",
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? "x" : "",
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? "y" : "",
+					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? "z" : "");
+			if (flags[0] != 0) {
+				sprintf(tmp, "o%i.%s",
+						(rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+						flags);
+				strcat(dstc, tmp);
+			}
 			
-			sprintf(dsta, "t%i.%c o%i.%c %c",
-					(rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31,
-					(rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) ? 'w' : ' ',
-					(rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31,
-					(rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) ? 'w' : ' ',
-					(rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) ? 'Z' : ' ');
+			dsta[0] = 0;
+			if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) {
+				sprintf(dsta, "t%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
+			}
+			if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) {
+				sprintf(tmp, "o%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
+				strcat(dsta, tmp);
+			}
+			if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) {
+				strcat(dsta, "Z");
+			}
 			
 			fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
 			                "       w: %3s %3s %3s -> %-20s (%08x)\n",
-- 
cgit v1.2.3


From ec1a77c86481d7f77542fbecda0e81b74732c90f Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Sun, 18 Mar 2007 13:09:21 +0100
Subject: r300: Fragment program dumps format tex instructions

---
 src/mesa/drivers/dri/r300/r300_fragprog.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index c3d902a4aa..3c54830312 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -2230,8 +2230,34 @@ static void dump_program(struct r300_fragment_program *rp)
 		
 		if (rp->tex.length) {
 			fprintf(stderr, "  TEX:\n");
-			for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i)
-				fprintf(stderr, "    %08x\n", rp->tex.inst[i]);
+			for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i) {
+				const char* instr;
+				
+				switch((rp->tex.inst[i] >> R300_FPITX_OPCODE_SHIFT) & 15) {
+				case R300_FPITX_OP_TEX:
+					instr = "TEX";
+					break;
+				case R300_FPITX_OP_KIL:
+					instr = "KIL";
+					break;
+				case R300_FPITX_OP_TXP:
+					instr = "TXP";
+					break;
+				case R300_FPITX_OP_TXB:
+					instr = "TXB";
+					break;
+				default:
+					instr = "UNKNOWN";
+				}
+				
+				fprintf(stderr, "    %s t%i, %c%i, texture[%i]   (%08x)\n",
+						instr,
+						(rp->tex.inst[i] >> R300_FPITX_DST_SHIFT) & 31,
+						(rp->tex.inst[i] & R300_FPITX_SRC_CONST) ? 'c': 't',
+						(rp->tex.inst[i] >> R300_FPITX_SRC_SHIFT) & 31,
+						(rp->tex.inst[i] & R300_FPITX_IMAGE_MASK) >> R300_FPITX_IMAGE_SHIFT,
+						rp->tex.inst[i]);
+			}
 		}
 		
 		for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_offset+rp->node[n].alu_end; ++i) {
-- 
cgit v1.2.3


From ff6ab9b45b180ab9bf261afa50888e6e740d7924 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Sun, 18 Mar 2007 13:29:18 +0100
Subject: r300: Fix fragment program reordering

Do not move an instruction that writes to a temp forward past an instruction
that reads the same temporary.
---
 src/mesa/drivers/dri/r300/r300_context.h  |  5 +++++
 src/mesa/drivers/dri/r300/r300_fragprog.c | 37 ++++++++++++++++++++++++++-----
 2 files changed, 37 insertions(+), 5 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index bc43953ff3..29436ab9e0 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -674,6 +674,11 @@ struct reg_lifetime {
 	   emitted instruction that writes to the register */
 	int vector_valid;
 	int scalar_valid;
+	
+	/* Index to the slot where the register was last read.
+	   This is also the first slot in which the register may be written again */
+	int vector_lastread;
+	int scalar_lastread;
 };
 
 
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 3c54830312..89e9f6531a 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -1026,10 +1026,11 @@ static void emit_tex(struct r300_fragment_program *rp,
  */
 static int get_earliest_allowed_write(
 		struct r300_fragment_program* rp,
-		GLuint dest)
+		GLuint dest, int mask)
 {
 	COMPILE_STATE;
 	int idx;
+	int pos;
 	GLuint index = REG_GET_INDEX(dest);
 	assert(REG_GET_VALID(dest));
 
@@ -1047,7 +1048,17 @@ static int get_earliest_allowed_write(
 			return 0;
 	}
 	
-	return cs->hwtemps[idx].reserved;
+	pos = cs->hwtemps[idx].reserved;
+	if (mask & WRITEMASK_XYZ) {
+		if (pos < cs->hwtemps[idx].vector_lastread)
+			pos = cs->hwtemps[idx].vector_lastread;
+	}
+	if (mask & WRITEMASK_W) {
+		if (pos < cs->hwtemps[idx].scalar_lastread)
+			pos = cs->hwtemps[idx].scalar_lastread;
+	}
+	
+	return pos;
 }
 
 
@@ -1070,7 +1081,8 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 		GLboolean emit_sop,
 		int argc,
 		GLuint* src,
-		GLuint dest)
+		GLuint dest,
+		int mask)
 {
 	COMPILE_STATE;
 	int hwsrc[3];
@@ -1092,7 +1104,7 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 	if (emit_sop)
 		used |= SLOT_OP_SCALAR;
 	
-	pos = get_earliest_allowed_write(rp, dest);
+	pos = get_earliest_allowed_write(rp, dest, mask);
 	
 	if (rp->node[rp->cur_node].alu_offset > pos)
 		pos = rp->node[rp->cur_node].alu_offset;
@@ -1191,6 +1203,21 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 		cs->slot[pos].ssrc[i] = tempssrc[i];
 	}
 	
+	for(i = 0; i < argc; ++i) {
+		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+			int regnr = hwsrc[i] & 31;
+			
+			if (used & (SLOT_SRC_VECTOR << i)) {
+				if (cs->hwtemps[regnr].vector_lastread < pos)
+					cs->hwtemps[regnr].vector_lastread = pos;
+			}
+			if (used & (SLOT_SRC_SCALAR << i)) {
+				if (cs->hwtemps[regnr].scalar_lastread < pos)
+					cs->hwtemps[regnr].scalar_lastread = pos;
+			}
+		}
+	}
+	
 	// Emit the source fetch code
 	rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
 	rp->alu.inst[pos].inst1 |=
@@ -1287,7 +1314,7 @@ static void emit_arith(struct r300_fragment_program *rp,
 	if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
 		emit_sop = GL_TRUE;
 
-	pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest);
+	pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest, mask);
 	if (pos < 0)
 		return;
 	
-- 
cgit v1.2.3


From b645e8c96dc1e3b153cf882c8931f10e0c006f04 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Sun, 18 Mar 2007 18:32:32 +0100
Subject: r300: Streamlined fragment program LIT implementation

Fix a bug in the LIT implementation (clamp exponent to 128, not 0.5)
and change the implementation around. In theory, the new implementation
needs as little as 5 instruction slots. Unfortunately, the dependency
analysis in find_and_prepare_slot is not strong enough to look at
individual components of a register yet.
---
 src/mesa/drivers/dri/r300/r300_fragprog.c | 163 ++++++++++++++++++------------
 1 file changed, 101 insertions(+), 62 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 89e9f6531a..b0681e2808 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -492,7 +492,7 @@ static GLuint emit_param4fv(struct r300_fragment_program *rp,
 	return r;
 }
 
-static GLuint emit_const4fv(struct r300_fragment_program *rp, GLfloat *cp)
+static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp)
 { 
 	GLuint r = undef;
 	GLuint index;
@@ -1405,15 +1405,112 @@ static void make_sin_const(struct r300_fragment_program *rp)
 	}
 }
 
+/**
+ * Emit a LIT instruction.
+ * \p flags may be PFS_FLAG_SAT
+ *
+ * Definition of LIT (from ARB_fragment_program):
+ * tmp = VectorLoad(op0);
+ * if (tmp.x < 0) tmp.x = 0;
+ * if (tmp.y < 0) tmp.y = 0;
+ * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
+ * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
+ * result.x = 1.0;
+ * result.y = tmp.x;
+ * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
+ * result.w = 1.0;
+ *
+ * The longest path of computation is the one leading to result.z,
+ * consisting of 5 operations. This implementation of LIT takes
+ * 5 slots. So unless there's some special undocumented opcode,
+ * this implementation is potentially optimal. Unfortunately,
+ * emit_arith is a bit too conservative because it doesn't understand
+ * partial writes to the vector component.
+ */
+static void emit_lit(struct r300_fragment_program *rp,
+		GLuint dest,
+		int mask,
+		GLuint src,
+		int flags)
+{
+	COMPILE_STATE;
+	static const GLfloat cnstv[4] = { 127.999999, 127.999999, 127.999999, -127.999999 };
+	GLuint cnst;
+	int needTemporary;
+	GLuint temp;
+	
+	cnst = emit_const4fv(rp, cnstv);
+	
+	needTemporary = 0;
+	if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
+		needTemporary = 1;
+	} else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+		// LIT is typically followed by DP3/DP4, so there's no point
+		// in creating special code for this case
+		needTemporary = 1;
+	}
+	
+	if (needTemporary) {
+		temp = keep(get_temp_reg(rp));
+	} else {
+		temp = keep(dest);
+	}
+	
+	// Note: The order of emit_arith inside the slots is relevant,
+	// because emit_arith only looks at scalar vs. vector when resolving
+	// dependencies, and it does not consider individual vector components,
+	// so swizzling between the two parts can create fake dependencies.
+	
+	// First slot
+	emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_XY,
+	           keep(src), pfs_zero, undef, 0);
+	emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_W,
+	           src, cnst, undef, 0);
+	
+	// Second slot
+	emit_arith(rp, PFS_OP_MIN, temp, WRITEMASK_Z,
+	           swizzle(temp, W, W, W, W), cnst, undef, 0);
+	emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
+	           swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
+	
+	// Third slot
+	// If desired, we saturate the y result here.
+	// This does not affect the use as a condition variable in the CMP later
+	emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
+	           temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
+	emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
+	           swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
+	
+	// Fourth slot
+	emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+	           pfs_one, pfs_one, pfs_zero, 0);
+	emit_arith(rp, PFS_OP_EX2, temp, WRITEMASK_W,
+	           temp, undef, undef, 0);
+	
+	// Fifth slot
+	emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z,
+	           swizzle(temp, W, W, W, W), pfs_zero, swizzle(temp, Y, Y, Y, Y), flags);
+	emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
+	           pfs_one, pfs_one, pfs_zero, 0);
+	
+	if (needTemporary) {
+		emit_arith(rp, PFS_OP_MAD, dest, mask,
+			           temp, pfs_one, pfs_zero, flags);
+		free_temp(rp, temp);
+	} else {
+		// Decrease refcount of the destination
+		t_hw_dst(rp, dest, GL_FALSE, cs->nrslots);
+	}
+}
+
+
 static GLboolean parse_program(struct r300_fragment_program *rp)
 {	
 	struct gl_fragment_program *mp = &rp->mesa_program;
 	const struct prog_instruction *inst = mp->Base.Instructions;
 	struct prog_instruction *fpi;
 	GLuint src[3], dest, temp[2];
-	GLuint cnst;
 	int flags, mask = 0;
-	GLfloat cnstv[4] = {0.0, 0.0, 0.0, 0.0};
 
 	if (!inst || inst[0].Opcode == OPCODE_END) {
 		ERROR("empty program?\n");
@@ -1612,66 +1709,8 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 				   flags);
 			break;
 		case OPCODE_LIT:
-			/* LIT
-			 * if (s.x < 0) t.x = 0; else t.x = s.x;
-			 * if (s.y < 0) t.y = 0; else t.y = s.y;
-			 * if (s.w >  128.0) t.w =  128.0; else t.w = s.w;
-			 * if (s.w < -128.0) t.w = -128.0; else t.w = s.w;
-			 * r.x = 1.0
-			 * if (t.x > 0) r.y = pow(t.y, t.w); else r.y = 0;
-			 * Also r.y = 0 if t.y < 0
-			 * For the t.x > 0 FGLRX use the CMPH opcode which
-			 * change the compare to (t.x + 0.5) > 0.5 we may
-			 * save one instruction by doing CMP -t.x 
-			 */
-			cnstv[0] = cnstv[1] = cnstv[2] = cnstv[3] = 0.50001;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
-			temp[0] = get_temp_reg(rp);
-			cnst = emit_const4fv(rp, cnstv);
-			emit_arith(rp, PFS_OP_CMP, temp[0],
-				   WRITEMASK_X | WRITEMASK_Y,
-				   src[0], pfs_zero, src[0], flags);
-			emit_arith(rp, PFS_OP_MIN, temp[0], WRITEMASK_Z,
-				   swizzle(keep(src[0]), W, W, W, W),
-				   cnst, undef, flags);
-			emit_arith(rp, PFS_OP_LG2, temp[0], WRITEMASK_W,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   undef, undef, flags);
-			emit_arith(rp, PFS_OP_MAX, temp[0], WRITEMASK_Z,
-				   temp[0], negate(cnst), undef, flags);
-			emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
-				   temp[0], swizzle(temp[0], Z, Z, Z, Z),
-				   pfs_zero, flags);
-			emit_arith(rp, PFS_OP_EX2, temp[0], WRITEMASK_W,
-				   temp[0], undef, undef, flags);
-			emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Y,
-				   swizzle(keep(temp[0]), X, X, X, X),
-				   pfs_one, pfs_zero, flags);
-#if 0
-			emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
-				   temp[0], pfs_one, pfs_half, flags);
-			emit_arith(rp, PFS_OP_CMPH, temp[0], WRITEMASK_Z,
-				   swizzle(keep(temp[0]), W, W, W, W),
-				   pfs_zero, swizzle(keep(temp[0]), X, X, X, X),
-				   flags);
-#else
-			emit_arith(rp, PFS_OP_CMP, temp[0], WRITEMASK_Z,
-				   pfs_zero,
-				   swizzle(keep(temp[0]), W, W, W, W),
-				   negate(swizzle(keep(temp[0]), X, X, X, X)),
-				   flags);
-#endif
-			emit_arith(rp, PFS_OP_CMP, dest, WRITEMASK_Z,
-				   pfs_zero, temp[0],
-				   negate(swizzle(keep(temp[0]), Y, Y, Y, Y)),
-				   flags);
-			emit_arith(rp, PFS_OP_MAD, dest,
-				   WRITEMASK_X | WRITEMASK_W,
-				   pfs_one,
-				   pfs_one,
-				   pfs_zero,
-				   flags);
-			free_temp(rp, temp[0]);
+			emit_lit(rp, dest, mask, src[0], flags);
 			break;
 		case OPCODE_LRP:
 			src[0] = t_src(rp, fpi->SrcReg[0]);
-- 
cgit v1.2.3


From c4bf863f4cb48c2de284933bb1fc725b540ee810 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Mon, 19 Mar 2007 19:45:45 +0100
Subject: r300: Fix WRITEMASK handling when writing to result.depth

This is a necessary change to emit the right instructions when writing
to result.depth.

However, even with this test, Z-write doesn't work properly, and I don't
fully understand why. In addition to this, we'll at least have to disable
early-Z, but even that doesn't seem to be enough.
---
 src/mesa/drivers/dri/r300/r300_fragprog.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index b0681e2808..fb559e880a 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -1304,9 +1304,14 @@ static void emit_arith(struct r300_fragment_program *rp,
 	argc = r300_fpop[op].argc;
 
 	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
-	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR)
-		mask &= ~WRITEMASK_XYZ;
-	
+	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
+		if (mask & WRITEMASK_Z) {
+			mask = WRITEMASK_W;
+		} else {
+			return;
+		}
+	}
+
 	emit_vop = GL_FALSE;
 	emit_sop = GL_FALSE;
 	if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
-- 
cgit v1.2.3


From 7b992d024b20df111db007286e5a54afcb531fb1 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Mon, 19 Mar 2007 19:46:25 +0100
Subject: r300: Whitespace cleanup (remove trailing spaces)

---
 src/mesa/drivers/dri/r300/r300_fragprog.c | 218 +++++++++++++++---------------
 1 file changed, 109 insertions(+), 109 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index fb559e880a..93b9c39635 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -307,17 +307,17 @@ static int get_hw_temp(struct r300_fragment_program *rp, int slot)
 {
 	COMPILE_STATE;
 	int r;
-	
+
 	for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
 			break;
 	}
-	
+
 	if (r >= PFS_NUM_TEMP_REGS) {
 		ERROR("Out of hardware temps\n");
 		return 0;
 	}
-	
+
 	// Reserved is used to avoid the following scenario:
 	//  R300 temporary X is first assigned to Mesa temporary Y during vector ops
 	//  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
@@ -326,17 +326,17 @@ static int get_hw_temp(struct r300_fragment_program *rp, int slot)
 	// End scenario.
 	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 	cs->hwtemps[r].free = -1;
-	
+
 	// Reset to some value that won't mess things up when the user
 	// tries to read from a temporary that hasn't been assigned a value yet.
 	// In the normal case, vector_valid and scalar_valid should be set to
 	// a sane value by the first emit that writes to this temporary.
 	cs->hwtemps[r].vector_valid = 0;
 	cs->hwtemps[r].scalar_valid = 0;
-	
+
 	if (r > rp->max_temp_idx)
 		rp->max_temp_idx = r;
-	
+
 	return r;
 }
 
@@ -351,25 +351,25 @@ static int get_hw_temp_tex(struct r300_fragment_program *rp)
 	for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 		if (cs->used_in_node & (1 << r))
 			continue;
-		
+
 		// Note: Be very careful here
 		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
 			break;
 	}
-	
+
 	if (r >= PFS_NUM_TEMP_REGS)
 		return get_hw_temp(rp, 0); /* Will cause an indirection */
 
 	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 	cs->hwtemps[r].free = -1;
-	
+
 	// Reset to some value that won't mess things up when the user
 	// tries to read from a temporary that hasn't been assigned a value yet.
 	// In the normal case, vector_valid and scalar_valid should be set to
 	// a sane value by the first emit that writes to this temporary.
 	cs->hwtemps[r].vector_valid = cs->nrslots;
 	cs->hwtemps[r].scalar_valid = cs->nrslots;
-	
+
 	if (r > rp->max_temp_idx)
 		rp->max_temp_idx = r;
 
@@ -382,7 +382,7 @@ static int get_hw_temp_tex(struct r300_fragment_program *rp)
 static void free_hw_temp(struct r300_fragment_program *rp, int idx)
 {
 	COMPILE_STATE;
-	
+
 	// Be very careful here. Consider sequences like
 	//  MAD r0, r1,r2,r3
 	//  TEX r4, ...
@@ -457,7 +457,7 @@ static void free_temp(struct r300_fragment_program *rp, GLuint r)
 
 	if (!(cs->temp_in_use & (1 << index)))
 		return;
-	
+
 	if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
 		free_hw_temp(rp, cs->temps[index].reg);
 		cs->temps[index].reg = -1;
@@ -493,7 +493,7 @@ static GLuint emit_param4fv(struct r300_fragment_program *rp,
 }
 
 static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp)
-{ 
+{
 	GLuint r = undef;
 	GLuint index;
 
@@ -691,7 +691,7 @@ static GLuint do_swizzle(struct r300_fragment_program *rp,
 	    GLuint offset;
 	    for(i=0; i < 4; ++i){
 		offset = GET_SWZ(arbswz, i);
-		
+
 		newswz |= (offset <= 3)?GET_SWZ(vsrcswz, offset) << i*3:offset << i*3;
 	    }
 
@@ -800,7 +800,7 @@ static GLuint t_dst(struct r300_fragment_program *rp,
 		       struct prog_dst_register dest)
 {
 	GLuint r = undef;
-	
+
 	switch (dest.File) {
 	case PROGRAM_TEMPORARY:
 		REG_SET_INDEX(r, dest.Index);
@@ -910,19 +910,19 @@ static int t_hw_dst(struct r300_fragment_program *rp,
 		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
 		return 0;
 	}
-	
+
 	return idx;
 }
 
 static void emit_nop(struct r300_fragment_program *rp)
 {
 	COMPILE_STATE;
-	
+
 	if (cs->nrslots >= PFS_MAX_ALU_INST) {
 		ERROR("Out of ALU instruction slots\n");
 		return;
 	}
-	
+
 	rp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
 	rp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
 	rp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
@@ -940,7 +940,7 @@ static void emit_tex(struct r300_fragment_program *rp,
 	GLuint din = cs->dest_in_node, uin = cs->used_in_node;
 	int unit = fpi->TexSrcUnit;
 	int hwsrc, hwdest;
-	
+
 	/* Resolve source/dest to hardware registers */
 	hwsrc = t_hw_src(rp, coord, GL_TRUE);
 	if (opcode != R300_FPITX_OP_KIL) {
@@ -952,7 +952,7 @@ static void emit_tex(struct r300_fragment_program *rp,
 			dest = get_temp_reg_tex(rp);
 		}
 		hwdest = t_hw_dst(rp, dest, GL_TRUE, rp->node[rp->cur_node].alu_offset);
-		
+
 		/* Use a temp that hasn't been used in this node, rather
 		 * than causing an indirection
 		 */
@@ -965,17 +965,17 @@ static void emit_tex(struct r300_fragment_program *rp,
 		hwdest = 0;
 		unit = 0;
 	}
-	
+
 	/* Indirection if source has been written in this node, or if the
 	 * dest has been read/written in this node
 	 */
 	if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
 	     (din & (1<<hwsrc))) || (uin & (1<<hwdest))) {
-			
+
 		/* Finish off current node */
 		if (rp->node[rp->cur_node].alu_offset == cs->nrslots)
 			emit_nop(rp);
-				
+
 		rp->node[rp->cur_node].alu_end =
 				cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
 		assert(rp->node[rp->cur_node].alu_end >= 0);
@@ -989,12 +989,12 @@ static void emit_tex(struct r300_fragment_program *rp,
 		rp->node[rp->cur_node].tex_offset = rp->tex.length;
 		rp->node[rp->cur_node].alu_offset = cs->nrslots;
 		rp->node[rp->cur_node].tex_end = -1;
-		rp->node[rp->cur_node].alu_end = -1;	
+		rp->node[rp->cur_node].alu_end = -1;
 		rp->node[rp->cur_node].flags = 0;
 		cs->used_in_node = 0;
 		cs->dest_in_node = 0;
 	}
-	
+
 	if (rp->cur_node == 0)
 		rp->first_node_has_tex = 1;
 
@@ -1005,7 +1005,7 @@ static void emit_tex(struct r300_fragment_program *rp,
 		/* not entirely sure about this */
 		| (opcode << R300_FPITX_OPCODE_SHIFT);
 
-	cs->dest_in_node |= (1 << hwdest); 
+	cs->dest_in_node |= (1 << hwdest);
 	if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
 		cs->used_in_node |= (1 << hwsrc);
 
@@ -1038,7 +1038,7 @@ static int get_earliest_allowed_write(
 		case REG_TYPE_TEMP:
 			if (cs->temps[index].reg == -1)
 				return 0;
-			
+
 			idx = cs->temps[index].reg;
 			break;
 		case REG_TYPE_OUTPUT:
@@ -1047,7 +1047,7 @@ static int get_earliest_allowed_write(
 			ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
 			return 0;
 	}
-	
+
 	pos = cs->hwtemps[idx].reserved;
 	if (mask & WRITEMASK_XYZ) {
 		if (pos < cs->hwtemps[idx].vector_lastread)
@@ -1057,7 +1057,7 @@ static int get_earliest_allowed_write(
 		if (pos < cs->hwtemps[idx].scalar_lastread)
 			pos = cs->hwtemps[idx].scalar_lastread;
 	}
-	
+
 	return pos;
 }
 
@@ -1094,7 +1094,7 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 	int pos;
 	int regnr;
 	int i,j;
-	
+
 	// Determine instruction slots, whether sources are required on
 	// vector or scalar side, and the smallest slot number where
 	// all source registers are available
@@ -1103,9 +1103,9 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 		used |= SLOT_OP_VECTOR;
 	if (emit_sop)
 		used |= SLOT_OP_SCALAR;
-	
+
 	pos = get_earliest_allowed_write(rp, dest, mask);
-	
+
 	if (rp->node[rp->cur_node].alu_offset > pos)
 		pos = rp->node[rp->cur_node].alu_offset;
 	for(i = 0; i < argc; ++i) {
@@ -1115,10 +1115,10 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 			if (emit_sop)
 				used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
 		}
-		
+
 		hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
 		regnr = hwsrc[i] & 31;
-		
+
 		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
 			if (used & (SLOT_SRC_VECTOR << i)) {
 				if (cs->hwtemps[regnr].vector_valid > pos)
@@ -1130,12 +1130,12 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 			}
 		}
 	}
-	
+
 	// Find a slot that fits
 	for(; ; ++pos) {
 		if (cs->slot[pos].used & used & SLOT_OP_BOTH)
 			continue;
-		
+
 		if (pos >= cs->nrslots) {
 			if (cs->nrslots >= PFS_MAX_ALU_INST) {
 				ERROR("Out of ALU instruction slots\n");
@@ -1147,7 +1147,7 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 
 			cs->nrslots++;
 		}
-		
+
 		// Note: When we need both parts (vector and scalar) of a source,
 		// we always try to put them into the same position. This makes the
 		// code easier to read, and it is optimal (i.e. one doesn't gain
@@ -1158,32 +1158,32 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 			tempvsrc[i] = cs->slot[pos].vsrc[i];
 			tempssrc[i] = cs->slot[pos].ssrc[i];
 		}
-		
+
 		for(i = 0; i < argc; ++i) {
 			int flags = (used >> i) & SLOT_SRC_BOTH;
-			
+
 			if (!flags) {
 				srcpos[i] = 0;
 				continue;
 			}
-			
+
 			for(j = 0; j < 3; ++j) {
 				if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
 					if (tempvsrc[j] != hwsrc[i])
 						continue;
 				}
-			
+
 				if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
 					if (tempssrc[j] != hwsrc[i])
 						continue;
 				}
-				
+
 				break;
 			}
-			
+
 			if (j == 3)
 				break;
-			
+
 			srcpos[i] = j;
 			tempused |= flags << j;
 			if (flags & SLOT_SRC_VECTOR)
@@ -1191,22 +1191,22 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 			if (flags & SLOT_SRC_SCALAR)
 				tempssrc[j] = hwsrc[i];
 		}
-		
+
 		if (i == argc)
 			break;
 	}
-	
+
 	// Found a slot, reserve it
 	cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
 	for(i = 0; i < 3; ++i) {
 		cs->slot[pos].vsrc[i] = tempvsrc[i];
 		cs->slot[pos].ssrc[i] = tempssrc[i];
 	}
-	
+
 	for(i = 0; i < argc; ++i) {
 		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
 			int regnr = hwsrc[i] & 31;
-			
+
 			if (used & (SLOT_SRC_VECTOR << i)) {
 				if (cs->hwtemps[regnr].vector_lastread < pos)
 					cs->hwtemps[regnr].vector_lastread = pos;
@@ -1217,24 +1217,24 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 			}
 		}
 	}
-	
+
 	// Emit the source fetch code
 	rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
 	rp->alu.inst[pos].inst1 |=
 			((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
 			 (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
 			 (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
-	
+
 	rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
 	rp->alu.inst[pos].inst3 |=
 			((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
 			 (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
 			 (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
-	
+
 	// Emit the argument selection code
 	if (emit_vop) {
 		int swz[3];
-		
+
 		for(i = 0; i < 3; ++i) {
 			if (i < argc) {
 				swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
@@ -1245,7 +1245,7 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 				swz[i] = R300_FPI0_ARGC_ZERO;
 			}
 		}
-		
+
 		rp->alu.inst[pos].inst0 &=
 				~(R300_FPI0_ARG0C_MASK|R300_FPI0_ARG1C_MASK|R300_FPI0_ARG2C_MASK);
 		rp->alu.inst[pos].inst0 |=
@@ -1253,10 +1253,10 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 				(swz[1] << R300_FPI0_ARG1C_SHIFT) |
 				(swz[2] << R300_FPI0_ARG2C_SHIFT);
 	}
-	
+
 	if (emit_sop) {
 		int swz[3];
-		
+
 		for(i = 0; i < 3; ++i) {
 			if (i < argc) {
 				swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
@@ -1267,7 +1267,7 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 				swz[i] = R300_FPI2_ARGA_ZERO;
 			}
 		}
-		
+
 		rp->alu.inst[pos].inst2 &=
 				~(R300_FPI2_ARG0A_MASK|R300_FPI2_ARG1A_MASK|R300_FPI2_ARG2A_MASK);
 		rp->alu.inst[pos].inst2 |=
@@ -1322,9 +1322,9 @@ static void emit_arith(struct r300_fragment_program *rp,
 	pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest, mask);
 	if (pos < 0)
 		return;
-	
+
 	hwdest = t_hw_dst(rp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
-	
+
 	if (flags & PFS_FLAG_SAT) {
 		vop |= R300_FPI0_OUTC_SAT;
 		sop |= R300_FPI2_OUTA_SAT;
@@ -1335,7 +1335,7 @@ static void emit_arith(struct r300_fragment_program *rp,
 		rp->alu.inst[pos].inst0 |= vop;
 
 		rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
-		
+
 		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 			if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
 				rp->alu.inst[pos].inst1 |=
@@ -1344,7 +1344,7 @@ static void emit_arith(struct r300_fragment_program *rp,
 		} else {
 			rp->alu.inst[pos].inst1 |=
 					(mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT;
-			
+
 			cs->hwtemps[hwdest].vector_valid = pos+1;
 		}
 	}
@@ -1356,7 +1356,7 @@ static void emit_arith(struct r300_fragment_program *rp,
 		if (mask & WRITEMASK_W) {
 			if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 				if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-					rp->alu.inst[pos].inst3 |= 
+					rp->alu.inst[pos].inst3 |=
 							(hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT;
 				} else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
 					rp->alu.inst[pos].inst3 |= R300_FPI3_DSTA_DEPTH;
@@ -1364,12 +1364,12 @@ static void emit_arith(struct r300_fragment_program *rp,
 			} else {
 				rp->alu.inst[pos].inst3 |=
 						(hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG;
-			
+
 				cs->hwtemps[hwdest].scalar_valid = pos+1;
 			}
 		}
 	}
-	
+
 	return;
 }
 
@@ -1443,9 +1443,9 @@ static void emit_lit(struct r300_fragment_program *rp,
 	GLuint cnst;
 	int needTemporary;
 	GLuint temp;
-	
+
 	cnst = emit_const4fv(rp, cnstv);
-	
+
 	needTemporary = 0;
 	if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
 		needTemporary = 1;
@@ -1454,30 +1454,30 @@ static void emit_lit(struct r300_fragment_program *rp,
 		// in creating special code for this case
 		needTemporary = 1;
 	}
-	
+
 	if (needTemporary) {
 		temp = keep(get_temp_reg(rp));
 	} else {
 		temp = keep(dest);
 	}
-	
+
 	// Npte: The order of emit_arith inside the slots is relevant,
 	// because emit_arith only looks at scalar vs. vector when resolving
 	// dependencies, and it does not consider individual vector components,
 	// so swizzling between the two parts can create fake dependencies.
-	
+
 	// First slot
 	emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_XY,
 	           keep(src), pfs_zero, undef, 0);
 	emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_W,
 	           src, cnst, undef, 0);
-	
+
 	// Second slot
 	emit_arith(rp, PFS_OP_MIN, temp, WRITEMASK_Z,
 	           swizzle(temp, W, W, W, W), cnst, undef, 0);
 	emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
 	           swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
-	
+
 	// Third slot
 	// If desired, we saturate the y result here.
 	// This does not affect the use as a condition variable in the CMP later
@@ -1485,19 +1485,19 @@ static void emit_lit(struct r300_fragment_program *rp,
 	           temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
 	emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
 	           swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
-	
+
 	// Fourth slot
 	emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
 	           pfs_one, pfs_one, pfs_zero, 0);
 	emit_arith(rp, PFS_OP_EX2, temp, WRITEMASK_W,
 	           temp, undef, undef, 0);
-	
+
 	// Fifth slot
 	emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z,
 	           swizzle(temp, W, W, W, W), pfs_zero, swizzle(temp, Y, Y, Y, Y), flags);
 	emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
 	           pfs_one, pfs_one, pfs_zero, 0);
-	
+
 	if (needTemporary) {
 		emit_arith(rp, PFS_OP_MAD, dest, mask,
 			           temp, pfs_one, pfs_zero, flags);
@@ -1510,7 +1510,7 @@ static void emit_lit(struct r300_fragment_program *rp,
 
 
 static GLboolean parse_program(struct r300_fragment_program *rp)
-{	
+{
 	struct gl_fragment_program *mp = &rp->mesa_program;
 	const struct prog_instruction *inst = mp->Base.Instructions;
 	struct prog_instruction *fpi;
@@ -1604,7 +1604,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
 				   swizzle(temp[0], X, X, X, X),
 				   0);
-			
+
 			emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
 				   swizzle(temp[0], X, X, X, X),
 				   absolute(swizzle(temp[0], X, X, X, X)),
@@ -1648,12 +1648,12 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 				   0);
 			emit_arith(rp, PFS_OP_DP4, dest, mask,
 				   temp[0], src[1], undef,
-				   flags);	
+				   flags);
 			free_temp(rp, temp[0]);
 #else
 			emit_arith(rp, PFS_OP_DP4, dest, mask,
 				   swizzle(src[0], X, Y, Z, ONE), src[1],
-				   undef, flags);	
+				   undef, flags);
 #endif
 			break;
 		case OPCODE_DST:
@@ -1684,7 +1684,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 				   src[0], undef, undef,
 				   flags);
 			break;
-		case OPCODE_FLR:		
+		case OPCODE_FLR:
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			temp[0] = get_temp_reg(rp);
 			/* FRC temp, src0
@@ -1734,7 +1734,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 				   src[0], src[1], temp[0],
 				   flags);
 			free_temp(rp, temp[0]);
-			break;			
+			break;
 		case OPCODE_MAD:
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
@@ -1761,7 +1761,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 		case OPCODE_SWZ:
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			emit_arith(rp, PFS_OP_MAD, dest, mask,
-				   src[0], pfs_one, pfs_zero, 
+				   src[0], pfs_one, pfs_zero,
 				   flags);
 			break;
 		case OPCODE_MUL:
@@ -1774,7 +1774,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 		case OPCODE_POW:
 			src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
 			src[1] = t_scalar_src(rp, fpi->SrcReg[1]);
-			temp[0] = get_temp_reg(rp);	
+			temp[0] = get_temp_reg(rp);
 			emit_arith(rp, PFS_OP_LG2, temp[0], WRITEMASK_W,
 				   src[0], undef, undef,
 				   0);
@@ -1932,7 +1932,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
 				   swizzle(temp[0], X, X, X, X),
 				   0);
-			
+
 			emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
 				   swizzle(temp[0], X, X, X, X),
 				   absolute(swizzle(temp[0], X, X, X, X)),
@@ -1989,7 +1989,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
 				   swizzle(keep(src[1]), Y, Z, X, W),
 				   pfs_zero,
 				   0);
-			/* dest.xyz = src0.yzx * src1.zxy - temp 
+			/* dest.xyz = src0.yzx * src1.zxy - temp
 			 * dest.w	= undefined
 			 * */
 			emit_arith(rp, PFS_OP_MAD, dest, mask & WRITEMASK_XYZ,
@@ -2089,7 +2089,7 @@ static void insert_wpos(struct gl_program *prog)
 	fpi = &prog->Instructions[prog->NumInstructions-1];
 
 	assert(fpi->Opcode == OPCODE_END);
-	
+
 	for(fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++){
 		for(i=0; i<3; i++)
 		    if( fpi->SrcReg[i].File == PROGRAM_INPUT &&
@@ -2106,7 +2106,7 @@ static void insert_wpos(struct gl_program *prog)
 static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 {
 	struct r300_pfs_compile_state *cs = NULL;
-	struct gl_fragment_program *mp = &rp->mesa_program;	
+	struct gl_fragment_program *mp = &rp->mesa_program;
 	struct prog_instruction *fpi;
 	GLuint InputsRead = mp->Base.InputsRead;
 	GLuint temps_used = 0; /* for rp->temps[] */
@@ -2127,7 +2127,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 	rp->node[0].alu_end = -1;
 	rp->node[0].tex_end = -1;
 	rp->const_sin[0] = -1;
-	
+
 	_mesa_memset(cs, 0, sizeof(*rp->cs));
 	for (i=0;i<PFS_MAX_ALU_INST;i++) {
 		for (j=0;j<3;j++) {
@@ -2135,7 +2135,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 			cs->slot[i].ssrc[j] = SRC_CONST;
 		}
 	}
-	
+
 	/* Work out what temps the Mesa inputs correspond to, this must match
 	 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
 	 * configures itself based on the fragprog's InputsRead
@@ -2167,7 +2167,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 		cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0);
 	}
 	InputsRead &= ~FRAG_BIT_COL0;
-	
+
 	/* Secondary color */
 	if (InputsRead & FRAG_BIT_COL1) {
 		cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
@@ -2194,7 +2194,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 
 	for (fpi=mp->Base.Instructions;fpi->Opcode != OPCODE_END; fpi++) {
 		int idx;
-		
+
 		for (i=0;i<3;i++) {
 			idx = fpi->SrcReg[i].Index;
 			switch (fpi->SrcReg[i].File) {
@@ -2246,7 +2246,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
 	struct r300_pfs_compile_state *cs = NULL;
 
 	if (!rp->translated) {
-		
+
 		init_program(r300, rp);
 		cs = rp->cs;
 
@@ -2254,7 +2254,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
 			dump_program(rp);
 			return;
 		}
-		
+
 		/* Finish off */
 		rp->node[rp->cur_node].alu_end =
 				cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
@@ -2266,9 +2266,9 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
 		rp->tex_end    = rp->tex.length ? rp->tex.length - 1 : 0;
 		assert(rp->node[rp->cur_node].alu_end >= 0);
 		assert(rp->alu_end >= 0);
-	
+
 		rp->translated = GL_TRUE;
-		if (0) dump_program(rp);
+		if (1) dump_program(rp);
 		r300UpdateStateParameters(rp->ctx, _NEW_PROGRAM);
 	}
 
@@ -2282,7 +2282,7 @@ static void dump_program(struct r300_fragment_program *rp)
 	static int pc = 0;
 
 	fprintf(stderr, "pc=%d*************************************\n", pc++);
-			
+
 	fprintf(stderr, "Mesa program:\n");
 	fprintf(stderr, "-------------\n");
 		_mesa_print_program(&rp->mesa_program.Base);
@@ -2290,7 +2290,7 @@ static void dump_program(struct r300_fragment_program *rp)
 
 	fprintf(stderr, "Hardware program\n");
 	fprintf(stderr, "----------------\n");
-	
+
 	for (n = 0; n < (rp->cur_node+1); n++) {
 		fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\
 			"alu_end: %d, tex_end: %d\n", n,
@@ -2298,12 +2298,12 @@ static void dump_program(struct r300_fragment_program *rp)
 			rp->node[n].tex_offset,
 			rp->node[n].alu_end,
 			rp->node[n].tex_end);
-		
+
 		if (rp->tex.length) {
 			fprintf(stderr, "  TEX:\n");
 			for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i) {
 				const char* instr;
-				
+
 				switch((rp->tex.inst[i] >> R300_FPITX_OPCODE_SHIFT) & 15) {
 				case R300_FPITX_OP_TEX:
 					instr = "TEX";
@@ -2320,7 +2320,7 @@ static void dump_program(struct r300_fragment_program *rp)
 				default:
 					instr = "UNKNOWN";
 				}
-				
+
 				fprintf(stderr, "    %s t%i, %c%i, texture[%i]   (%08x)\n",
 						instr,
 						(rp->tex.inst[i] >> R300_FPITX_DST_SHIFT) & 31,
@@ -2330,22 +2330,22 @@ static void dump_program(struct r300_fragment_program *rp)
 						rp->tex.inst[i]);
 			}
 		}
-		
+
 		for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_offset+rp->node[n].alu_end; ++i) {
 			char srcc[3][10], dstc[20];
 			char srca[3][10], dsta[20];
 			char argc[3][20];
 			char arga[3][20];
 			char flags[5], tmp[10];
-			
+
 			for(j = 0; j < 3; ++j) {
 				int regc = rp->alu.inst[i].inst1 >> (j*6);
 				int rega = rp->alu.inst[i].inst3 >> (j*6);
-				
+
 				sprintf(srcc[j], "%c%i", (regc & 32) ? 'c' : 't', regc & 31);
 				sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31);
 			}
-			
+
 			dstc[0] = 0;
 			sprintf(flags, "%s%s%s",
 					(rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "",
@@ -2366,7 +2366,7 @@ static void dump_program(struct r300_fragment_program *rp)
 						flags);
 				strcat(dstc, tmp);
 			}
-			
+
 			dsta[0] = 0;
 			if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) {
 				sprintf(dsta, "t%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
@@ -2378,13 +2378,13 @@ static void dump_program(struct r300_fragment_program *rp)
 			if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) {
 				strcat(dsta, "Z");
 			}
-			
+
 			fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
 			                "       w: %3s %3s %3s -> %-20s (%08x)\n",
 					i,
 					srcc[0], srcc[1], srcc[2], dstc, rp->alu.inst[i].inst1,
 					srca[0], srca[1], srca[2], dsta, rp->alu.inst[i].inst3);
-			
+
 			for(j = 0; j < 3; ++j) {
 				int regc = rp->alu.inst[i].inst0 >> (j*7);
 				int rega = rp->alu.inst[i].inst2 >> (j*7);
@@ -2431,13 +2431,13 @@ static void dump_program(struct r300_fragment_program *rp)
 				} else {
 					sprintf(buf, "%i", d);
 				}
-				
+
 				sprintf(argc[j], "%s%s%s%s",
 						(regc & 32) ? "-" : "",
 						(regc & 64) ? "|" : "",
 						buf,
 						(regc & 64) ? "|" : "");
-			
+
 				d = rega & 31;
 				if (d < 9) {
 					sprintf(buf, "%s.%c", srcc[d / 3], 'x' + (char)(d%3));
@@ -2452,14 +2452,14 @@ static void dump_program(struct r300_fragment_program *rp)
 				} else {
 					sprintf(buf, "%i", d);
 				}
-				
+
 				sprintf(arga[j], "%s%s%s%s",
 						(rega & 32) ? "-" : "",
 						(rega & 64) ? "|" : "",
 						buf,
 						(rega & 64) ? "|" : "");
 			}
-			
+
 			fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
 			                "       w: %8s %8s %8s    op: %08x\n",
 					argc[0], argc[1], argc[2], rp->alu.inst[i].inst0,
-- 
cgit v1.2.3


From 826815a5d27d6e79e9d0e0b0fc63bb3fd092d40d Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Mon, 19 Mar 2007 20:01:20 +0100
Subject: r300: Dump fragment program after translation if RADEON_DEBUG=pixel
 is set

---
 src/mesa/drivers/dri/r300/r300_fragprog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 93b9c39635..6262dc7a44 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -2268,7 +2268,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
 		assert(rp->alu_end >= 0);
 
 		rp->translated = GL_TRUE;
-		if (1) dump_program(rp);
+		if (RADEON_DEBUG & DEBUG_PIXEL) dump_program(rp);
 		r300UpdateStateParameters(rp->ctx, _NEW_PROGRAM);
 	}
 
-- 
cgit v1.2.3


From b3acba87d7f5ede486cba11db036cf36dff6c29e Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Mon, 19 Mar 2007 22:17:16 +0100
Subject: r300: Clear fragment program instruction slots on first use

Make sure that instruction slots are fully initialized with NOPs during
find_and_prepare_slot(). This fixes a bug when a fragment program was
translated more than once (e.g. due to a second call to glProgramStringARB).

This partially fixes glean/fragProg1.
---
 src/mesa/drivers/dri/r300/r300_fragprog.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 6262dc7a44..3f9d83f109 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -1143,7 +1143,9 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp,
 			}
 
 			rp->alu.inst[pos].inst0 = NOP_INST0;
+			rp->alu.inst[pos].inst1 = NOP_INST1;
 			rp->alu.inst[pos].inst2 = NOP_INST2;
+			rp->alu.inst[pos].inst3 = NOP_INST3;
 
 			cs->nrslots++;
 		}
-- 
cgit v1.2.3


From 5a6547878373798113f8b55b912abc5bfb93add5 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Mon, 19 Mar 2007 22:26:08 +0100
Subject: r300: Fix special case (tmp.x <= 0) in fragment program LIT
 instruction

Also, fix a typo in a related comment.
---
 src/mesa/drivers/dri/r300/r300_fragprog.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 3f9d83f109..1d462ebec8 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -1463,7 +1463,7 @@ static void emit_lit(struct r300_fragment_program *rp,
 		temp = keep(dest);
 	}
 
-	// Npte: The order of emit_arith inside the slots is relevant,
+	// Note: The order of emit_arith inside the slots is relevant,
 	// because emit_arith only looks at scalar vs. vector when resolving
 	// dependencies, and it does not consider individual vector components,
 	// so swizzling between the two parts can create fake dependencies.
@@ -1496,7 +1496,7 @@ static void emit_lit(struct r300_fragment_program *rp,
 
 	// Fifth slot
 	emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z,
-	           swizzle(temp, W, W, W, W), pfs_zero, swizzle(temp, Y, Y, Y, Y), flags);
+	           pfs_zero, swizzle(temp, W, W, W, W), negate(swizzle(temp, Y, Y, Y, Y)), flags);
 	emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
 	           pfs_one, pfs_one, pfs_zero, 0);
 
-- 
cgit v1.2.3


From 61821a41c07b6b383a275acf31ade56af2ecfb3c Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Mon, 19 Mar 2007 23:32:36 +0100
Subject: r300: Cleanup fragment program constant allocation, share constants

The constant/parameter allocation was significantly simplified, removing
one unnecessary copy operation of parameters. The dirty state tracking is
unchanged and far from optimal, since all state is always re-fetched.

Constants and parameters are now emitted only once, which significantly
reduces the resource pressure on larger programs.
---
 src/mesa/drivers/dri/r300/r300_context.h  |  20 +++---
 src/mesa/drivers/dri/r300/r300_fragprog.c | 114 ++++++++++++++----------------
 2 files changed, 61 insertions(+), 73 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index 29436ab9e0..bbe44f5e7f 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -767,23 +767,21 @@ struct r300_fragment_program {
 	int tex_offset;
 	int tex_end;
 
-	/* Hardware constants */
-	GLfloat constant[PFS_NUM_CONST_REGS][4];
+	/* Hardware constants.
+	 * Contains a pointer to the value. The destination of the pointer
+	 * is supposed to be updated when GL state changes.
+	 * Typically, this is either a pointer into
+	 * gl_program_parameter_list::ParameterValues, or a pointer to a
+	 * global constant (e.g. for sin/cos-approximation)
+	 */
+	const GLfloat* constant[PFS_NUM_CONST_REGS];
 	int const_nr;
 
-	/* Tracked parameters */
-	struct {
-		int idx;			/* hardware index */
-		GLfloat *values;	/* pointer to values */
-	} param[PFS_NUM_CONST_REGS];
-	int param_nr;
-	GLboolean params_uptodate;
-
 	int max_temp_idx;
 
 	/* the index of the sin constant is stored here */
 	GLint const_sin[2];
-	
+
 	GLuint optimization;
 };
 
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 1d462ebec8..2145c48b80 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -468,47 +468,39 @@ static void free_temp(struct r300_fragment_program *rp, GLuint r)
 	}
 }
 
-static GLuint emit_param4fv(struct r300_fragment_program *rp,
-			    GLfloat *values)
+/**
+ * Emit a hardware constant/parameter.
+ *
+ * \p cp Stable pointer to an array of 4 floats.
+ *  The pointer must be stable in the sense that it remains valid
+ *  and holds the contents of the constant/parameter throughout the lifetime
+ *  of the fragment program (actually, up until the next time the fragment
+ *  program is translated).
+ */
+static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp)
 {
-	GLuint r = undef;
-	GLuint index;
-	int pidx;
+	GLuint reg = undef;
+	int index;
 
-	pidx = rp->param_nr++;
-	index = rp->const_nr++;
-	if (pidx >= PFS_NUM_CONST_REGS || index >= PFS_NUM_CONST_REGS) {
-		ERROR("Out of const/param slots!\n");
-		return r;
+	for(index = 0; index < rp->const_nr; ++index) {
+		if (rp->constant[index] == cp)
+			break;
 	}
 
-	rp->param[pidx].idx = index;
-	rp->param[pidx].values = values;
-	rp->params_uptodate = GL_FALSE;
-
-	REG_SET_TYPE(r, REG_TYPE_CONST);
-	REG_SET_INDEX(r, index);
-	REG_SET_VALID(r, GL_TRUE);
-	return r;
-}
-
-static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp)
-{
-	GLuint r = undef;
-	GLuint index;
+	if (index >= rp->const_nr) {
+		if (index >= PFS_NUM_CONST_REGS) {
+			ERROR("Out of hw constants!\n");
+			return reg;
+		}
 
-	index = rp->const_nr++;
-	if (index >= PFS_NUM_CONST_REGS) {
-		ERROR("Out of hw constants!\n");
-		return r;
+		rp->const_nr++;
+		rp->constant[index] = cp;
 	}
 
-	COPY_4V(rp->constant[index], cp);
-
-	REG_SET_TYPE(r, REG_TYPE_CONST);
-	REG_SET_INDEX(r, index);
-	REG_SET_VALID(r, GL_TRUE);
-	return r;
+	REG_SET_TYPE(reg, REG_TYPE_CONST);
+	REG_SET_INDEX(reg, index);
+	REG_SET_VALID(reg, GL_TRUE);
+	return reg;
 }
 
 static inline GLuint negate(GLuint r)
@@ -762,16 +754,16 @@ static GLuint t_src(struct r300_fragment_program *rp,
 		REG_SET_TYPE(r, REG_TYPE_INPUT);
 		break;
 	case PROGRAM_LOCAL_PARAM:
-		r = emit_param4fv(rp,
+		r = emit_const4fv(rp,
 				  rp->mesa_program.Base.LocalParams[fpsrc.Index]);
 		break;
 	case PROGRAM_ENV_PARAM:
-		r = emit_param4fv(rp,
+		r = emit_const4fv(rp,
 				  rp->ctx->FragmentProgram.Parameters[fpsrc.Index]);
 		break;
 	case PROGRAM_STATE_VAR:
 	case PROGRAM_NAMED_PARAM:
-		r = emit_param4fv(rp,
+		r = emit_const4fv(rp,
 				  rp->mesa_program.Base.Parameters->ParameterValues[fpsrc.Index]);
 		break;
 	default:
@@ -1393,22 +1385,27 @@ static GLuint get_attrib(struct r300_fragment_program *rp, GLuint attr)
 }
 #endif
 
+static GLfloat SinCosConsts[2][4] = {
+	{
+		1.273239545,  // 4/PI
+		-0.405284735, // -4/(PI*PI)
+		3.141592654,  // PI
+		0.2225        // weight
+	},
+	{
+		0.75,
+		0.0,
+		0.159154943,  // 1/(2*PI)
+		6.283185307   // 2*PI
+	}
+};
+
+
 static void make_sin_const(struct r300_fragment_program *rp)
 {
-	if(rp->const_sin[0] == -1){
-	    GLfloat cnstv[4];
-
-	    cnstv[0] = 1.273239545; // 4/PI
-	    cnstv[1] =-0.405284735; // -4/(PI*PI)
-	    cnstv[2] = 3.141592654; // PI
-	    cnstv[3] = 0.2225;      // weight
-	    rp->const_sin[0] = emit_const4fv(rp, cnstv);
-
-	    cnstv[0] = 0.75;
-	    cnstv[1] = 0.0;
-	    cnstv[2] = 0.159154943; // 1/(2*PI)
-	    cnstv[3] = 6.283185307; // 2*PI
-	    rp->const_sin[1] = emit_const4fv(rp, cnstv);
+	if(rp->const_sin[0] == -1) {
+		rp->const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
+		rp->const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
 	}
 }
 
@@ -1434,6 +1431,8 @@ static void make_sin_const(struct r300_fragment_program *rp)
  * emit_arith is a bit too conservative because it doesn't understand
  * partial writes to the vector component.
  */
+static const GLfloat LitConst[4] = { 127.999999, 127.999999, 127.999999, -127.999999 };
+
 static void emit_lit(struct r300_fragment_program *rp,
 		GLuint dest,
 		int mask,
@@ -1441,12 +1440,11 @@ static void emit_lit(struct r300_fragment_program *rp,
 		int flags)
 {
 	COMPILE_STATE;
-	static const GLfloat cnstv[4] = { 127.999999, 127.999999, 127.999999, -127.999999 };
 	GLuint cnst;
 	int needTemporary;
 	GLuint temp;
 
-	cnst = emit_const4fv(rp, cnstv);
+	cnst = emit_const4fv(rp, LitConst);
 
 	needTemporary = 0;
 	if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
@@ -2123,8 +2121,6 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 	rp->cur_node   = 0;
 	rp->first_node_has_tex = 0;
 	rp->const_nr   = 0;
-	rp->param_nr   = 0;
-	rp->params_uptodate = GL_FALSE;
 	rp->max_temp_idx = 0;
 	rp->node[0].alu_end = -1;
 	rp->node[0].tex_end = -1;
@@ -2231,16 +2227,10 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
 static void update_params(struct r300_fragment_program *rp)
 {
 	struct gl_fragment_program *mp = &rp->mesa_program;
-	int i;
 
 	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (rp->param_nr)
+	if (mp->Base.Parameters)
 		_mesa_load_state_parameters(rp->ctx, mp->Base.Parameters);
-
-	for (i=0;i<rp->param_nr;i++)
-		COPY_4V(rp->constant[rp->param[i].idx], rp->param[i].values);
-
-	rp->params_uptodate = GL_TRUE;
 }
 
 void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_program *rp)
-- 
cgit v1.2.3


From 9622a634f61f02ed1a23087762a2ec8a305ae77e Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Tue, 20 Mar 2007 15:05:35 +1100
Subject: nouveau: NVSDBG macro

---
 src/mesa/drivers/dri/nouveau/nouveau_shader.c   | 12 ++++++++++++
 src/mesa/drivers/dri/nouveau/nouveau_shader.h   |  6 ++++++
 src/mesa/drivers/dri/nouveau/nouveau_shader_0.c |  2 ++
 src/mesa/drivers/dri/nouveau/nouveau_shader_1.c |  2 ++
 src/mesa/drivers/dri/nouveau/nouveau_shader_2.c |  2 ++
 5 files changed, 24 insertions(+)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/nouveau/nouveau_shader.c b/src/mesa/drivers/dri/nouveau/nouveau_shader.c
index ba471325aa..9cb837ff3b 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_shader.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_shader.c
@@ -48,6 +48,7 @@
 static void
 nouveauBindProgram(GLcontext *ctx, GLenum target, struct gl_program *prog)
 {
+   NVSDBG("target=%s, prog=%p\n", _mesa_lookup_enum_by_nr(target), prog);
 }
 
 static struct gl_program *
@@ -55,7 +56,10 @@ nouveauNewProgram(GLcontext *ctx, GLenum target, GLuint id)
 {
    nouveauShader *nvs;
 
+   NVSDBG("target=%s, id=%d\n", _mesa_lookup_enum_by_nr(target), id);
+
    nvs = CALLOC_STRUCT(_nouveauShader);
+   NVSDBG("prog=%p\n", nvs);
    switch (target) {
    case GL_VERTEX_PROGRAM_ARB:
       return _mesa_init_vertex_program(ctx, &nvs->mesa.vp, target, id);
@@ -75,6 +79,8 @@ nouveauDeleteProgram(GLcontext *ctx, struct gl_program *prog)
 {
    nouveauShader *nvs = (nouveauShader *)prog;
 
+   NVSDBG("prog=%p\n", prog);
+
    if (nvs->translated)
       FREE(nvs->program);
    _mesa_delete_program(ctx, prog);
@@ -86,6 +92,8 @@ nouveauProgramStringNotify(GLcontext *ctx, GLenum target,
 {
    nouveauShader *nvs = (nouveauShader *)prog;
 
+   NVSDBG("target=%s, prog=%p\n", _mesa_lookup_enum_by_nr(target), prog);
+
    if (nvs->translated)
       FREE(nvs->program);
    nvs->translated = 0;
@@ -98,6 +106,8 @@ nouveauIsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
    nouveauShader *nvs = (nouveauShader *)prog;
 
+   NVSDBG("target=%s, prog=%p\n", _mesa_lookup_enum_by_nr(target), prog);
+
    return nvs->translated;
 }
 
@@ -108,6 +118,8 @@ nvsUpdateShader(GLcontext *ctx, nouveauShader *nvs)
    struct gl_program_parameter_list *plist;
    int i;
 
+   NVSDBG("prog=%p\n", nvs);
+
    /* Translate to HW format now if necessary */
    if (!nvs->translated) {
       /* Mesa ASM shader -> nouveauShader */
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_shader.h b/src/mesa/drivers/dri/nouveau/nouveau_shader.h
index b2df3546f6..56ae270764 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_shader.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_shader.h
@@ -4,6 +4,12 @@
 #include "mtypes.h"
 #include "bufferobj.h"
 
+#define NVSDBG(fmt, args...) do {                             \
+	if (NOUVEAU_DEBUG & DEBUG_SHADERS) {                  \
+		fprintf(stderr, "%s: "fmt, __func__, ##args); \
+	}                                                     \
+} while(0)
+
 typedef struct _nvsFunc nvsFunc;
 
 #define NVS_MAX_TEMPS   32
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_shader_0.c b/src/mesa/drivers/dri/nouveau/nouveau_shader_0.c
index 211483dc02..0308a6c397 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_shader_0.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_shader_0.c
@@ -974,6 +974,8 @@ nouveau_shader_pass0(GLcontext *ctx, nouveauShader *nvs)
 	struct pass0_rec *rec;
 	int ret = GL_FALSE;
 
+	NVSDBG("start: nvs=%p\n", nvs);
+
 	rec = CALLOC_STRUCT(pass0_rec);
 	if (!rec)
 		return GL_FALSE;
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_shader_1.c b/src/mesa/drivers/dri/nouveau/nouveau_shader_1.c
index 90c57d3807..78c1401f7d 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_shader_1.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_shader_1.c
@@ -2,11 +2,13 @@
 #include "macros.h"
 #include "enums.h"
 
+#include "nouveau_context.h"
 #include "nouveau_shader.h"
 
 GLboolean
 nouveau_shader_pass1(nvsPtr nvs)
 {
+   NVSDBG("start: nvs=%p\n", nvs);
 
    return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_shader_2.c b/src/mesa/drivers/dri/nouveau/nouveau_shader_2.c
index b043f877e4..130ef35e57 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_shader_2.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_shader_2.c
@@ -209,6 +209,8 @@ nouveau_shader_pass2(nvsPtr nvs)
 	struct pass2_rec *rec;
 	int i;
 
+	NVSDBG("start: nvs=%p\n", nvs);
+
 	rec = calloc(1, sizeof(struct pass2_rec));
 	for (i=0; i<NVS_MAX_TEMPS; i++)
 		rec->temps[i] = -1;
-- 
cgit v1.2.3


From 4185037af2d89c5b245646f5e4a7c6dc946cae43 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Tue, 20 Mar 2007 15:52:57 +1100
Subject: nouveau: fail translate if we use too many params somehow

---
 src/mesa/drivers/dri/nouveau/nouveau_shader.c   |  4 +++-
 src/mesa/drivers/dri/nouveau/nouveau_shader.h   |  7 +++++++
 src/mesa/drivers/dri/nouveau/nouveau_shader_0.c | 25 +++++++++++++++++++++----
 3 files changed, 31 insertions(+), 5 deletions(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/nouveau/nouveau_shader.c b/src/mesa/drivers/dri/nouveau/nouveau_shader.c
index 9cb837ff3b..3d5b6843a1 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_shader.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_shader.c
@@ -96,7 +96,9 @@ nouveauProgramStringNotify(GLcontext *ctx, GLenum target,
 
    if (nvs->translated)
       FREE(nvs->program);
-   nvs->translated = 0;
+
+   nvs->error      = GL_FALSE;
+   nvs->translated = GL_FALSE;
 
    _tnl_program_string(ctx, target, prog);
 }
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_shader.h b/src/mesa/drivers/dri/nouveau/nouveau_shader.h
index 56ae270764..7125a2ae82 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_shader.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_shader.h
@@ -51,6 +51,7 @@ typedef struct _nouveauShader {
    nvsFunc *func;
 
    /* State of the final program */
+   GLboolean error;
    GLboolean translated;
    GLboolean on_hardware;
    unsigned int *program;
@@ -424,6 +425,12 @@ nvsSwizzle(nvsRegister reg, nvsSwzComp x, nvsSwzComp y,
    return reg;
 }
 
+#define nvsProgramError(nvs,fmt,args...) do {                           \
+	fprintf(stderr, "nvsProgramError (%s): "fmt, __func__, ##args); \
+	(nvs)->error = GL_TRUE;                                         \
+	(nvs)->translated = GL_FALSE;                                   \
+} while(0)
+
 extern GLboolean nvsUpdateShader(GLcontext *ctx, nouveauShader *nvs);
 extern void nvsDisasmHWShader(nvsPtr);
 extern void nvsDumpFragmentList(nvsFragmentHeader *f, int lvl);
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_shader_0.c b/src/mesa/drivers/dri/nouveau/nouveau_shader_0.c
index 0308a6c397..7c2e2b9443 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_shader_0.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_shader_0.c
@@ -924,7 +924,7 @@ pass0_rebase_mesa_consts(nouveauShader *nvs)
 	}
 }
 
-static void
+static GLboolean
 pass0_resolve_mesa_consts(nouveauShader *nvs)
 {
 	struct pass0_rec *rec = nvs->pass_rec;
@@ -945,6 +945,11 @@ pass0_resolve_mesa_consts(nouveauShader *nvs)
 	for (i=0; i<plist->NumParameters; i++) {
 		int hw = rec->mesa_const_base + i;
 
+		if (hw > NVS_MAX_CONSTS) {
+			nvsProgramError(nvs, "hw = %d > NVS_MAX_CONSTS!\n", hw);
+			return GL_FALSE;
+		}
+
 		switch (plist->Parameters[i].Type) {
 		case PROGRAM_NAMED_PARAM:
 		case PROGRAM_STATE_VAR:
@@ -958,10 +963,13 @@ pass0_resolve_mesa_consts(nouveauShader *nvs)
 			COPY_4V(nvs->params[hw].val, plist->ParameterValues[i]);
 			break;
 		default:
-			assert(0);
-			break;
+			nvsProgramError(nvs, "hit bad type=%d on param %d\n",
+					plist->Parameters[i].Type, i);
+			return GL_FALSE;
 		}
 	}
+
+	return GL_TRUE;
 }
 
 GLboolean
@@ -976,6 +984,14 @@ nouveau_shader_pass0(GLcontext *ctx, nouveauShader *nvs)
 
 	NVSDBG("start: nvs=%p\n", nvs);
 
+	/* Previously detected an error, and haven't received new program
+	 * string, so fail immediately.
+	 */
+	if (nvs->error) {
+		NVSDBG("failed previous compile attempt, not retrying\n");
+		return GL_FALSE;
+	}
+
 	rec = CALLOC_STRUCT(pass0_rec);
 	if (!rec)
 		return GL_FALSE;
@@ -1020,7 +1036,8 @@ nouveau_shader_pass0(GLcontext *ctx, nouveauShader *nvs)
 
 	ret = pass0_translate_instructions(nvs, 0, 0, nvs->program_tree);
 	if (ret)
-		pass0_resolve_mesa_consts(nvs);
+		ret = pass0_resolve_mesa_consts(nvs);	
+	
 	/*XXX: if (!ret) DESTROY TREE!!! */
 
 	FREE(rec);
-- 
cgit v1.2.3


From ecb1a1c82f48dd78203230f6ea3dee49d7ade17d Mon Sep 17 00:00:00 2001
From: Ben Skeggs <skeggsb@gmail.com>
Date: Tue, 20 Mar 2007 15:59:55 +1100
Subject: nouveau: fix typo

---
 src/mesa/drivers/dri/nouveau/nouveau_sync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/nouveau/nouveau_sync.c b/src/mesa/drivers/dri/nouveau/nouveau_sync.c
index 428b19b46e..30e6696269 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_sync.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_sync.c
@@ -124,7 +124,7 @@ nouveau_notifier_wait_status(nouveau_notifier *notifier, GLuint id,
 	while (time <= timeout) {
 		if (n[NV_NOTIFY_STATE/4] & NV_NOTIFY_STATE_ERROR_CODE_MASK) {
 			MESSAGE("Notifier returned error: 0x%04x\n",
-					n[NV_NOTIFY_STATE] &
+					n[NV_NOTIFY_STATE/4] &
 					NV_NOTIFY_STATE_ERROR_CODE_MASK);
 			return GL_FALSE;
 		}
-- 
cgit v1.2.3