Merge remote branch 'origin/master' into glsl2

This pulls in multiple i965 driver fixes which will help ensure better testing coverage during development, and also gets past the conflicts of the src/mesa/shader -> src/mesa/program move. Conflicts: src/mesa/Makefile, src/mesa/main/shaderapi.c, src/mesa/main/shaderobj.h
author: Eric Anholt <eric@anholt.net> 2010-07-26 17:47:59 -0700
committer: Eric Anholt <eric@anholt.net> 2010-07-26 17:53:27 -0700
commit: afe125e0a18ac3886c45c7e6b02b122fb2d327b5 (patch)
tree: 78621707e71154c0b388b0baacffc26432b7e992 /src/mesa/drivers/dri/r300/compiler
parent: d64343f1ae84979bd154475badf11af8a9bfc2eb (diff)
parent: 5403ca79b225605c79f49866a6497c97da53be3b (diff)
19 files changed, 701 insertions, 156 deletions
diff --git a/src/mesa/drivers/dri/r300/compiler/Makefile b/src/mesa/drivers/dri/r300/compiler/Makefile
index ff3801dc67..3167d49bca 100644
--- a/src/mesa/drivers/dri/r300/compiler/Makefile
+++ b/src/mesa/drivers/dri/r300/compiler/Makefile
@@ -23,6 +23,7 @@ C_SOURCES = \
 		radeon_dataflow_deadcode.c \
 		radeon_dataflow_swizzles.c \
 		radeon_optimize.c \
+		radeon_rename_regs.c \
 		r3xx_fragprog.c \
 		r300_fragprog.c \
 		r300_fragprog_swizzle.c \
diff --git a/src/mesa/drivers/dri/r300/compiler/SConscript b/src/mesa/drivers/dri/r300/compiler/SConscript
index 50d9cdb7f2..c6f47a6f8a 100755
--- a/src/mesa/drivers/dri/r300/compiler/SConscript
+++ b/src/mesa/drivers/dri/r300/compiler/SConscript
@@ -22,6 +22,7 @@ r300compiler = env.ConvenienceLibrary(
         'radeon_pair_schedule.c',
         'radeon_pair_regalloc.c',
         'radeon_optimize.c',
+        'radeon_rename_regs.c',
         'radeon_emulate_branches.c',
         'radeon_emulate_loops.c',
         'radeon_dataflow.c',
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index 38312658d6..a326ee4c4f 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -29,6 +29,7 @@
 #include "radeon_emulate_loops.h"
 #include "radeon_program_alu.h"
 #include "radeon_program_tex.h"
+#include "radeon_rename_regs.h"
 #include "r300_fragprog.h"
 #include "r300_fragprog_swizzle.h"
 #include "r500_fragprog.h"
@@ -97,25 +98,27 @@ static void debug_program_log(struct r300_fragment_program_compiler* c, const ch
 
 void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 {
+	struct emulate_loop_state loop_state;
+
 	rewrite_depth_out(c);
 
+	/* This transformation needs to be done before any of the IF
+	 * instructions are modified. */
+	radeonTransformKILP(&c->Base);
+
 	debug_program_log(c, "before compilation");
 
-	/* XXX Ideally this should be done only for r3xx, but since
-	 * we don't have branching support for r5xx, we use the emulation
-	 * on all chipsets. */
-	
-	if(c->Base.is_r500){
-		rc_emulate_loops(&c->Base, R500_PFS_MAX_INST);
+	if (c->Base.is_r500){
+		r500_transform_unroll_loops(&c->Base, &loop_state);	
+		debug_program_log(c, "after r500 transform loops");
 	}
 	else{
-		rc_emulate_loops(&c->Base, R300_PFS_MAX_ALU_INST);
+		rc_transform_unroll_loops(&c->Base, &loop_state);
+		debug_program_log(c, "after transform loops");
+		
+		rc_emulate_branches(&c->Base);
+		debug_program_log(c, "after emulate branches");
 	}
-	debug_program_log(c, "after emulate loops");
-	
-	rc_emulate_branches(&c->Base);
-
-	debug_program_log(c, "after emulate branches");
 
 	if (c->Base.is_r500) {
 		struct radeon_program_transformation transformations[] = {
@@ -162,6 +165,11 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 
 	debug_program_log(c, "after deadcode");
 
+	if(!c->Base.is_r500){
+		rc_emulate_loops(&loop_state, R300_PFS_MAX_ALU_INST);
+		debug_program_log(c, "after emulate loops");
+	}
+
 	rc_optimize(&c->Base);
 
 	debug_program_log(c, "after dataflow optimize");
@@ -172,6 +180,16 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 
 	debug_program_log(c, "after dataflow passes");
 
+	if(!c->Base.is_r500) {
+		/* This pass makes it easier for the scheduler to group TEX
+		 * instructions and reduces the chances of creating too
+		 * many texture indirections.*/
+		rc_rename_regs(&c->Base);
+		if (c->Base.Error)
+			return;
+		debug_program_log(c, "after register rename");
+	}
+
 	rc_pair_translate(c);
 	if (c->Base.Error)
 		return;
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index 507b2e532f..d347b4df9c 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -30,6 +30,7 @@
 #include "radeon_program_alu.h"
 #include "radeon_swizzle.h"
 #include "radeon_emulate_branches.h"
+#include "radeon_emulate_loops.h"
 
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
@@ -145,7 +146,8 @@ static unsigned long t_src(struct r300_vertex_program_code *vp,
 			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
 			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
 			       t_src_class(src->File),
-			       src->Negate) | (src->RelAddr << 4);
+			       src->Negate) |
+	       (src->RelAddr << 4) | (src->Abs << 3);
 }
 
 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
@@ -161,7 +163,7 @@ static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			       t_src_class(src->File),
 			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
-	    (src->RelAddr << 4);
+	       (src->RelAddr << 4) | (src->Abs << 3);
 }
 
 static int valid_dst(struct r300_vertex_program_code *vp,
@@ -348,7 +350,8 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 		if (!valid_dst(compiler->code, &vpi->DstReg))
 			continue;
 
-		if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) {
+		if (compiler->code->length >= R500_VS_MAX_ALU_DWORDS ||
+		    (compiler->code->length >= R300_VS_MAX_ALU_DWORDS && !compiler->Base.is_r500)) {
 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
 			return;
 		}
@@ -404,7 +407,7 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 {
 	struct rc_instruction *inst;
 	unsigned int num_orig_temps = 0;
-	char hwtemps[VSF_MAX_FRAGMENT_TEMPS];
+	char hwtemps[R300_VS_MAX_TEMPS];
 	struct temporary_allocation * ta;
 	unsigned int i, j;
 
@@ -463,11 +466,11 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 				unsigned int orig = inst->U.I.DstReg.Index;
 
 				if (!ta[orig].Allocated) {
-					for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {
+					for(j = 0; j < R300_VS_MAX_TEMPS; ++j) {
 						if (!hwtemps[j])
 							break;
 					}
-					if (j >= VSF_MAX_FRAGMENT_TEMPS) {
+					if (j >= R300_VS_MAX_TEMPS) {
 						fprintf(stderr, "Out of hw temporaries\n");
 					} else {
 						ta[orig].Allocated = 1;
@@ -485,6 +488,44 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 	}
 }
 
+/**
+ * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
+ * and the Saturate opcode modifier. Only Absolute is currently transformed.
+ */
+static int transform_nonnative_modifiers(
+	struct radeon_compiler *c,
+	struct rc_instruction *inst,
+	void* unused)
+{
+	const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
+	unsigned i;
+
+	/* Transform ABS(a) to MAX(a, -a). */
+	for (i = 0; i < opcode->NumSrcRegs; i++) {
+		if (inst->U.I.SrcReg[i].Abs) {
+			struct rc_instruction *new_inst;
+			unsigned temp;
+
+			inst->U.I.SrcReg[i].Abs = 0;
+
+			temp = rc_find_free_temporary(c);
+
+			new_inst = rc_insert_new_instruction(c, inst->Prev);
+			new_inst->U.I.Opcode = RC_OPCODE_MAX;
+			new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			new_inst->U.I.DstReg.Index = temp;
+			new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
+			new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
+			new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
+
+			memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
+			inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
+			inst->U.I.SrcReg[i].Index = temp;
+			inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
+		}
+	}
+	return 1;
+}
 
 /**
  * Vertex engine cannot read two inputs or two constants at the same time.
@@ -591,6 +632,8 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
 
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 {
+	struct emulate_loop_state loop_state;
+	
 	compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
 
 	addArtificialOutputs(compiler);
@@ -600,19 +643,48 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 	/* XXX Ideally this should be done only for r3xx, but since
 	 * we don't have branching support for r5xx, we use the emulation
 	 * on all chipsets. */
+	rc_transform_unroll_loops(&compiler->Base, &loop_state);
+	
+	debug_program_log(compiler, "after transform loops");
+	
+	if (compiler->Base.is_r500){
+		rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
+	} else {
+		rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
+	}
+	debug_program_log(compiler, "after emulate loops");
+
 	rc_emulate_branches(&compiler->Base);
 
 	debug_program_log(compiler, "after emulate branches");
 
-	{
+	if (compiler->Base.is_r500) {
 		struct radeon_program_transformation transformations[] = {
 			{ &r300_transform_vertex_alu, 0 },
 			{ &r300_transform_trig_scale_vertex, 0 }
 		};
 		radeonLocalTransform(&compiler->Base, 2, transformations);
-	}
 
-	debug_program_log(compiler, "after native rewrite");
+		debug_program_log(compiler, "after native rewrite");
+	} else {
+		struct radeon_program_transformation transformations[] = {
+			{ &r300_transform_vertex_alu, 0 },
+			{ &radeonTransformTrigSimple, 0 }
+		};
+		radeonLocalTransform(&compiler->Base, 2, transformations);
+
+		debug_program_log(compiler, "after native rewrite");
+
+		/* Note: This pass has to be done separately from ALU rewrite,
+		 * because it needs to check every instruction.
+		 */
+		struct radeon_program_transformation transformations2[] = {
+			{ &transform_nonnative_modifiers, 0 },
+		};
+		radeonLocalTransform(&compiler->Base, 1, transformations2);
+
+		debug_program_log(compiler, "after emulate modifiers");
+	}
 
 	{
 		/* Note: This pass has to be done seperately from ALU rewrite,
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
index 632f0bcf4f..e6b5522c5b 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
@@ -30,6 +30,7 @@
 #include <stdio.h>
 
 #include "../r300_reg.h"
+#include "radeon_emulate_loops.h"
 
 /**
  * Rewrite IF instructions to use the ALU result special register.
@@ -59,6 +60,31 @@ int r500_transform_IF(
 	return 1;
 }
 
+/**
+ * Rewrite loops to make them easier to emit.  This is not a local
+ * transformation, because it modifies and reorders an entire block of code.
+ */
+void r500_transform_unroll_loops(struct radeon_compiler * c,
+						struct emulate_loop_state *s)
+{
+	int i;
+	
+	rc_transform_unroll_loops(c, s);
+	
+	for( i = s->LoopCount - 1; i >= 0; i-- ){
+		struct rc_instruction * inst_continue;
+		if(!s->Loops[i].EndLoop){
+			continue;
+		}
+		/* Insert a continue instruction at the end of the loop.  This
+		 * is required in order to emit loops correctly. */
+		inst_continue = rc_insert_new_instruction(c,
+						s->Loops[i].EndIf->Prev);
+		inst_continue->U.I.Opcode = RC_OPCODE_CONTINUE;
+	}
+
+}
+
 static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
 	unsigned int relevant;
@@ -252,7 +278,7 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
   struct r500_fragment_program_code *code = &c->code.r500;
   fprintf(stderr, "R500 Fragment Program:\n--------\n");
 
-  int n;
+  int n, i;
   uint32_t inst;
   uint32_t inst0;
   char *str = NULL;
@@ -275,8 +301,8 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
 	    to_mask((inst >> 15) & 0xf));
 
     switch(inst0 & 0x3) {
-    case 0:
-    case 1:
+    case R500_INST_TYPE_ALU:
+    case R500_INST_TYPE_OUT:
       fprintf(stderr,"\t1:RGB_ADDR   0x%08x:", code->inst[n].inst1);
       inst = code->inst[n].inst1;
 
@@ -319,9 +345,87 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
 	      (inst >> 23) & 0x3,
 	      (inst >> 25) & 0x3, toswiz((inst >> 27) & 0x7), (inst >> 30) & 0x3);
       break;
-    case 2:
+    case R500_INST_TYPE_FC:
+      fprintf(stderr, "\t2:FC_INST    0x%08x:", code->inst[n].inst2);
+      inst = code->inst[n].inst2;
+      /* JUMP_FUNC JUMP_ANY*/
+      fprintf(stderr, "0x%02x %1x ", inst >> 8 & 0xff,
+          (inst & R500_FC_JUMP_ANY) >> 5);
+      
+      /* OP */
+      switch(inst & 0x7){
+      case R500_FC_OP_JUMP:
+      	fprintf(stderr, "JUMP");
+        break;
+      case R500_FC_OP_LOOP:
+        fprintf(stderr, "LOOP");
+        break;
+      case R500_FC_OP_ENDLOOP:
+        fprintf(stderr, "ENDLOOP");
+        break;
+      case R500_FC_OP_REP:
+        fprintf(stderr, "REP");
+        break;
+      case R500_FC_OP_ENDREP:
+        fprintf(stderr, "ENDREP");
+        break;
+      case R500_FC_OP_BREAKLOOP:
+        fprintf(stderr, "BREAKLOOP");
+        break;
+      case R500_FC_OP_BREAKREP:
+        fprintf(stderr, "BREAKREP");
+	break;
+      case R500_FC_OP_CONTINUE:
+        fprintf(stderr, "CONTINUE");
+        break;
+      }
+      fprintf(stderr," "); 
+      /* A_OP */
+      switch(inst & (0x3 << 6)){
+      case R500_FC_A_OP_NONE:
+        fprintf(stderr, "NONE");
+        break;
+      case R500_FC_A_OP_POP:
+	fprintf(stderr, "POP");
+        break;
+      case R500_FC_A_OP_PUSH:
+        fprintf(stderr, "PUSH");
+        break;
+      }
+      /* B_OP0 B_OP1 */
+      for(i=0; i<2; i++){
+        fprintf(stderr, " ");
+        switch(inst & (0x3 << (24 + (i * 2)))){
+        /* R500_FC_B_OP0_NONE 
+	 * R500_FC_B_OP1_NONE */
+	case 0:
+          fprintf(stderr, "NONE");
+          break;
+        case R500_FC_B_OP0_DECR:
+        case R500_FC_B_OP1_DECR:
+          fprintf(stderr, "DECR");
+          break;
+        case R500_FC_B_OP0_INCR:
+        case R500_FC_B_OP1_INCR:
+          fprintf(stderr, "INCR");
+          break;
+        }
+      }
+      /*POP_CNT B_ELSE */
+      fprintf(stderr, " %d %1x", (inst >> 16) & 0x1f, (inst & R500_FC_B_ELSE) >> 4);
+      inst = code->inst[n].inst3;
+      /* JUMP_ADDR */
+      fprintf(stderr, " %d", inst >> 16);
+      
+      if(code->inst[n].inst2 & R500_FC_IGNORE_UNCOVERED){
+        fprintf(stderr, " IGN_UNC");
+      }
+      inst = code->inst[n].inst3;
+      fprintf(stderr, "\n\t3:FC_ADDR    0x%08x:", inst);
+      fprintf(stderr, "BOOL: 0x%02x, INT: 0x%02x, JUMP_ADDR: %d, JMP_GLBL: %1x\n",
+      inst & 0x1f, (inst >> 8) & 0x1f, (inst >> 16) & 0x1ff, inst >> 31); 
       break;
-    case 3:
+    case R500_INST_TYPE_TEX:
       inst = code->inst[n].inst1;
       fprintf(stderr,"\t1:TEX_INST:  0x%08x: id: %d op:%s, %s, %s %s\n", inst, (inst >> 16) & 0xf,
 	      to_texop((inst >> 22) & 0x7), (inst & (1<<25)) ? "ACQ" : "",
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
index 4efbae7ba6..0d005a794f 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
@@ -36,6 +36,8 @@
 #include "radeon_compiler.h"
 #include "radeon_swizzle.h"
 
+struct emulate_loop_state;
+
 extern void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
 extern void r500FragmentProgramDump(struct rX00_fragment_program_code *c);
@@ -47,4 +49,6 @@ extern int r500_transform_IF(
 	struct rc_instruction * inst,
 	void* data);
 
+void r500_transform_unroll_loops(struct radeon_compiler * c,
+						struct emulate_loop_state * s);
 #endif
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index fb2d8b5a9c..0bd8f0a239 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -45,6 +45,8 @@
 
 #include "radeon_program_pair.h"
 
+#define MAX_BRANCH_DEPTH_FULL 32
+#define MAX_BRANCH_DEPTH_PARTIAL 4
 
 #define PROG_CODE \
 	struct r500_fragment_program_code *code = &c->code->code.r500
@@ -61,6 +63,10 @@ struct branch_info {
 	int Endif;
 };
 
+struct loop_info {
+	int LoopStart;
+};
+
 struct emit_state {
 	struct radeon_compiler * C;
 	struct r500_fragment_program_code * Code;
@@ -69,7 +75,12 @@ struct emit_state {
 	unsigned int CurrentBranchDepth;
 	unsigned int BranchesReserved;
 
+	struct loop_info * Loops;
+	unsigned int CurrentLoopDepth;
+	unsigned int LoopsReserved;
+
 	unsigned int MaxBranchDepth;
+
 };
 
 static unsigned int translate_rgb_op(struct r300_fragment_program_compiler *c, rc_opcode opcode)
@@ -359,16 +370,49 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 
 	s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT;
 
-	if (inst->U.I.Opcode == RC_OPCODE_IF) {
-		if (s->CurrentBranchDepth >= 32) {
+	switch(inst->U.I.Opcode){
+	struct branch_info * branch;
+	struct loop_info * loop;
+	case RC_OPCODE_BGNLOOP:
+		memory_pool_array_reserve(&s->C->Pool, struct loop_info,
+			s->Loops, s->CurrentLoopDepth, s->LoopsReserved, 1);
+
+		loop = &s->Loops[s->CurrentLoopDepth++];
+		
+		/* We don't emit an instruction for BGNLOOP, so we need to
+		 * decrement the instruction counter, but first we need to
+		 * set LoopStart to the current value of inst_end, which
+		 * will end up being the first real instruction in the loop.*/
+		loop->LoopStart = s->Code->inst_end--;
+		break;
+	
+	case RC_OPCODE_BRK:
+		/* Don't emit an instruction for BRK */
+		s->Code->inst_end--;
+		break;
+
+	case RC_OPCODE_CONTINUE:
+		loop = &s->Loops[s->CurrentLoopDepth - 1];
+		s->Code->inst[newip].inst2 = R500_FC_OP_JUMP |
+			R500_FC_JUMP_FUNC(0xff);
+		s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->LoopStart);
+		break;
+
+	case RC_OPCODE_ENDLOOP:
+		/* Don't emit an instruction for ENDLOOP */
+		s->Code->inst_end--;
+		s->CurrentLoopDepth--;
+		break;
+
+	case RC_OPCODE_IF:
+		if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) {
 			rc_error(s->C, "Branch depth exceeds hardware limit");
 			return;
 		}
-
 		memory_pool_array_reserve(&s->C->Pool, struct branch_info,
 				s->Branches, s->CurrentBranchDepth, s->BranchesReserved, 1);
 
-		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth++];
+		branch = &s->Branches[s->CurrentBranchDepth++];
 		branch->If = newip;
 		branch->Else = -1;
 		branch->Endif = -1;
@@ -377,29 +421,50 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			s->MaxBranchDepth = s->CurrentBranchDepth;
 
 		/* actual instruction is filled in at ENDIF time */
-	} else if (inst->U.I.Opcode == RC_OPCODE_ELSE) {
+		break;
+	
+	case RC_OPCODE_ELSE:
 		if (!s->CurrentBranchDepth) {
 			rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__);
 			return;
 		}
 
-		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1];
+		branch = &s->Branches[s->CurrentBranchDepth - 1];
 		branch->Else = newip;
 
 		/* actual instruction is filled in at ENDIF time */
-	} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) {
+		break;
+
+	case RC_OPCODE_ENDIF:
 		if (!s->CurrentBranchDepth) {
 			rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__);
 			return;
 		}
 
-		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1];
-		branch->Endif = newip;
-
+		branch = &s->Branches[s->CurrentBranchDepth - 1];
+		
+		if(inst->Prev->U.I.Opcode == RC_OPCODE_BRK){
+			branch->Endif = --s->Code->inst_end;
+			s->Code->inst[branch->Endif].inst2 |=
+				R500_FC_B_OP0_DECR;
+		}
+		else{
+			branch->Endif = newip;
+		
+			s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
+				| R500_FC_A_OP_NONE /* no address stack */
+				| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
+				| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
+				| R500_FC_B_OP1_NONE /* no branch counter if stay */
+				| R500_FC_B_POP_CNT(1)
+			;
+			s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
+		}
 		s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP
 			| R500_FC_A_OP_NONE /* no address stack */
 			| R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */
 			| R500_FC_B_OP0_INCR /* increment branch counter if stay */
+			| R500_FC_IGNORE_UNCOVERED
 		;
 
 		if (branch->Else >= 0) {
@@ -421,17 +486,10 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			s->Code->inst[branch->If].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
 		}
 
-		s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
-			| R500_FC_A_OP_NONE /* no address stack */
-			| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
-			| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
-			| R500_FC_B_OP1_NONE /* no branch counter if stay */
-			| R500_FC_B_POP_CNT(1)
-		;
-		s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
 
 		s->CurrentBranchDepth--;
-	} else {
+		break;
+	default:
 		rc_error(s->C, "%s: unknown opcode %s\n", __FUNCTION__, rc_get_opcode_info(inst->U.I.Opcode)->Name);
 	}
 }
@@ -486,6 +544,10 @@ void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi
 		code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT;
 	}
 
+	/* Use FULL flow control mode if branches are nested deep enough.
+	 * We do not need to enable FULL flow control mode for loops, because
+	 * we aren't using the hardware loop instructions.
+	 */
 	if (s.MaxBranchDepth >= 4) {
 		if (code->max_temp_idx < 1)
 			code->max_temp_idx = 1;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index 1979e7e4e4..d03689763b 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -235,8 +235,11 @@ struct rX00_fragment_program_code {
 };
 
 
-#define VSF_MAX_FRAGMENT_LENGTH (255*4)
-#define VSF_MAX_FRAGMENT_TEMPS (14)
+#define R300_VS_MAX_ALU		256
+#define R300_VS_MAX_ALU_DWORDS  (R300_VS_MAX_ALU * 4)
+#define R500_VS_MAX_ALU	        1024
+#define R500_VS_MAX_ALU_DWORDS  (R500_VS_MAX_ALU * 4)
+#define R300_VS_MAX_TEMPS	32
 
 #define VSF_MAX_INPUTS 32
 #define VSF_MAX_OUTPUTS 32
@@ -244,8 +247,8 @@ struct rX00_fragment_program_code {
 struct r300_vertex_program_code {
 	int length;
 	union {
-		uint32_t d[VSF_MAX_FRAGMENT_LENGTH];
-		float f[VSF_MAX_FRAGMENT_LENGTH];
+		uint32_t d[R500_VS_MAX_ALU_DWORDS];
+		float f[R500_VS_MAX_ALU_DWORDS];
 	} body;
 
 	int pos_end;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
index e3c2c83c0c..fbb4235c22 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
@@ -202,32 +202,65 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f
 	    inst = inst->Prev) {
 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-		if (opcode->IsFlowControl) {
-			if (opcode->Opcode == RC_OPCODE_ENDIF) {
-				push_branch(&s);
-			} else {
-				if (s.BranchStackSize) {
-					struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1];
-
-					if (opcode->Opcode == RC_OPCODE_IF) {
-						or_updatemasks(&s.R,
-								&s.R,
-								branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif);
-
-						s.BranchStackSize--;
-					} else if (opcode->Opcode == RC_OPCODE_ELSE) {
-						if (branch->HaveElse) {
-							rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__);
-						} else {
-							memcpy(&branch->StoreElse, &s.R, sizeof(s.R));
-							memcpy(&s.R, &branch->StoreEndif, sizeof(s.R));
-							branch->HaveElse = 1;
-						}
+		switch(opcode->Opcode){
+		/* Mark all sources in the loop body as used before doing
+		 * normal deadcode analysis.  This is probably not optimal.
+		 */
+		case RC_OPCODE_ENDLOOP:
+		{
+			int endloops = 1;
+			struct rc_instruction *ptr;
+			for(ptr = inst->Prev; endloops > 0; ptr = ptr->Prev){
+				opcode = rc_get_opcode_info(ptr->U.I.Opcode);
+				if(ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
+					endloops--;
+					continue;
+				}
+				if(ptr->U.I.Opcode == RC_OPCODE_ENDLOOP){
+					endloops++;
+					continue;
+				}
+				if(opcode->HasDstReg){
+					int src = 0;
+					unsigned int srcmasks[3];
+					rc_compute_sources_for_writemask(ptr,
+						ptr->U.I.DstReg.WriteMask, srcmasks);
+					for(src=0; src < opcode->NumSrcRegs; src++){
+						mark_used(&s,
+							ptr->U.I.SrcReg[src].File,
+							ptr->U.I.SrcReg[src].Index,
+							srcmasks[src]);
+					}
+				}
+			}
+			break;
+		}
+		case RC_OPCODE_CONTINUE:
+		case RC_OPCODE_BRK:
+		case RC_OPCODE_BGNLOOP:
+			break;
+		case RC_OPCODE_ENDIF:
+			push_branch(&s);
+			break;
+		default:
+			if (opcode->IsFlowControl && s.BranchStackSize) {
+				struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1];
+				if (opcode->Opcode == RC_OPCODE_IF) {
+					or_updatemasks(&s.R,
+							&s.R,
+							branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif);
+
+					s.BranchStackSize--;
+				} else if (opcode->Opcode == RC_OPCODE_ELSE) {
+					if (branch->HaveElse) {
+						rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__);
 					} else {
-						rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name);
+						memcpy(&branch->StoreElse, &s.R, sizeof(s.R));
+						memcpy(&s.R, &branch->StoreEndif, sizeof(s.R));
+						branch->HaveElse = 1;
 					}
 				} else {
-					rc_error(c, "%s: Unexpected control flow instruction\n", __FUNCTION__);
+					rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name);
 				}
 			}
 		}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
index 4c5d29f421..131e9e7436 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
@@ -38,22 +38,6 @@
 
 #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
 
-struct emulate_loop_state {
-	struct radeon_compiler * C;
-	struct loop_info * Loops;
-	unsigned int LoopCount;
-	unsigned int LoopReserved;
-};
-
-struct loop_info {
-	struct rc_instruction * BeginLoop;
-	struct rc_instruction * Cond;
-	struct rc_instruction * If;
-	struct rc_instruction * Brk;
-	struct rc_instruction * EndIf;
-	struct rc_instruction * EndLoop;
-};
-
 struct const_value {
 	
 	struct radeon_compiler * C;
@@ -94,22 +78,13 @@ static int src_reg_is_immediate(struct rc_src_register * src,
 	c->Program.Constants.Constants[src->Index].Type==RC_CONSTANT_IMMEDIATE;
 }
 
-static unsigned int loop_count_instructions(struct loop_info * loop)
+static unsigned int loop_calc_iterations(struct emulate_loop_state *s, 
+			struct loop_info * loop, unsigned int max_instructions)
 {
-	unsigned int count = 0;
-	struct rc_instruction * inst = loop->BeginLoop->Next;
-	while(inst != loop->EndLoop){
-		count++;
-		inst = inst->Next;
-	}
-	return count;
-}
-
-static unsigned int loop_calc_iterations(struct loop_info * loop,
-		unsigned int loop_count, unsigned int max_instructions)
-{
-	unsigned int icount = loop_count_instructions(loop);
-	return max_instructions / (loop_count * icount);
+	unsigned int total_i = rc_recompute_ips(s->C);
+	unsigned int loop_i = (loop->EndLoop->IP - loop->BeginLoop->IP) - 1;
+	/* +1 because the program already has one iteration of the loop. */
+	return 1 + ((max_instructions - total_i) / (s->LoopCount * loop_i));
 }
 
 static void loop_unroll(struct emulate_loop_state * s,
@@ -214,8 +189,7 @@ static void get_incr_amount(void * data, struct rc_instruction * inst,
 }
 
 static int transform_const_loop(struct emulate_loop_state * s,
-						struct loop_info * loop,
-						struct rc_instruction * cond)
+						struct loop_info * loop)
 {
 	int end_loops = 1;
 	int iterations;
@@ -228,13 +202,13 @@ static int transform_const_loop(struct emulate_loop_state * s,
 
 	/* Find the counter and the upper limit */
 	
-	if(src_reg_is_immediate(&cond->U.I.SrcReg[0], s->C)){
-		limit = &cond->U.I.SrcReg[0];
-		counter = &cond->U.I.SrcReg[1];
+	if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], s->C)){
+		limit = &loop->Cond->U.I.SrcReg[0];
+		counter = &loop->Cond->U.I.SrcReg[1];
 	}
-	else if(src_reg_is_immediate(&cond->U.I.SrcReg[1], s->C)){
-		limit = &cond->U.I.SrcReg[1];
-		counter = &cond->U.I.SrcReg[0];
+	else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], s->C)){
+		limit = &loop->Cond->U.I.SrcReg[1];
+		counter = &loop->Cond->U.I.SrcReg[0];
 	}
 	else{
 		DBG("No constant limit.\n");
@@ -293,8 +267,22 @@ static int transform_const_loop(struct emulate_loop_state * s,
 	 * simple, since we only support increment and decrement loops.
 	 */
 	limit_value = get_constant_value(s->C, limit, 0);
-	iterations = (int) ((limit_value - counter_value.Value) /
+	DBG("Limit is %f.\n", limit_value);
+	switch(loop->Cond->U.I.Opcode){
+	case RC_OPCODE_SGT:
+	case RC_OPCODE_SLT:
+		iterations = (int) ceilf((limit_value - counter_value.Value) /
 							count_inst.Amount);
+		break;
+
+	case RC_OPCODE_SLE:
+	case RC_OPCODE_SGE:
+		iterations = (int) floorf((limit_value - counter_value.Value) /
+							count_inst.Amount) + 1;
+		break;
+	default:
+		return 0;
+	}
 
 	DBG("Loop will have %d iterations.\n", iterations);
 	
@@ -414,7 +402,7 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 	}
 	
 	/* Check if the number of loops is known at compile time. */
-	if(transform_const_loop(s, loop, ptr)){
+	if(transform_const_loop(s, loop)){
 		return loop->BeginLoop->Next;
 	}
 
@@ -425,9 +413,14 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 	return loop->EndLoop;
 }
 
-static void rc_transform_loops(struct emulate_loop_state * s)
+void rc_transform_unroll_loops(struct radeon_compiler *c,
+					struct emulate_loop_state * s)
 {
-	struct rc_instruction * ptr = s->C->Program.Instructions.Next;
+	struct rc_instruction * ptr;
+	
+	memset(s, 0, sizeof(struct emulate_loop_state));
+	s->C = c;
+	ptr = s->C->Program.Instructions.Next;
 	while(ptr != &s->C->Program.Instructions) {
 		if(ptr->Type == RC_INSTRUCTION_NORMAL &&
 					ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
@@ -440,7 +433,7 @@ static void rc_transform_loops(struct emulate_loop_state * s)
 	}
 }
 
-static void rc_unroll_loops(struct emulate_loop_state *s,
+void rc_emulate_loops(struct emulate_loop_state *s,
 						unsigned int max_instructions)
 {
 	int i;
@@ -451,24 +444,8 @@ static void rc_unroll_loops(struct emulate_loop_state *s,
 		if(!s->Loops[i].EndLoop){
 			continue;
 		}
-		unsigned int iterations = loop_calc_iterations(&s->Loops[i],
-						s->LoopCount, max_instructions);
+		unsigned int iterations = loop_calc_iterations(s, &s->Loops[i],
+							max_instructions);
 		loop_unroll(s, &s->Loops[i], iterations);
 	}
 }
-
-void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions)
-{
-	struct emulate_loop_state s;
-
-	memset(&s, 0, sizeof(struct emulate_loop_state));
-	s.C = c;
-
-	/* We may need to move these two operations to r3xx_(vert|frag)prog.c
-	 * and run the optimization passes between them in order to increase
-	 * the number of unrolls we can do for each loop.
-	 */
-	rc_transform_loops(&s);
-	
-	rc_unroll_loops(&s, max_instructions);
-}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
index ddcf1c0fab..7748813c4e 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
@@ -7,6 +7,26 @@
 
 struct radeon_compiler;
 
-void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions);
+struct loop_info { /* Instructions recorded for one loop. */
+	struct rc_instruction * BeginLoop; /* presumably the BGNLOOP -- confirm */
+	struct rc_instruction * Cond; /* NOTE(review): presumably the exit test */
+	struct rc_instruction * If; /* presumably the IF guarding the break */
+	struct rc_instruction * Brk; /* presumably the BRK instruction */
+	struct rc_instruction * EndIf; /* presumably the matching ENDIF */
+	struct rc_instruction * EndLoop; /* rc_emulate_loops skips loops with this unset */
+};
+
+struct emulate_loop_state { /* State shared by the two loop passes below. */
+	struct radeon_compiler * C; /* set by rc_transform_unroll_loops() */
+	struct loop_info * Loops; /* array, indexed by rc_emulate_loops() */
+	unsigned int LoopCount; /* presumably the number of entries in Loops */
+	unsigned int LoopReserved; /* presumably the allocated capacity -- confirm */
+};
+
+void rc_transform_unroll_loops(struct radeon_compiler *c,
+					struct emulate_loop_state * s);
+
+void rc_emulate_loops(struct emulate_loop_state *s,
+					unsigned int max_instructions);
 
 #endif /* RADEON_EMULATE_LOOPS_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index 1dc16855dc..04f234f11d 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -386,6 +386,12 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.NumSrcRegs = 0,
 	},
 	{
+		.Opcode = RC_OPCODE_CONTINUE,
+		.Name = "CONTINUE",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0
+	},
+	{
 		.Opcode = RC_OPCODE_REPL_ALPHA,
 		.Name = "REPL_ALPHA",
 		.HasDstReg = 1
@@ -393,6 +399,10 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 	{
 		.Opcode = RC_OPCODE_BEGIN_TEX,
 		.Name = "BEGIN_TEX"
+	},
+	{
+		.Opcode = RC_OPCODE_KILP,
+		.Name = "KILP",
 	}
 };
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
index 91c82ac089..8b9fa07dde 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
@@ -187,6 +187,8 @@ typedef enum {
 
 	RC_OPCODE_ENDLOOP,
 
+	RC_OPCODE_CONTINUE,
+
 	/** special instruction, used in R300-R500 fragment program pair instructions
 	 * indicates that the result of the alpha operation shall be replicated
 	 * across all other channels */
@@ -197,6 +199,9 @@ typedef enum {
 	 * can run simultaneously. */
 	RC_OPCODE_BEGIN_TEX,
 
+	/** Stop execution of the shader (GLSL discard) */
+	RC_OPCODE_KILP,
+
 	MAX_RC_OPCODE
 } rc_opcode;
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
index 21d7210888..eca0651536 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
@@ -75,6 +75,15 @@ struct peephole_state {
 	int BranchDepth;
 };
 
+/**
+ * This is a callback function that is meant to be passed to
+ * rc_for_all_reads_mask.  This function will be called once for each source
+ * register in inst.
+ * @param inst The instruction that the source register belongs to.
+ * @param file The register file of the source register.
+ * @param index The index of the source register.
+ * @param mask The components of the source register that are being read from.
+ */
 static void peephole_scan_read(void * data, struct rc_instruction * inst,
 		rc_register_file file, unsigned int index, unsigned int mask)
 {
@@ -153,6 +162,11 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo
 	for(struct rc_instruction * inst = inst_mov->Next;
 	    inst != &c->Program.Instructions;
 	    inst = inst->Next) {
+		/* XXX In the future we might be able to make the optimizer
+		 * smart enough to handle loops. */
+		if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP){
+			return;
+		}
 		rc_for_all_reads_mask(inst, peephole_scan_read, &s);
 		rc_for_all_writes_mask(inst, peephole_scan_write, &s);
 		if (s.Conflict)
@@ -161,7 +175,8 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo
 		if (s.BranchDepth >= 0) {
 			if (inst->U.I.Opcode == RC_OPCODE_IF) {
 				s.BranchDepth++;
-			} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) {
+			} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
+				|| inst->U.I.Opcode == RC_OPCODE_ELSE) {
 				s.BranchDepth--;
 				if (s.BranchDepth < 0) {
 					s.DefinedMask &= ~s.MovMask;
@@ -208,7 +223,8 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo
 		if (s.BranchDepth >= 0) {
 			if (inst->U.I.Opcode == RC_OPCODE_IF) {
 				s.BranchDepth++;
-			} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) {
+			} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
+				|| inst->U.I.Opcode == RC_OPCODE_ELSE) {
 				s.BranchDepth--;
 				if (s.BranchDepth < 0)
 					break; /* no more readers after this point */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
index a279549ff8..fc540496c4 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
@@ -141,12 +141,28 @@ static void add_inst_to_list(struct schedule_instruction ** list, struct schedul
 	*list = inst;
 }
 
+/* Append inst after the last element of *list; *list may be empty (0). */
+static void add_inst_to_list_end(struct schedule_instruction ** list,
+					struct schedule_instruction * inst)
+{
+	struct schedule_instruction ** tail = list;
+	while(*tail){
+		tail = &(*tail)->NextReady;
+	}
+	/* Terminate the list here instead of trusting inst->NextReady to
+	 * already be 0; a stale link would splice foreign nodes in. */
+	inst->NextReady = 0;
+	*tail = inst;
+}
+
 static void instruction_ready(struct schedule_state * s, struct schedule_instruction * sinst)
 {
 	DBG("%i is now ready\n", sinst->Instruction->IP);
 
+	/* Adding Ready TEX instructions to the end of the "Ready List" helps
+	 * us emit TEX instructions in blocks without losing our place. */
 	if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL)
-		add_inst_to_list(&s->ReadyTEX, sinst);
+		add_inst_to_list_end(&s->ReadyTEX, sinst);
 	else if (sinst->Instruction->U.P.Alpha.Opcode == RC_OPCODE_NOP)
 		add_inst_to_list(&s->ReadyRGB, sinst);
 	else if (sinst->Instruction->U.P.RGB.Opcode == RC_OPCODE_NOP)
@@ -163,11 +179,14 @@ static void decrease_dependencies(struct schedule_state * s, struct schedule_ins
 		instruction_ready(s, sinst);
 }
 
-static void commit_instruction(struct schedule_state * s, struct schedule_instruction * sinst)
-{
-	DBG("%i: commit\n", sinst->Instruction->IP);
-
-	for(unsigned int i = 0; i < sinst->NumReadValues; ++i) {
+/**
+ * This function decreases the dependencies of the next instruction that
+ * wants to write to each of sinst's read values.
+ */
+static void commit_update_reads(struct schedule_state * s,
+					struct schedule_instruction * sinst){
+	unsigned int i;
+	for(i = 0; i < sinst->NumReadValues; ++i) {
 		struct reg_value * v = sinst->ReadValues[i];
 		assert(v->NumReaders > 0);
 		v->NumReaders--;
@@ -176,8 +195,12 @@ static void commit_instruction(struct schedule_state * s, struct schedule_instru
 				decrease_dependencies(s, v->Next->Writer);
 		}
 	}
+}
 
-	for(unsigned int i = 0; i < sinst->NumWriteValues; ++i) {
+static void commit_update_writes(struct schedule_state * s,
+					struct schedule_instruction * sinst){
+	unsigned int i;
+	for(i = 0; i < sinst->NumWriteValues; ++i) {
 		struct reg_value * v = sinst->WriteValues[i];
 		if (v->NumReaders) {
 			for(struct reg_value_reader * r = v->Readers; r; r = r->Next) {
@@ -196,6 +219,15 @@ static void commit_instruction(struct schedule_state * s, struct schedule_instru
 	}
 }
 
+static void commit_alu_instruction(struct schedule_state * s, struct schedule_instruction * sinst)
+{
+	DBG("%i: commit\n", sinst->Instruction->IP);
+	/* Commit an ALU instruction in one step: first its reads ... */
+	commit_update_reads(s, sinst);
+	/* ... then its writes.  emit_all_tex() runs these as two passes. */
+	commit_update_writes(s, sinst);
+}
+
 /**
  * Emit all ready texture instructions in a single block.
  *
@@ -208,21 +240,37 @@ static void emit_all_tex(struct schedule_state * s, struct rc_instruction * befo
 
 	assert(s->ReadyTEX);
 
-	/* Don't let the ready list change under us! */
-	readytex = s->ReadyTEX;
-	s->ReadyTEX = 0;
-
 	/* Node marker for R300 */
 	struct rc_instruction * inst_begin = rc_insert_new_instruction(s->C, before->Prev);
 	inst_begin->U.I.Opcode = RC_OPCODE_BEGIN_TEX;
 
 	/* Link texture instructions back in */
+	readytex = s->ReadyTEX;
 	while(readytex) {
-		struct schedule_instruction * tex = readytex;
+		rc_insert_instruction(before->Prev, readytex->Instruction);
+		DBG("%i: commit TEX reads\n", readytex->Instruction->IP);
+
+		/* All of the TEX instructions in the same TEX block have
+		 * their source registers read from before any of the
+		 * instructions in that block write to their destination
+		 * registers.  This means that when we commit a TEX
+		 * instruction, any other TEX instruction that wants to write
+		 * to one of the committed instruction's source register can be
+		 * marked as ready and should be emitted in the same TEX
+		 * block. This prevents the following sequence from being
+		 * emitted in two different TEX blocks:
+		 * 0: TEX temp[0].xyz, temp[1].xy__, 2D[0];
+		 * 1: TEX temp[1].xyz, temp[2].xy__, 2D[0];
+		 */
+		commit_update_reads(s, readytex);
+		readytex = readytex->NextReady;
+	}
+	readytex = s->ReadyTEX;
+	s->ReadyTEX = 0;
+	while(readytex){
+		DBG("%i: commit TEX writes\n", readytex->Instruction->IP);
+		commit_update_writes(s, readytex);
 		readytex = readytex->NextReady;
-
-		rc_insert_instruction(before->Prev, tex->Instruction);
-		commit_instruction(s, tex);
 	}
 }
 
@@ -328,7 +376,7 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 		}
 
 		rc_insert_instruction(before->Prev, sinst->Instruction);
-		commit_instruction(s, sinst);
+		commit_alu_instruction(s, sinst);
 	} else {
 		struct schedule_instruction **prgb;
 		struct schedule_instruction **palpha;
@@ -346,8 +394,8 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 				*prgb = (*prgb)->NextReady;
 				*palpha = (*palpha)->NextReady;
 				rc_insert_instruction(before->Prev, psirgb->Instruction);
-				commit_instruction(s, psirgb);
-				commit_instruction(s, psialpha);
+				commit_alu_instruction(s, psirgb);
+				commit_alu_instruction(s, psialpha);
 				goto success;
 			}
 		}
@@ -357,7 +405,7 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 		s->ReadyRGB = s->ReadyRGB->NextReady;
 
 		rc_insert_instruction(before->Prev, sinst->Instruction);
-		commit_instruction(s, sinst);
+		commit_alu_instruction(s, sinst);
 	success: ;
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
index c922d3d9a4..3cc2897293 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
@@ -973,3 +973,32 @@ int radeonTransformDeriv(struct radeon_compiler* c,
 
 	return 1;
 }
+
+/**
+ * Collapse a conditional discard into a single KIL:
+ *
+ * IF Temp[0].x -\
+ * KILP         - > KIL -abs(Temp[0].x)
+ * ENDIF        -/
+ *
+ * A KILP that is not directly wrapped by IF/ENDIF is left untouched.  This is
+ * a pass of its own because it edits the neighbours of the KILP instruction.
+ */
+void radeonTransformKILP(struct radeon_compiler * c)
+{
+	struct rc_instruction * kilp = c->Program.Instructions.Next;
+
+	for (; kilp != &c->Program.Instructions; kilp = kilp->Next) {
+		if (kilp->U.I.Opcode == RC_OPCODE_KILP
+			&& kilp->Prev->U.I.Opcode == RC_OPCODE_IF
+			&& kilp->Next->U.I.Opcode == RC_OPCODE_ENDIF) {
+			/* The IF condition becomes the KIL operand. */
+			kilp->U.I.Opcode = RC_OPCODE_KIL;
+			kilp->U.I.SrcReg[0] =
+				negate(absolute(kilp->Prev->U.I.SrcReg[0]));
+			/* Unlink the IF, then the ENDIF. */
+			rc_remove_instruction(kilp->Prev);
+			rc_remove_instruction(kilp->Next);
+		}
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h
index 77d444476f..e6e2cc20c5 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h
@@ -60,4 +60,6 @@ int radeonTransformDeriv(
 	struct rc_instruction * inst,
 	void*);
 
+void radeonTransformKILP(struct radeon_compiler * c);
+
 #endif /* __RADEON_PROGRAM_ALU_H_ */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
new file mode 100644
index 0000000000..31c9866883
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2010 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ */
+
+#include "radeon_rename_regs.h"
+
+#include "radeon_compiler.h"
+#include "radeon_dataflow.h"
+
+struct reg_rename { /* Mapping applied by rename_reg(); indices are unsigned. */
+	unsigned int old_index; /* temporary index being renamed */
+	unsigned int new_index; /* index that replaces old_index */
+	unsigned int temp_index; /* where existing uses of new_index are moved */
+};
+
+static void rename_reg(void * data, struct rc_instruction * inst,
+			rc_register_file * file, unsigned int * index)
+{
+	const struct reg_rename * map = data;
+	/* Remap callback: old -> new, new -> temp; temporaries only. */
+	if(*file != RC_FILE_TEMPORARY)
+		return;
+	if(map->old_index == *index)
+		*index = map->new_index;
+	else if(map->new_index == *index)
+		*index = map->temp_index;
+}
+
+static void rename_all(
+	struct radeon_compiler *c,
+	struct rc_instruction * start,
+	unsigned int old,
+	unsigned int new,
+	unsigned int temp)
+{
+	struct reg_rename map = {
+		.old_index = old, .new_index = new, .temp_index = temp
+	};
+	struct rc_instruction * cur = start;
+	/* Remap every instruction from start up to the end of the program. */
+	while(cur != &c->Program.Instructions) {
+		rc_remap_registers(cur, rename_reg, &map);
+		cur = cur->Next;
+	}
+}
+
+/**
+ * This function renames registers in an attempt to get the code close to
+ * SSA form.  After this function has completed, most of the registers are only
+ * written to one time, with a few exceptions.  For example, this block of code
+ * will not be modified by this function:
+ * Mov Temp[0].x Const[0].x
+ * Mov Temp[0].y Const[0].y
+ * Basically, destination registers will be renamed if:
+ * 1. There have been no previous writes to that register
+ * or
+ * 2. If the instruction is writing to the exact components (no more, no less)
+ * of a register that has been written to by previous instructions.
+ *
+ * This function assumes all the instructions are still of type
+ * RC_INSTRUCTION_NORMAL.
+ */
+void rc_rename_regs(struct radeon_compiler * c)
+{
+	unsigned int cur_index = 0;
+	unsigned int icount;
+	struct rc_instruction * inst;
+	unsigned int * masks;
+
+	/* The number of instructions in the program is also the maximum
+	 * number of temp registers that could potentially be used. */
+	icount = rc_recompute_ips(c);
+	masks = memory_pool_malloc(&c->Pool, icount * sizeof(unsigned int));
+	memset(masks, 0, icount * sizeof(unsigned int));
+
+	for(inst = c->Program.Instructions.Next;
+					inst != &c->Program.Instructions;
+					inst = inst->Next) {
+		const struct rc_opcode_info * info;
+		if(inst->Type != RC_INSTRUCTION_NORMAL) {
+			rc_error(c, "%s only works with normal instructions.",
+								__FUNCTION__);
+			return;
+		}
+		unsigned int old_index, temp_index;
+		struct rc_dst_register * dst = &inst->U.I.DstReg;
+		info = rc_get_opcode_info(inst->U.I.Opcode);
+		if(!info->HasDstReg || dst->File != RC_FILE_TEMPORARY) {
+			continue;
+		}
+		if(dst->Index >= icount || !masks[dst->Index] ||
+					masks[dst->Index] == dst->WriteMask) {
+			old_index = dst->Index;
+			/* We need to set dst->Index here so that
+			 * rc_find_free_temporary() will work. */
+			dst->Index = cur_index++;
+			temp_index = rc_find_free_temporary(c);
+			rename_all(c, inst->Next, old_index,
+						dst->Index, temp_index);
+		}
+		assert(dst->Index < icount);
+		masks[dst->Index] |= dst->WriteMask;
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.h b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.h
new file mode 100644
index 0000000000..4323b995d8
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.h
@@ -0,0 +1,9 @@
+
+#ifndef RADEON_RENAME_REGS_H
+#define RADEON_RENAME_REGS_H
+
+struct radeon_compiler;
+
+void rc_rename_regs(struct radeon_compiler * c);
+
+#endif /* RADEON_RENAME_REGS_H */
author	Eric Anholt <eric@anholt.net>	2010-07-26 17:47:59 -0700
committer	Eric Anholt <eric@anholt.net>	2010-07-26 17:53:27 -0700
commit	afe125e0a18ac3886c45c7e6b02b122fb2d327b5 (patch)
tree	78621707e71154c0b388b0baacffc26432b7e992 /src/mesa/drivers/dri/r300/compiler
parent	d64343f1ae84979bd154475badf11af8a9bfc2eb (diff)
parent	5403ca79b225605c79f49866a6497c97da53be3b (diff)