1 files changed, 655 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
new file mode 100644
index 0000000000..93a516105e
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "radeon_compiler.h"
+
+#include "../r300_reg.h"
+
+#include "radeon_nqssadce.h"
+#include "radeon_program.h"
+#include "radeon_program_alu.h"
+
+#include "shader/prog_print.h"
+
+
+/*
+ * Build a source operand on top of the already valid SrcReg[x], replicating
+ * swizzle y (e.g. SWIZZLE_ZERO) to all channels; expects vp and vpi in scope.
+ */
+#define __CONST(x, y)	\
+	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
+			   t_swizzle(y),	\
+			   t_swizzle(y),	\
+			   t_swizzle(y),	\
+			   t_swizzle(y),	\
+			   t_src_class(vpi->SrcReg[x].File), \
+			   NEGATE_NONE) | (vpi->SrcReg[x].RelAddr << 4))
+
+
+/* Translate a Mesa write mask into the hardware destination write mask. */
+static unsigned long t_dst_mask(GLuint mask)
+{
+	return WRITEMASK_XYZW & mask;	/* WRITEMASK_* bits equal VSF_FLAG_* */
+}
+
+/*
+ * Map a Mesa register file onto the PVS destination register class.
+ * Only temporaries, outputs and the address register are handled;
+ * any other file prints a diagnostic and calls _mesa_exit(-1).
+ */
+static unsigned long t_dst_class(gl_register_file file)
+{
+	if (file == PROGRAM_TEMPORARY)
+		return PVS_DST_REG_TEMPORARY;
+
+	if (file == PROGRAM_OUTPUT)
+		return PVS_DST_REG_OUT;
+
+	if (file == PROGRAM_ADDRESS)
+		return PVS_DST_REG_A0;
+
+	/*
+	 * Everything else (inputs, parameters, state variables, ...) is
+	 * not handled as a destination by this translator.
+	 */
+	fprintf(stderr, "problem in %s", __FUNCTION__);
+	_mesa_exit(-1);
+
+	return -1;
+}
+
+/* Hardware register index of dst; outputs are remapped via vp->outputs[]. */
+static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
+				 struct prog_dst_register *dst)
+{
+	if (dst->File != PROGRAM_OUTPUT)
+		return dst->Index;
+	return vp->outputs[dst->Index];
+}
+
+/*
+ * Map a Mesa register file onto the PVS source register class.
+ * All parameter-like files are read as hardware constants; an
+ * unhandled file prints a diagnostic and calls _mesa_exit(-1).
+ */
+static unsigned long t_src_class(gl_register_file file)
+{
+	if (file == PROGRAM_TEMPORARY)
+		return PVS_SRC_REG_TEMPORARY;
+
+	if (file == PROGRAM_INPUT)
+		return PVS_SRC_REG_INPUT;
+
+	if (file == PROGRAM_LOCAL_PARAM ||
+	    file == PROGRAM_ENV_PARAM ||
+	    file == PROGRAM_NAMED_PARAM ||
+	    file == PROGRAM_CONSTANT ||
+	    file == PROGRAM_STATE_VAR)
+		return PVS_SRC_REG_CONSTANT;
+
+	fprintf(stderr, "problem in %s", __FUNCTION__);
+	_mesa_exit(-1);
+	return -1;
+}
+
+/*
+ * Report whether two source operands conflict: both belong to the same
+ * non-temporary register class and either one uses relative addressing
+ * or they name different registers.
+ */
+static GLboolean t_src_conflict(struct prog_src_register a, struct prog_src_register b)
+{
+	const unsigned long class_a = t_src_class(a.File);
+	const unsigned long class_b = t_src_class(b.File);
+
+	/* Different classes, or two temporaries, never conflict. */
+	if (class_a != class_b || class_a == PVS_SRC_REG_TEMPORARY)
+		return GL_FALSE;
+
+	/* Relative addressing is always treated as a conflict. */
+	return (a.RelAddr || b.RelAddr || a.Index != b.Index) ? GL_TRUE : GL_FALSE;
+}
+
+/* Identity mapping: Mesa SWIZZLE_* values equal VSF_IN_COMPONENT_*. */
+static INLINE unsigned long t_swizzle(GLubyte swizzle)
+{
+	return (unsigned long)swizzle;
+}
+
+/* Hardware source index: inputs map via vp->inputs[]; negatives yield 0. */
+static unsigned long t_src_index(struct r300_vertex_program_code *vp,
+				 struct prog_src_register *src)
+{
+	if (src->File == PROGRAM_INPUT) {
+		assert(vp->inputs[src->Index] != -1);
+		return vp->inputs[src->Index];
+	}
+
+	if (src->Index >= 0)
+		return src->Index;
+
+	fprintf(stderr, "negative offsets for indirect addressing do not work.\n");
+	return 0;
+}
+
+/* these two functions should probably be merged... */
+
+/* Encode a vector source operand (Mesa NEGATE_* bits pass through as-is). */
+static unsigned long t_src(struct r300_vertex_program_code *vp,
+			   struct prog_src_register *src)
+{
+	const unsigned long operand =
+		PVS_SRC_OPERAND(t_src_index(vp, src),
+				t_swizzle(GET_SWZ(src->Swizzle, 0)),
+				t_swizzle(GET_SWZ(src->Swizzle, 1)),
+				t_swizzle(GET_SWZ(src->Swizzle, 2)),
+				t_swizzle(GET_SWZ(src->Swizzle, 3)),
+				t_src_class(src->File),
+				src->Negate);
+	return operand | (src->RelAddr << 4);
+}
+
+/* Encode a scalar source: swizzle component 0 feeds all four channels. */
+static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
+				  struct prog_src_register *src)
+{
+	/* Any negate bit on the operand negates the replicated scalar. */
+	const unsigned long comp = t_swizzle(GET_SWZ(src->Swizzle, 0));
+
+	const unsigned long operand =
+		PVS_SRC_OPERAND(t_src_index(vp, src),
+				comp, comp, comp, comp,
+				t_src_class(src->File),
+				src->Negate ? NEGATE_XYZW : NEGATE_NONE);
+
+	return operand | (src->RelAddr << 4);
+}
+
+/* GL_FALSE when dst is an output whose vp->outputs[] entry is -1. */
+static GLboolean valid_dst(struct r300_vertex_program_code *vp,
+			   struct prog_dst_register *dst)
+{
+	if (dst->File == PROGRAM_OUTPUT)
+		return (vp->outputs[dst->Index] == -1) ? GL_FALSE : GL_TRUE;
+	/* Only address register index 0 is expected here. */
+	if (dst->File == PROGRAM_ADDRESS)
+		assert(dst->Index == 0);
+	return GL_TRUE;
+}
+
+/* Emit a one-source vector op; the two remaining source slots read zero. */
+static void ei_vector1(struct r300_vertex_program_code *vp, GLuint hw_opcode,
+		       struct prog_instruction *vpi, GLuint *inst)
+{
+	struct prog_dst_register *dst = &vpi->DstReg;
+
+	inst[0] = PVS_OP_DST_OPERAND(hw_opcode, GL_FALSE, GL_FALSE,
+				     t_dst_index(vp, dst),
+				     t_dst_mask(dst->WriteMask),
+				     t_dst_class(dst->File));
+	inst[1] = t_src(vp, &vpi->SrcReg[0]);
+	/* Both zero operands are built on top of SrcReg[0]'s register. */
+	inst[2] = __CONST(0, SWIZZLE_ZERO);
+	inst[3] = __CONST(0, SWIZZLE_ZERO);
+}
+
+/* Emit a two-source vector op; the unused third source slot reads zero. */
+static void ei_vector2(struct r300_vertex_program_code *vp, GLuint hw_opcode,
+		       struct prog_instruction *vpi, GLuint *inst)
+{
+	struct prog_dst_register *dst = &vpi->DstReg;
+
+	inst[0] = PVS_OP_DST_OPERAND(hw_opcode, GL_FALSE, GL_FALSE,
+				     t_dst_index(vp, dst),
+				     t_dst_mask(dst->WriteMask),
+				     t_dst_class(dst->File));
+	inst[1] = t_src(vp, &vpi->SrcReg[0]);
+	inst[2] = t_src(vp, &vpi->SrcReg[1]);
+	/* The zero operand is built on top of SrcReg[1]'s register. */
+	inst[3] = __CONST(1, SWIZZLE_ZERO);
+}
+
+/* Emit a one-source scalar math op; the other source slots read zero. */
+static void ei_math1(struct r300_vertex_program_code *vp, GLuint hw_opcode,
+		     struct prog_instruction *vpi, GLuint *inst)
+{
+	struct prog_dst_register *dst = &vpi->DstReg;
+
+	inst[0] = PVS_OP_DST_OPERAND(hw_opcode, GL_TRUE, GL_FALSE,
+				     t_dst_index(vp, dst),
+				     t_dst_mask(dst->WriteMask),
+				     t_dst_class(dst->File));
+	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
+	/* Both zero operands are built on top of SrcReg[0]'s register. */
+	inst[2] = __CONST(0, SWIZZLE_ZERO);
+	inst[3] = __CONST(0, SWIZZLE_ZERO);
+}
+
+static void ei_lit(struct r300_vertex_program_code *vp,
+				      struct prog_instruction *vpi,
+				      GLuint * inst)
+{
+	/* Emit LIT as ME_LIGHT_COEFF_DX; all three operands read SrcReg[0]. */
+	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
+	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
+				     GL_TRUE,
+				     GL_FALSE,
+				     t_dst_index(vp, &vpi->DstReg),
+				     t_dst_mask(vpi->DstReg.WriteMask),
+				     t_dst_class(vpi->DstReg.File));
+	/* NOTE: User swizzling might not work. */
+	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
+				  PVS_SRC_SELECT_FORCE_0,	// Z
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
+				  t_src_class(vpi->SrcReg[0].File),
+				  vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+	    (vpi->SrcReg[0].RelAddr << 4);
+	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
+				  PVS_SRC_SELECT_FORCE_0,	// Z
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
+				  t_src_class(vpi->SrcReg[0].File),
+				  vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+	    (vpi->SrcReg[0].RelAddr << 4);
+	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
+				  PVS_SRC_SELECT_FORCE_0,	// Z
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
+				  t_src_class(vpi->SrcReg[0].File),
+				  vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+	    (vpi->SrcReg[0].RelAddr << 4);
+}
+
+static void ei_mad(struct r300_vertex_program_code *vp,
+				      struct prog_instruction *vpi,
+				      GLuint * inst)
+{
+	/* Emit a MAD instruction.  Remarks about its hardware limitations
+	 * (please preserve this comment, as this information is _NOT_
+	 * in the documentation provided by AMD).
+	 *
+	 * As described in the documentation, MAD with three unique temporary
+	 * source registers requires the use of the macro version.
+	 *
+	 * However (and this is not mentioned in the documentation), apparently
+	 * the macro version is _NOT_ a full superset of the normal version.
+	 * In particular, the macro version does not always work when relative
+	 * addressing is used in the source operands.
+	 *
+	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
+	 * assembly shader path when using medium quality animations
+	 * (i.e. animations with matrix blending instead of quaternion blending).
+	 *
+	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
+	 * test for this issue - for some reason, it is possible to have vertex
+	 * programs whose prefix is *exactly* the same as the prefix of the
+	 * offending program in Sauerbraten up to the offending instruction
+	 * without causing any trouble.
+	 *
+	 * Bottom line: Use the macro version only when really necessary;
+	 * according to AMD docs, this should improve performance by one clock
+	 * as a nice side bonus.
+	 */
+	if (vpi->SrcReg[0].File == PROGRAM_TEMPORARY &&
+	    vpi->SrcReg[1].File == PROGRAM_TEMPORARY &&
+	    vpi->SrcReg[2].File == PROGRAM_TEMPORARY &&
+	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
+	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
+	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
+		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
+				GL_FALSE,
+				GL_TRUE,
+				t_dst_index(vp, &vpi->DstReg),
+				t_dst_mask(vpi->DstReg.WriteMask),
+				t_dst_class(vpi->DstReg.File));
+	} else {
+		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
+				GL_FALSE,
+				GL_FALSE,
+				t_dst_index(vp, &vpi->DstReg),
+				t_dst_mask(vpi->DstReg.WriteMask),
+				t_dst_class(vpi->DstReg.File));
+	}
+	inst[1] = t_src(vp, &vpi->SrcReg[0]);
+	inst[2] = t_src(vp, &vpi->SrcReg[1]);
+	inst[3] = t_src(vp, &vpi->SrcReg[2]);
+}
+
+/* Emit POW: SrcReg[0] goes in operand 0 and SrcReg[1] in operand 2. */
+static void ei_pow(struct r300_vertex_program_code *vp,
+		   struct prog_instruction *vpi, GLuint *inst)
+{
+	struct prog_dst_register *dst = &vpi->DstReg;
+
+	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF, GL_TRUE, GL_FALSE,
+				     t_dst_index(vp, dst),
+				     t_dst_mask(dst->WriteMask),
+				     t_dst_class(dst->File));
+	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
+	inst[2] = __CONST(0, SWIZZLE_ZERO);	/* middle operand reads zero */
+	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
+}
+
+/* Encode each program instruction as four dwords in compiler->code->body.d. */
+static void translate_vertex_program(struct r300_vertex_program_compiler * compiler)
+{
+	struct rc_instruction *rci;
+
+	compiler->code->pos_end = 0;	/* Not supported yet */
+	compiler->code->length = 0;
+
+	compiler->SetHwInputOutput(compiler);
+
+	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
+		struct prog_instruction *vpi = &rci->I;
+		GLuint *inst = compiler->code->body.d + compiler->code->length;
+
+		/* Skip instructions writing to non-existing destination */
+		if (!valid_dst(compiler->code, &vpi->DstReg))
+			continue;
+
+		if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) {
+			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
+			return;
+		}
+
+		switch (vpi->Opcode) {
+		case OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
+		case OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
+		case OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
+		case OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
+		case OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
+		case OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
+		case OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
+		case OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
+		case OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
+		case OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
+		case OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
+		case OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
+		case OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
+		case OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
+		case OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
+		case OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
+		case OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
+		case OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
+		case OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
+		case OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
+		default:
+			rc_error(&compiler->Base, "Unknown opcode %i\n", vpi->Opcode);
+			return;
+		}
+
+		compiler->code->length += 4;	/* the emitters fill inst[0..3] */
+
+		if (compiler->Base.Error)
+			return;
+	}
+}
+
+struct temporary_allocation {
+	GLuint Allocated:1;	/* set once a hardware temp has been assigned */
+	GLuint HwTemp:15;	/* index of the assigned hardware temporary */
+	struct rc_instruction * LastRead;	/* last instruction reading it */
+};
+
+/* Renumber PROGRAM_TEMPORARY registers onto hardware temporaries, freeing a
+ * hardware slot after the last instruction reading it so it can be reused.
+ * Running out of hardware temporaries is reported through rc_error(). */
+static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
+{
+	struct rc_instruction *inst;
+	GLuint num_orig_temps = 0;
+	GLboolean hwtemps[VSF_MAX_FRAGMENT_TEMPS];
+	struct temporary_allocation * ta;
+	GLuint i, j;
+
+	compiler->code->num_temporaries = 0;
+	memset(hwtemps, 0, sizeof(hwtemps));
+
+	/* Pass 1: Count original temporaries and allocate structures */
+	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+		GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+		GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+
+		for (i = 0; i < numsrcs; ++i) {
+			if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
+				if (inst->I.SrcReg[i].Index >= num_orig_temps)
+					num_orig_temps = inst->I.SrcReg[i].Index + 1;
+			}
+		}
+
+		if (numdsts && inst->I.DstReg.File == PROGRAM_TEMPORARY) {
+			if (inst->I.DstReg.Index >= num_orig_temps)
+				num_orig_temps = inst->I.DstReg.Index + 1;
+		}
+	}
+
+	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
+			sizeof(struct temporary_allocation) * num_orig_temps);
+	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
+
+	/* Pass 2: Determine original temporary lifetimes */
+	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+		GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+
+		for (i = 0; i < numsrcs; ++i) {
+			if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY)
+				ta[inst->I.SrcReg[i].Index].LastRead = inst;
+		}
+	}
+
+	/* Pass 3: Register allocation */
+	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+		GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+		GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+
+		for (i = 0; i < numsrcs; ++i) {
+			if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
+				GLuint orig = inst->I.SrcReg[i].Index;
+				inst->I.SrcReg[i].Index = ta[orig].HwTemp;
+
+				if (ta[orig].Allocated && inst == ta[orig].LastRead)
+					hwtemps[ta[orig].HwTemp] = GL_FALSE;
+			}
+		}
+
+		if (numdsts && inst->I.DstReg.File == PROGRAM_TEMPORARY) {
+			GLuint orig = inst->I.DstReg.Index;
+
+			if (!ta[orig].Allocated) {
+				for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {
+					if (!hwtemps[j])
+						break;
+				}
+				if (j >= VSF_MAX_FRAGMENT_TEMPS) {
+					rc_error(&compiler->Base, "Out of hw temporaries\n");
+					return;
+				}
+
+				ta[orig].Allocated = GL_TRUE;
+				ta[orig].HwTemp = j;
+				hwtemps[j] = GL_TRUE;
+
+				if (j >= compiler->code->num_temporaries)
+					compiler->code->num_temporaries = j + 1;
+			}
+
+			inst->I.DstReg.Index = ta[orig].HwTemp;
+		}
+	}
+}
+
+
+/* Route source operand src_idx of inst through a fresh temporary: a MOV
+ * inserted at inst->Prev loads it, and inst then reads the temporary. */
+static void move_source_to_temporary(struct radeon_compiler *c,
+	struct rc_instruction *inst, unsigned int src_idx)
+{
+	int tmpreg = rc_find_free_temporary(c);
+	struct rc_instruction *mov = rc_insert_new_instruction(c, inst->Prev);
+
+	mov->I.Opcode = OPCODE_MOV;
+	mov->I.DstReg.File = PROGRAM_TEMPORARY;
+	mov->I.DstReg.Index = tmpreg;
+	mov->I.SrcReg[0] = inst->I.SrcReg[src_idx];
+
+	reset_srcreg(&inst->I.SrcReg[src_idx]);
+	inst->I.SrcReg[src_idx].File = PROGRAM_TEMPORARY;
+	inst->I.SrcReg[src_idx].Index = tmpreg;
+}
+
+/**
+ * Vertex engine cannot read two inputs or two constants at the same time.
+ * Resolve such conflicts by moving one operand into a temporary first.
+ */
+static GLboolean transform_source_conflicts(
+	struct radeon_compiler *c,
+	struct rc_instruction* inst,
+	void* unused)
+{
+	const GLuint num_operands = _mesa_num_inst_src_regs(inst->I.Opcode);
+
+	/* Three operands: free the third one from clashes with the others. */
+	if (num_operands == 3 &&
+	    (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[2]) ||
+	     t_src_conflict(inst->I.SrcReg[0], inst->I.SrcReg[2]))) {
+		move_source_to_temporary(c, inst, 2);
+	}
+
+	/* Then make sure the first two operands can be read together. */
+	if (num_operands >= 2 &&
+	    t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[0])) {
+		move_source_to_temporary(c, inst, 1);
+	}
+
+	return GL_TRUE;
+}
+
+/* Add "MOV output[i], constant[0]" for each required but unwritten output. */
+static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler)
+{
+	int i;
+	for(i = 0; i < 32; ++i) {
+		const unsigned int bit = 1u << i;	/* 1 << 31 is UB on signed int */
+		struct rc_instruction * inst;
+
+		if (!(compiler->RequiredOutputs & bit) ||
+		    (compiler->Base.Program.OutputsWritten & bit))
+			continue;
+
+		inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
+		inst->I.Opcode = OPCODE_MOV;
+		inst->I.DstReg.File = PROGRAM_OUTPUT;
+		inst->I.DstReg.Index = i;
+		inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+		inst->I.SrcReg[0].File = PROGRAM_CONSTANT;
+		inst->I.SrcReg[0].Index = 0;
+		inst->I.SrcReg[0].Swizzle = SWIZZLE_XYZW;
+		compiler->Base.Program.OutputsWritten |= bit;
+	}
+}
+
+/* NQSSADCE init hook: mark all channels of required outputs as sourced. */
+static void nqssadceInit(struct nqssadce_state* s)
+{
+	const struct r300_vertex_program_compiler * compiler = s->UserData;
+	int output;
+	for(output = 0; output < VERT_RESULT_MAX; ++output) {
+		if (compiler->RequiredOutputs & (1 << output))
+			s->Outputs[output].Sourced = WRITEMASK_XYZW;
+	}
+}
+
+/* NQSSADCE swizzle hook: every source swizzle is reported as native. */
+static GLboolean swizzleIsNative(GLuint opcode, struct prog_src_register reg)
+{
+	(void) reg;
+	(void) opcode;
+
+	return GL_TRUE;
+}
+
+
+/* Compile a vertex program: run the rewrite passes, then emit hardware code. */
+void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
+{
+	addArtificialOutputs(compiler);
+
+	{
+		struct radeon_program_transformation transformations[] = {
+			{ &r300_transform_vertex_alu, 0 },
+		};
+		radeonLocalTransform(&compiler->Base, 1, transformations);
+	}
+
+	if (compiler->Base.Debug) {
+		fprintf(stderr, "Vertex program after native rewrite:\n");
+		rc_print_program(&compiler->Base.Program);
+		fflush(stderr);
+	}
+
+	{
+		/* Note: This pass has to be done separately from ALU rewrite,
+		 * otherwise non-native ALU instructions with source conflicts
+		 * will not be treated properly.
+		 */
+		struct radeon_program_transformation transformations[] = {
+			{ &transform_source_conflicts, 0 },
+		};
+		radeonLocalTransform(&compiler->Base, 1, transformations);
+	}
+
+	if (compiler->Base.Debug) {
+		fprintf(stderr, "Vertex program after source conflict resolve:\n");
+		rc_print_program(&compiler->Base.Program);
+		fflush(stderr);
+	}
+
+	{
+		struct radeon_nqssadce_descr nqssadce = {
+			.Init = &nqssadceInit,
+			.IsNativeSwizzle = &swizzleIsNative,
+			.BuildSwizzle = NULL
+		};
+		radeonNqssaDce(&compiler->Base, &nqssadce, compiler);
+
+		/* We need this step for reusing temporary registers */
+		allocate_temporary_registers(compiler);
+
+		if (compiler->Base.Debug) {
+			fprintf(stderr, "Vertex program after NQSSADCE:\n");
+			rc_print_program(&compiler->Base.Program);
+			fflush(stderr);
+		}
+	}
+
+	translate_vertex_program(compiler);
+
+	rc_constants_copy(&compiler->code->constants, &compiler->Base.Program.Constants);
+
+	compiler->code->InputsRead = compiler->Base.Program.InputsRead;
+	compiler->code->OutputsWritten = compiler->Base.Program.OutputsWritten;
+
+	if (compiler->Base.Debug) {
+		fprintf(stderr, "Final vertex program code:\n");
+		r300_vertex_program_dump(compiler->code);
+	}
+}