From 4e4ab2a62bf33a582420cff85775a6580167b5a9 Mon Sep 17 00:00:00 2001 From: Oliver McFadden Date: Thu, 15 Mar 2007 17:35:34 +0000 Subject: Committed Rune Petersen's fragment.position patch (Bug #10024) plus a few small corrections. --- src/mesa/drivers/dri/r300/r300_fragprog.c | 91 +++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index a1c634a54d..d3062a4145 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -49,6 +49,7 @@ #include "r300_context.h" #include "r300_fragprog.h" #include "r300_reg.h" +#include "r300_state.h" /* * Usefull macros and values @@ -1787,6 +1788,94 @@ static GLboolean parse_program(struct r300_fragment_program *rp) return GL_TRUE; } +static void insert_wpos(struct gl_program *prog) +{ + GLint tokens[6] = { STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0, 0 }; + struct prog_instruction *fpi; + GLuint window_index; + int i = 0; + GLuint tempregi = prog->NumTemporaries; + /* should do something else if no temps left... 
*/ + prog->NumTemporaries++; + + + fpi = malloc((prog->NumInstructions + 3) * sizeof(struct prog_instruction)); + /* all including END */ + memcpy(&fpi[3], prog->Instructions, prog->NumInstructions * sizeof(struct prog_instruction)); + + memset(fpi, 0, 3 * sizeof(struct prog_instruction)); + + /* perspective divide */ + fpi[i].Opcode = OPCODE_RCP; + + fpi[i].DstReg.File = PROGRAM_TEMPORARY; + fpi[i].DstReg.Index = tempregi; + fpi[i].DstReg.WriteMask = WRITEMASK_W; + fpi[i].DstReg.CondMask = COND_TR; + + fpi[i].SrcReg[0].File = PROGRAM_INPUT; + fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS; + fpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); + i++; + + fpi[i].Opcode = OPCODE_MUL; + + fpi[i].DstReg.File = PROGRAM_TEMPORARY; + fpi[i].DstReg.Index = tempregi; + fpi[i].DstReg.WriteMask = WRITEMASK_XYZ; + fpi[i].DstReg.CondMask = COND_TR; + + fpi[i].SrcReg[0].File = PROGRAM_INPUT; + fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS; + fpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + + fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY; + fpi[i].SrcReg[1].Index = tempregi; + fpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); + i++; + + /* viewport transformation */ + window_index = _mesa_add_state_reference(prog->Parameters, tokens); + + fpi[i].Opcode = OPCODE_MAD; + + fpi[i].DstReg.File = PROGRAM_TEMPORARY; + fpi[i].DstReg.Index = tempregi; + fpi[i].DstReg.WriteMask = WRITEMASK_XYZ; + fpi[i].DstReg.CondMask = COND_TR; + + fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY; + fpi[i].SrcReg[0].Index = tempregi; + fpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO); + + fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR; + fpi[i].SrcReg[1].Index = window_index; + fpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO); + + fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR; + fpi[i].SrcReg[2].Index = window_index; + fpi[i].SrcReg[2].Swizzle = 
MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO); + i++; + + free(prog->Instructions); + + prog->Instructions = fpi; + + prog->NumInstructions += i; + fpi = &prog->Instructions[prog->NumInstructions-1]; + + assert(fpi->Opcode == OPCODE_END); + + for(fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++){ + for(i=0; i<3; i++) + if( fpi->SrcReg[i].File == PROGRAM_INPUT && + fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS ){ + fpi->SrcReg[i].File = PROGRAM_TEMPORARY; + fpi->SrcReg[i].Index = tempregi; + } + } +} + /* - Init structures * - Determine what hwregs each input corresponds to */ @@ -1844,6 +1933,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) if (InputsRead & FRAG_BIT_WPOS) { cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0; cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp); + insert_wpos(&mp->Base); } InputsRead &= ~FRAG_BIT_WPOS; @@ -1956,6 +2046,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr rp->translated = GL_TRUE; if (0) dump_program(rp); + r300UpdateStateParameters(rp->ctx, _NEW_PROGRAM); } update_params(rp); -- cgit v1.2.3 From 0d6d80ef3dbd39ed346c3189385242016f5aed74 Mon Sep 17 00:00:00 2001 From: Oliver McFadden Date: Thu, 15 Mar 2007 19:09:10 +0000 Subject: r300: Updated R300 to use the new SWIZZLE macros. 
--- src/mesa/drivers/dri/r300/r300_fragprog.c | 6 +++--- src/mesa/drivers/dri/r300/r300_vertexprog.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index d3062a4145..82fb5b66ed 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -1815,7 +1815,7 @@ static void insert_wpos(struct gl_program *prog) fpi[i].SrcReg[0].File = PROGRAM_INPUT; fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS; - fpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); + fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW; i++; fpi[i].Opcode = OPCODE_MUL; @@ -1827,11 +1827,11 @@ static void insert_wpos(struct gl_program *prog) fpi[i].SrcReg[0].File = PROGRAM_INPUT; fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS; - fpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW; fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY; fpi[i].SrcReg[1].Index = tempregi; - fpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); + fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW; i++; /* viewport transformation */ diff --git a/src/mesa/drivers/dri/r300/r300_vertexprog.c b/src/mesa/drivers/dri/r300/r300_vertexprog.c index 68a11a42b3..9257ff44e3 100644 --- a/src/mesa/drivers/dri/r300/r300_vertexprog.c +++ b/src/mesa/drivers/dri/r300/r300_vertexprog.c @@ -908,11 +908,11 @@ static void position_invariant(struct gl_program *prog) vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR; vpi[i].SrcReg[0].Index = idx; - vpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW; vpi[i].SrcReg[1].File = PROGRAM_INPUT; vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS; - vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + 
vpi[i].SrcReg[1].Swizzle = SWIZZLE_XYZW; #else if (i == 0) vpi[i].Opcode = OPCODE_MUL; @@ -932,7 +932,7 @@ static void position_invariant(struct gl_program *prog) vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR; vpi[i].SrcReg[0].Index = idx; - vpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW; vpi[i].SrcReg[1].File = PROGRAM_INPUT; vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS; @@ -941,7 +941,7 @@ static void position_invariant(struct gl_program *prog) if (i > 0) { vpi[i].SrcReg[2].File = PROGRAM_TEMPORARY; vpi[i].SrcReg[2].Index = 0; - vpi[i].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + vpi[i].SrcReg[2].Swizzle = SWIZZLE_XYZW; } #endif } @@ -985,7 +985,7 @@ static void insert_wpos(struct r300_vertex_program *vp, vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY; vpi_insert[i].SrcReg[0].Index = temp_index; - vpi_insert[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW; i++; vpi_insert[i].Opcode = OPCODE_MOV; @@ -997,7 +997,7 @@ static void insert_wpos(struct r300_vertex_program *vp, vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY; vpi_insert[i].SrcReg[0].Index = temp_index; - vpi_insert[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W); + vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW; i++; free(prog->Instructions); -- cgit v1.2.3 From 0e9ada1087a58af4c1375cc35b318aa0521d3a72 Mon Sep 17 00:00:00 2001 From: Oliver McFadden Date: Thu, 15 Mar 2007 19:49:10 +0000 Subject: r300: Use _mesa_alloc_instructions/_mesa_init_instructions instead of malloc. Note that insert_wpos in r300_vertexprog.c is still a little flaky and could be improved. 
--- src/mesa/drivers/dri/r300/r300_fragprog.c | 10 ++++------ src/mesa/drivers/dri/r300/r300_vertexprog.c | 15 +++++++-------- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 82fb5b66ed..e05abdb7c6 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -1798,12 +1798,8 @@ static void insert_wpos(struct gl_program *prog) /* should do something else if no temps left... */ prog->NumTemporaries++; - - fpi = malloc((prog->NumInstructions + 3) * sizeof(struct prog_instruction)); - /* all including END */ - memcpy(&fpi[3], prog->Instructions, prog->NumInstructions * sizeof(struct prog_instruction)); - - memset(fpi, 0, 3 * sizeof(struct prog_instruction)); + fpi = _mesa_alloc_instructions (prog->NumInstructions + 3); + _mesa_init_instructions (fpi, prog->NumInstructions + 3); /* perspective divide */ fpi[i].Opcode = OPCODE_RCP; @@ -1857,6 +1853,8 @@ static void insert_wpos(struct gl_program *prog) fpi[i].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO); i++; + _mesa_memcpy(&fpi[i], prog->Instructions, prog->NumInstructions * sizeof(struct prog_instruction)); + free(prog->Instructions); prog->Instructions = fpi; diff --git a/src/mesa/drivers/dri/r300/r300_vertexprog.c b/src/mesa/drivers/dri/r300/r300_vertexprog.c index 9257ff44e3..0c43270d75 100644 --- a/src/mesa/drivers/dri/r300/r300_vertexprog.c +++ b/src/mesa/drivers/dri/r300/r300_vertexprog.c @@ -889,8 +889,8 @@ static void position_invariant(struct gl_program *prog) #endif paramList = prog->Parameters; - vpi = malloc((prog->NumInstructions + 4) * sizeof(struct prog_instruction)); - memset(vpi, 0, 4 * sizeof(struct prog_instruction)); + vpi = _mesa_alloc_instructions (prog->NumInstructions + 4); + _mesa_init_instructions (vpi, prog->NumInstructions + 4); for (i=0; i < 4; i++) 
{ GLint idx; @@ -946,7 +946,7 @@ static void position_invariant(struct gl_program *prog) #endif } - memcpy(&vpi[i], prog->Instructions, prog->NumInstructions * sizeof(struct prog_instruction)); + _mesa_memcpy(&vpi[i], prog->Instructions, prog->NumInstructions * sizeof(struct prog_instruction)); free(prog->Instructions); @@ -966,15 +966,14 @@ static void insert_wpos(struct r300_vertex_program *vp, struct prog_instruction *vpi_insert; int i = 0; - vpi = malloc((prog->NumInstructions + 2) * sizeof(struct prog_instruction)); + vpi = _mesa_alloc_instructions (prog->NumInstructions + 2); + _mesa_init_instructions (vpi, prog->NumInstructions + 2); /* all but END */ - memcpy(vpi, prog->Instructions, (prog->NumInstructions - 1) * sizeof(struct prog_instruction)); + _mesa_memcpy(vpi, prog->Instructions, (prog->NumInstructions - 1) * sizeof(struct prog_instruction)); /* END */ - memcpy(&vpi[prog->NumInstructions + 1], &prog->Instructions[prog->NumInstructions - 1], + _mesa_memcpy(&vpi[prog->NumInstructions + 1], &prog->Instructions[prog->NumInstructions - 1], sizeof(struct prog_instruction)); - vpi_insert = &vpi[prog->NumInstructions - 1]; - memset(vpi_insert, 0, 2 * sizeof(struct prog_instruction)); vpi_insert[i].Opcode = OPCODE_MOV; -- cgit v1.2.3 From 0c25d9ab198f79afee23ec1bf8ac61c4cd801d3a Mon Sep 17 00:00:00 2001 From: Oliver McFadden Date: Thu, 15 Mar 2007 20:55:30 +0000 Subject: r300: Added _mesa_copy_instructions. 
--- src/mesa/drivers/dri/r300/r300_fragprog.c | 2 +- src/mesa/drivers/dri/r300/r300_vertprog.c | 9 +++++---- src/mesa/shader/program.c | 14 ++++++++++++++ src/mesa/shader/program.h | 3 +++ 4 files changed, 23 insertions(+), 5 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index e05abdb7c6..251fd26082 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -1853,7 +1853,7 @@ static void insert_wpos(struct gl_program *prog) fpi[i].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO); i++; - _mesa_memcpy(&fpi[i], prog->Instructions, prog->NumInstructions * sizeof(struct prog_instruction)); + _mesa_copy_instructions (&fpi[i], prog->Instructions, prog->NumInstructions); free(prog->Instructions); diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.c b/src/mesa/drivers/dri/r300/r300_vertprog.c index 0c43270d75..092ebb1140 100644 --- a/src/mesa/drivers/dri/r300/r300_vertprog.c +++ b/src/mesa/drivers/dri/r300/r300_vertprog.c @@ -946,7 +946,7 @@ static void position_invariant(struct gl_program *prog) #endif } - _mesa_memcpy(&vpi[i], prog->Instructions, prog->NumInstructions * sizeof(struct prog_instruction)); + _mesa_copy_instructions (&vpi[i], prog->Instructions, prog->NumInstructions); free(prog->Instructions); @@ -969,10 +969,11 @@ static void insert_wpos(struct r300_vertex_program *vp, vpi = _mesa_alloc_instructions (prog->NumInstructions + 2); _mesa_init_instructions (vpi, prog->NumInstructions + 2); /* all but END */ - _mesa_memcpy(vpi, prog->Instructions, (prog->NumInstructions - 1) * sizeof(struct prog_instruction)); + _mesa_copy_instructions (vpi, prog->Instructions, prog->NumInstructions - 1); /* END */ - _mesa_memcpy(&vpi[prog->NumInstructions + 1], &prog->Instructions[prog->NumInstructions - 1], - sizeof(struct prog_instruction)); + _mesa_copy_instructions 
(&vpi[prog->NumInstructions + 1], + &prog->Instructions[prog->NumInstructions - 1], + 1); vpi_insert = &vpi[prog->NumInstructions - 1]; vpi_insert[i].Opcode = OPCODE_MOV; diff --git a/src/mesa/shader/program.c b/src/mesa/shader/program.c index 490f919445..3d5f648191 100644 --- a/src/mesa/shader/program.c +++ b/src/mesa/shader/program.c @@ -1480,6 +1480,20 @@ _mesa_realloc_instructions(struct prog_instruction *oldInst, return newInst; } +/** + * Copy an array of program instructions. + * \param dest pointer to destination. + * \param src pointer to source. + * \param n number of instructions to copy. + * \return pointer to destination. + */ +struct prog_instruction * +_mesa_copy_instructions(struct prog_instruction *dest, + const struct prog_instruction *src, GLuint n) +{ + return _mesa_memcpy (dest, src, n * sizeof (struct prog_instruction)); +} + /** * Basic info about each instruction diff --git a/src/mesa/shader/program.h b/src/mesa/shader/program.h index 5b5d134f6d..6f5013df35 100644 --- a/src/mesa/shader/program.h +++ b/src/mesa/shader/program.h @@ -128,6 +128,9 @@ extern struct prog_instruction * _mesa_realloc_instructions(struct prog_instruction *oldInst, GLuint numOldInst, GLuint numNewInst); +extern struct prog_instruction * +_mesa_copy_instructions(struct prog_instruction *dest, + const struct prog_instruction *src, GLuint n); /** * Used for describing GL state referenced from inside ARB vertex and -- cgit v1.2.3 From 7b430acd71f04dce3e21bdcfe70115a23d751f30 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sun, 18 Mar 2007 02:15:56 +0100 Subject: r300: Fix fragment program instruction pairing and register allocation There were a number of bugs related to the pairing of vector and scalar operations where swizzles ended up using the wrong source register, or an instruction was moved forward and ended up overwriting an aliased register. The new algorithm for register allocation is quite conservative and may run out of registers before necessary.
On the plus side, It Just Works. Pairing is done whenever possible, and in more cases than before, so in practice this change should be a net win. --- src/mesa/drivers/dri/r300/r300_context.h | 94 +++- src/mesa/drivers/dri/r300/r300_fragprog.c | 774 ++++++++++++++++++++---------- src/mesa/drivers/dri/r300/r300_reg.h | 4 +- 3 files changed, 582 insertions(+), 290 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index bd9ed6f170..bc43953ff3 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -647,38 +647,84 @@ struct r300_vertex_program_cont { #define PFS_NUM_TEMP_REGS 32 #define PFS_NUM_CONST_REGS 16 -/* Tracking data for Mesa registers */ +/* Mapping Mesa registers to R300 temporaries */ struct reg_acc { int reg; /* Assigned hw temp */ unsigned int refcount; /* Number of uses by mesa program */ }; +/** + * Describe the current lifetime information for an R300 temporary + */ +struct reg_lifetime { + /* Index of the first slot where this register is free in the sense + that it can be used as a new destination register. + This is -1 if the register has been assigned to a Mesa register + and the last access to the register has not yet been emitted */ + int free; + + /* Index of the first slot where this register is currently reserved. + This is used to stop e.g. a scalar operation from being moved + before the allocation time of a register that was first allocated + for a vector operation. */ + int reserved; + + /* Index of the first slot in which the register can be used as a + source without losing the value that is written by the last + emitted instruction that writes to the register */ + int vector_valid; + int scalar_valid; +}; + + +/** + * Store usage information about an ALU instruction slot during the + * compilation of a fragment program. 
+ */ +#define SLOT_SRC_VECTOR (1<<0) +#define SLOT_SRC_SCALAR (1<<3) +#define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR) +#define SLOT_OP_VECTOR (1<<16) +#define SLOT_OP_SCALAR (1<<17) +#define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR) + +struct r300_pfs_compile_slot { + /* Bitmask indicating which parts of the slot are used, using SLOT_ constants + defined above */ + unsigned int used; + + /* Selected sources */ + int vsrc[3]; + int ssrc[3]; +}; + +/** + * Store information during compilation of fragment programs. + */ struct r300_pfs_compile_state { - int v_pos, s_pos; /* highest ALU slots used */ - - /* Track some information gathered during opcode - * construction. - * - * NOTE: Data is only set by the code, and isn't used yet. - */ - struct { - int vsrc[3]; - int ssrc[3]; - int umask; - } slot[PFS_MAX_ALU_INST]; - - /* Used to map Mesa's inputs/temps onto hardware temps */ - int temp_in_use; - struct reg_acc temps[PFS_NUM_TEMP_REGS]; - struct reg_acc inputs[32]; /* don't actually need 32... */ - - /* Track usage of hardware temps, for register allocation, - * indirection detection, etc. */ - int hwreg_in_use; - GLuint used_in_node; - GLuint dest_in_node; + int nrslots; /* number of ALU slots used so far */ + + /* Track which (parts of) slots are already filled with instructions */ + struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST]; + + /* Track the validity of R300 temporaries */ + struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS]; + + /* Used to map Mesa's inputs/temps onto hardware temps */ + int temp_in_use; + struct reg_acc temps[PFS_NUM_TEMP_REGS]; + struct reg_acc inputs[32]; /* don't actually need 32... */ + + /* Track usage of hardware temps, for register allocation, + * indirection detection, etc. */ + GLuint used_in_node; + GLuint dest_in_node; }; +/** + * Store everything about a fragment program that is needed + * to render with that program. 
+ */ struct r300_fragment_program { struct gl_fragment_program mesa_program; diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 251fd26082..b2c89ccb36 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -94,8 +94,9 @@ #define REG_NEGV_SHIFT 18 #define REG_NEGS_SHIFT 19 #define REG_ABS_SHIFT 20 -#define REG_NO_USE_SHIFT 21 -#define REG_VALID_SHIFT 22 +#define REG_NO_USE_SHIFT 21 // Hack for refcounting +#define REG_VALID_SHIFT 22 // Does the register contain a defined value? +#define REG_BUILTIN_SHIFT 23 // Is it a builtin (like all zero/all one)? #define REG_TYPE_MASK (0x03 << REG_TYPE_SHIFT) #define REG_INDEX_MASK (0x3F << REG_INDEX_SHIFT) @@ -106,12 +107,14 @@ #define REG_ABS_MASK (0x01 << REG_ABS_SHIFT) #define REG_NO_USE_MASK (0x01 << REG_NO_USE_SHIFT) #define REG_VALID_MASK (0x01 << REG_VALID_SHIFT) +#define REG_BUILTIN_MASK (0x01 << REG_BUILTIN_SHIFT) -#define REG(type, index, vswz, sswz, nouse, valid) \ +#define REG(type, index, vswz, sswz, nouse, valid, builtin) \ (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) | \ ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) | \ ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) | \ ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) | \ + ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) | \ ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) | \ ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)) #define REG_GET_TYPE(reg) \ @@ -126,6 +129,8 @@ ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT) #define REG_GET_VALID(reg) \ ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT) +#define REG_GET_BUILTIN(reg) \ + ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT) #define REG_SET_TYPE(reg, type) \ reg = ((reg & ~REG_TYPE_MASK) | \ ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK)) @@ -144,6 +149,9 @@ #define REG_SET_VALID(reg, valid) \ reg = ((reg & ~REG_VALID_MASK) | \ ((valid << REG_VALID_SHIFT) & REG_VALID_MASK)) +#define REG_SET_BUILTIN(reg, builtin) \ + reg = ((reg & 
~REG_BUILTIN_MASK) | \ + ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)) #define REG_ABS(reg) \ reg = (reg | REG_ABS_MASK) #define REG_NEGV(reg) \ @@ -184,9 +192,6 @@ static const struct { * * REG_VSWZ/REG_SSWZ is an index into this table */ -#define SLOT_VECTOR (1<<0) -#define SLOT_SCALAR (1<<3) -#define SLOT_BOTH (SLOT_VECTOR | SLOT_SCALAR) /* mapping from SWIZZLE_* to r300 native values for scalar insns */ #define SWIZZLE_HALF 6 @@ -202,14 +207,14 @@ static const struct r300_pfs_swizzle { GLuint flags; } v_swiz[] = { /* native swizzles */ - { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_VECTOR }, - { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_VECTOR }, - { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_VECTOR }, - { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_VECTOR }, - { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SCALAR }, - { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_VECTOR }, - { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_VECTOR }, - { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_BOTH }, + { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SRC_SCALAR }, + { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH }, { MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0}, { MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0}, { MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0}, @@ -241,10 +246,10 @@ static const struct { int stride; /* difference between SRC0/1/2 */ GLuint flags; } s_swiz[] = { - { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_VECTOR 
}, - { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_VECTOR }, - { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_VECTOR }, - { R300_FPI2_ARGA_SRC0A , 1, SLOT_SCALAR }, + { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR }, + { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR }, + { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR }, + { R300_FPI2_ARGA_SRC0A , 1, SLOT_SRC_SCALAR }, { R300_FPI2_ARGA_ZERO , 0, 0 }, { R300_FPI2_ARGA_ONE , 0, 0 }, { R300_FPI2_ARGA_HALF , 0, 0 } @@ -256,6 +261,7 @@ static const GLuint undef = REG(REG_TYPE_TEMP, SWIZZLE_XYZ, SWIZZLE_W, GL_FALSE, + GL_FALSE, GL_FALSE); /* constant one source */ @@ -264,6 +270,7 @@ static const GLuint pfs_one = REG(REG_TYPE_CONST, SWIZZLE_111, SWIZZLE_ONE, GL_FALSE, + GL_TRUE, GL_TRUE); /* constant half source */ @@ -272,6 +279,7 @@ static const GLuint pfs_half = REG(REG_TYPE_CONST, SWIZZLE_HHH, SWIZZLE_HALF, GL_FALSE, + GL_TRUE, GL_TRUE); /* constant zero source */ @@ -280,6 +288,7 @@ static const GLuint pfs_zero = REG(REG_TYPE_CONST, SWIZZLE_000, SWIZZLE_ZERO, GL_FALSE, + GL_TRUE, GL_TRUE); /* @@ -291,47 +300,105 @@ static void emit_arith(struct r300_fragment_program *rp, int op, GLuint src0, GLuint src1, GLuint src2, int flags); -/* - * Helper functions prototypes +/** + * Get an R300 temporary that can be written to in the given slot. 
*/ -static int get_hw_temp(struct r300_fragment_program *rp) +static int get_hw_temp(struct r300_fragment_program *rp, int slot) { COMPILE_STATE; - int r = ffs(~cs->hwreg_in_use); - if (!r) { + int r; + + for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) { + if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot) + break; + } + + if (r >= PFS_NUM_TEMP_REGS) { ERROR("Out of hardware temps\n"); return 0; } - - cs->hwreg_in_use |= (1 << --r); + + // Reserved is used to avoid the following scenario: + // R300 temporary X is first assigned to Mesa temporary Y during vector ops + // R300 temporary X is then assigned to Mesa temporary Z for further vector ops + // Then scalar ops on Mesa temporary Z are emitted and move back in time + // to overwrite the value of temporary Y. + // End scenario. + cs->hwtemps[r].reserved = cs->hwtemps[r].free; + cs->hwtemps[r].free = -1; + + // Reset to some value that won't mess things up when the user + // tries to read from a temporary that hasn't been assigned a value yet. + // In the normal case, vector_valid and scalar_valid should be set to + // a sane value by the first emit that writes to this temporary. + cs->hwtemps[r].vector_valid = 0; + cs->hwtemps[r].scalar_valid = 0; + if (r > rp->max_temp_idx) rp->max_temp_idx = r; - + return r; } +/** + * Get an R300 temporary that will act as a TEX destination register. 
+ */ static int get_hw_temp_tex(struct r300_fragment_program *rp) { COMPILE_STATE; int r; - r = ffs(~(cs->hwreg_in_use | cs->used_in_node)); - if (!r) - return get_hw_temp(rp); /* Will cause an indirection */ + for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) { + if (cs->used_in_node & (1 << r)) + continue; + + // Note: Be very careful here + if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0) + break; + } + + if (r >= PFS_NUM_TEMP_REGS) + return get_hw_temp(rp, 0); /* Will cause an indirection */ - cs->hwreg_in_use |= (1 << --r); + cs->hwtemps[r].reserved = cs->hwtemps[r].free; + cs->hwtemps[r].free = -1; + + // Reset to some value that won't mess things up when the user + // tries to read from a temporary that hasn't been assigned a value yet. + // In the normal case, vector_valid and scalar_valid should be set to + // a sane value by the first emit that writes to this temporary. + cs->hwtemps[r].vector_valid = cs->nrslots; + cs->hwtemps[r].scalar_valid = cs->nrslots; + if (r > rp->max_temp_idx) rp->max_temp_idx = r; return r; } +/** + * Mark the given hardware register as free. + */ static void free_hw_temp(struct r300_fragment_program *rp, int idx) { COMPILE_STATE; - cs->hwreg_in_use &= ~(1<<idx); + cs->hwtemps[idx].free = cs->nrslots+1; } + +/** + * Create a new Mesa temporary register. + */ static GLuint get_temp_reg(struct r300_fragment_program *rp) { COMPILE_STATE; @@ -354,6 +421,10 @@ static GLuint get_temp_reg(struct r300_fragment_program *rp) return r; } +/** + * Create a new Mesa temporary register that will act as the destination + * register for a texture read. + */ static GLuint get_temp_reg_tex(struct r300_fragment_program *rp) { COMPILE_STATE; @@ -376,6 +447,9 @@ static GLuint get_temp_reg_tex(struct r300_fragment_program *rp) return r; } +/** + * Free a Mesa temporary and the associated R300 temporary.
+ */ static void free_temp(struct r300_fragment_program *rp, GLuint r) { COMPILE_STATE; @@ -762,10 +836,10 @@ static int t_hw_src(struct r300_fragment_program *rp, switch(REG_GET_TYPE(src)) { case REG_TYPE_TEMP: /* NOTE: if reg==-1 here, a source is being read that - * hasn't been written to. Undefined results + * hasn't been written to. Undefined results. */ if (cs->temps[index].reg == -1) - cs->temps[index].reg = get_hw_temp(rp); + cs->temps[index].reg = get_hw_temp(rp, cs->nrslots); idx = cs->temps[index].reg; @@ -795,7 +869,8 @@ static int t_hw_src(struct r300_fragment_program *rp, static int t_hw_dst(struct r300_fragment_program *rp, GLuint dest, - GLboolean tex) + GLboolean tex, + int slot) { COMPILE_STATE; int idx; @@ -806,7 +881,7 @@ static int t_hw_dst(struct r300_fragment_program *rp, case REG_TYPE_TEMP: if (cs->temps[REG_GET_INDEX(dest)].reg == -1) { if (!tex) { - cs->temps[index].reg = get_hw_temp(rp); + cs->temps[index].reg = get_hw_temp(rp, slot); } else { cs->temps[index].reg = get_hw_temp_tex(rp); } @@ -839,26 +914,20 @@ static int t_hw_dst(struct r300_fragment_program *rp, return idx; } -static void emit_nop(struct r300_fragment_program *rp, - GLuint mask, - GLboolean sync) +static void emit_nop(struct r300_fragment_program *rp) { COMPILE_STATE; - if (sync) - cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos); - - if (mask & WRITEMASK_XYZ) { - rp->alu.inst[cs->v_pos].inst0 = NOP_INST0; - rp->alu.inst[cs->v_pos].inst1 = NOP_INST1; - cs->v_pos++; - } - - if (mask & WRITEMASK_W) { - rp->alu.inst[cs->s_pos].inst2 = NOP_INST2; - rp->alu.inst[cs->s_pos].inst3 = NOP_INST3; - cs->s_pos++; + if (cs->nrslots >= PFS_MAX_ALU_INST) { + ERROR("Out of ALU instruction slots\n"); + return; } + + rp->alu.inst[cs->nrslots].inst0 = NOP_INST0; + rp->alu.inst[cs->nrslots].inst1 = NOP_INST1; + rp->alu.inst[cs->nrslots].inst2 = NOP_INST2; + rp->alu.inst[cs->nrslots].inst3 = NOP_INST3; + cs->nrslots++; } static void emit_tex(struct r300_fragment_program *rp, @@ -882,7 
+951,7 @@ static void emit_tex(struct r300_fragment_program *rp, rdest = dest; dest = get_temp_reg_tex(rp); } - hwdest = t_hw_dst(rp, dest, GL_TRUE); + hwdest = t_hw_dst(rp, dest, GL_TRUE, rp->node[rp->cur_node].alu_offset); /* Use a temp that hasn't been used in this node, rather * than causing an indirection @@ -904,15 +973,11 @@ static void emit_tex(struct r300_fragment_program *rp, (din & (1<v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos); - if (rp->node[rp->cur_node].alu_offset == cs->v_pos) { - /* No alu instructions in the node? Emit a NOP. */ - emit_nop(rp, WRITEMASK_XYZW, GL_TRUE); - cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos); - } + if (rp->node[rp->cur_node].alu_offset == cs->nrslots) + emit_nop(rp); rp->node[rp->cur_node].alu_end = - cs->v_pos - rp->node[rp->cur_node].alu_offset - 1; + cs->nrslots - rp->node[rp->cur_node].alu_offset - 1; assert(rp->node[rp->cur_node].alu_end >= 0); if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) { @@ -922,7 +987,7 @@ static void emit_tex(struct r300_fragment_program *rp, /* Start new node */ rp->node[rp->cur_node].tex_offset = rp->tex.length; - rp->node[rp->cur_node].alu_offset = cs->v_pos; + rp->node[rp->cur_node].alu_offset = cs->nrslots; rp->node[rp->cur_node].tex_end = -1; rp->node[rp->cur_node].alu_end = -1; rp->node[rp->cur_node].flags = 0; @@ -954,84 +1019,243 @@ static void emit_tex(struct r300_fragment_program *rp, } } -/* Add sources to FPI1/FPI3 lists. If source is already on list, - * reuse the index instead of wasting a source. + +/** + * Returns the first slot where we could possibly allow writing to dest, + * according to register allocation. */ -static int add_src(struct r300_fragment_program *rp, - int reg, - int pos, - int srcmask) +static int get_earliest_allowed_write( + struct r300_fragment_program* rp, + GLuint dest) { COMPILE_STATE; - int csm, i; - - /* Look for matches */ - for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) { - /* If sources have been allocated in this position(s)... 
*/ - if ((cs->slot[pos].umask & csm) == csm) { - /* ... and the register number(s) match, re-use the - source */ - if (srcmask == SLOT_VECTOR && - cs->slot[pos].vsrc[i] == reg) - return i; - if (srcmask == SLOT_SCALAR && - cs->slot[pos].ssrc[i] == reg) - return i; - if (srcmask == SLOT_BOTH && - cs->slot[pos].vsrc[i] == reg && - cs->slot[pos].ssrc[i] == reg) - return i; - } - } + int idx; + GLuint index = REG_GET_INDEX(dest); + assert(REG_GET_VALID(dest)); - /* Look for free spaces */ - for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) { - /* If the position(s) haven't been allocated */ - if ((cs->slot[pos].umask & csm) == 0) { - cs->slot[pos].umask |= csm; - - if (srcmask & SLOT_VECTOR) - cs->slot[pos].vsrc[i] = reg; - if (srcmask & SLOT_SCALAR) - cs->slot[pos].ssrc[i] = reg; - return i; - } + switch(REG_GET_TYPE(dest)) { + case REG_TYPE_TEMP: + if (cs->temps[index].reg == -1) + return 0; + + idx = cs->temps[index].reg; + break; + case REG_TYPE_OUTPUT: + return 0; + default: + ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest)); + return 0; } - //ERROR("Failed to allocate sources in FPI1/FPI3!\n"); - return 0; + return cs->hwtemps[idx].reserved; } -/* Determine whether or not to position opcode in the same ALU slot for both - * vector and scalar portions of an instruction. + +/** + * Allocates a slot for an ALU instruction that can consist of + * a vector part or a scalar part or both. + * + * Sources from src (src[0] to src[argc-1]) are added to the slot in the + * appropriate position (vector and/or scalar), and their positions are + * recorded in the srcpos array. + * + * This function emits instruction code for the source fetch and the + * argument selection. It does not emit instruction code for the + * opcode or the destination selection. * - * It's not necessary to force the first case, but it makes disassembled - * shaders easier to read. 
+ * @return the index of the slot */ -static GLboolean force_same_slot(int vop, - int sop, - GLboolean emit_vop, - GLboolean emit_sop, - int argc, - GLuint *src) +static int find_and_prepare_slot(struct r300_fragment_program* rp, + GLboolean emit_vop, + GLboolean emit_sop, + int argc, + GLuint* src, + GLuint dest) { - int i; - - if (emit_vop && emit_sop) - return GL_TRUE; + COMPILE_STATE; + int hwsrc[3]; + int srcpos[3]; + unsigned int used; + int tempused; + int tempvsrc[3]; + int tempssrc[3]; + int pos; + int regnr; + int i,j; + + // Determine instruction slots, whether sources are required on + // vector or scalar side, and the smallest slot number where + // all source registers are available + used = 0; + if (emit_vop) + used |= SLOT_OP_VECTOR; + if (emit_sop) + used |= SLOT_OP_SCALAR; + + pos = get_earliest_allowed_write(rp, dest); + + if (rp->node[rp->cur_node].alu_offset > pos) + pos = rp->node[rp->cur_node].alu_offset; + for(i = 0; i < argc; ++i) { + if (!REG_GET_BUILTIN(src[i])) { + if (emit_vop) + used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i; + if (emit_sop) + used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i; + } + + hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); /* Note: side effects wrt refcounting! 
*/ + regnr = hwsrc[i] & 31; + + if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) { + if (used & (SLOT_SRC_VECTOR << i)) { + if (cs->hwtemps[regnr].vector_valid > pos) + pos = cs->hwtemps[regnr].vector_valid; + } + if (used & (SLOT_SRC_SCALAR << i)) { + if (cs->hwtemps[regnr].scalar_valid > pos) + pos = cs->hwtemps[regnr].scalar_valid; + } + } + } + + // Find a slot that fits + for(; ; ++pos) { + if (cs->slot[pos].used & used & SLOT_OP_BOTH) + continue; + + if (pos >= cs->nrslots) { + if (cs->nrslots >= PFS_MAX_ALU_INST) { + ERROR("Out of ALU instruction slots\n"); + return -1; + } - if (emit_vop && vop == R300_FPI0_OUTC_REPL_ALPHA) - return GL_TRUE; + rp->alu.inst[pos].inst0 = NOP_INST0; + rp->alu.inst[pos].inst2 = NOP_INST2; + cs->nrslots++; + } + + // Note: When we need both parts (vector and scalar) of a source, + // we always try to put them into the same position. This makes the + // code easier to read, and it is optimal (i.e. one doesn't gain + // anything by splitting the parts). + // It also avoids headaches with swizzles that access both parts (i.e WXY) + tempused = cs->slot[pos].used; + for(i = 0; i < 3; ++i) { + tempvsrc[i] = cs->slot[pos].vsrc[i]; + tempssrc[i] = cs->slot[pos].ssrc[i]; + } + + for(i = 0; i < argc; ++i) { + int flags = (used >> i) & SLOT_SRC_BOTH; + + if (!flags) { + srcpos[i] = 0; + continue; + } + + for(j = 0; j < 3; ++j) { + if ((tempused >> j) & flags & SLOT_SRC_VECTOR) { + if (tempvsrc[j] != hwsrc[i]) + continue; + } + + if ((tempused >> j) & flags & SLOT_SRC_SCALAR) { + if (tempssrc[j] != hwsrc[i]) + continue; + } + + break; + } + + if (j == 3) + break; + + srcpos[i] = j; + tempused |= flags << j; + if (flags & SLOT_SRC_VECTOR) + tempvsrc[j] = hwsrc[i]; + if (flags & SLOT_SRC_SCALAR) + tempssrc[j] = hwsrc[i]; + } + + if (i == argc) + break; + } + + // Found a slot, reserve it + cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH); + for(i = 0; i < 3; ++i) { + cs->slot[pos].vsrc[i] = tempvsrc[i]; + cs->slot[pos].ssrc[i] = tempssrc[i]; + 
} + + // Emit the source fetch code + rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK; + rp->alu.inst[pos].inst1 |= + ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) | + (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) | + (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT)); + + rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK; + rp->alu.inst[pos].inst3 |= + ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) | + (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) | + (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT)); + + // Emit the argument selection code if (emit_vop) { - for (i=0;ialu.inst[pos].inst0 &= + ~(R300_FPI0_ARG0C_MASK|R300_FPI0_ARG1C_MASK|R300_FPI0_ARG2C_MASK); + rp->alu.inst[pos].inst0 |= + (swz[0] << R300_FPI0_ARG0C_SHIFT) | + (swz[1] << R300_FPI0_ARG1C_SHIFT) | + (swz[2] << R300_FPI0_ARG2C_SHIFT); + } + + if (emit_sop) { + int swz[3]; + + for(i = 0; i < 3; ++i) { + if (i < argc) { + swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base + + (srcpos[i] * s_swiz[REG_GET_SSWZ(src[i])].stride)) | + ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) | + ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0); + } else { + swz[i] = R300_FPI2_ARGA_ZERO; + } + } + + rp->alu.inst[pos].inst2 &= + ~(R300_FPI2_ARG0A_MASK|R300_FPI2_ARG1A_MASK|R300_FPI2_ARG2A_MASK); + rp->alu.inst[pos].inst2 |= + (swz[0] << R300_FPI2_ARG0A_SHIFT) | + (swz[1] << R300_FPI2_ARG1A_SHIFT) | + (swz[2] << R300_FPI2_ARG2A_SHIFT); } - return GL_FALSE; + return pos; } + +/** + * Append an ALU instruction to the instruction list. 
+ */ static void emit_arith(struct r300_fragment_program *rp, int op, GLuint dest, @@ -1043,87 +1267,31 @@ static void emit_arith(struct r300_fragment_program *rp, { COMPILE_STATE; GLuint src[3] = { src0, src1, src2 }; - int hwsrc[3], sswz[3], vswz[3]; int hwdest; - GLboolean emit_vop = GL_FALSE, emit_sop = GL_FALSE; + GLboolean emit_vop, emit_sop; int vop, sop, argc; - int vpos, spos; - int i; + int pos; vop = r300_fpop[op].v_op; sop = r300_fpop[op].s_op; argc = r300_fpop[op].argc; + if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT && + REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) + mask &= ~WRITEMASK_XYZ; + + emit_vop = GL_FALSE; + emit_sop = GL_FALSE; if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3) emit_vop = GL_TRUE; if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA) emit_sop = GL_TRUE; - if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT && - REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) - emit_vop = GL_FALSE; - - if (force_same_slot(vop, sop, emit_vop, emit_sop, argc, src)) { - vpos = spos = MAX2(cs->v_pos, cs->s_pos); - } else { - vpos = cs->v_pos; - spos = cs->s_pos; - /* Here is where we'd decide on where a safe place is to - * combine this instruction with a previous one. - * - * This is extremely simple for now.. if a source depends - * on the opposite stream, force the same instruction. - */ - for (i=0;i<3;i++) { - if (emit_vop && - (v_swiz[REG_GET_VSWZ(src[i])].flags & SLOT_SCALAR)) { - vpos = spos = MAX2(vpos, spos); - break; - } - if (emit_sop && - (s_swiz[REG_GET_SSWZ(src[i])].flags & SLOT_VECTOR)) { - vpos = spos = MAX2(vpos, spos); - break; - } - } - } + pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest); + if (pos < 0) + return; - /* - Convert src->hwsrc, record for FPI1/FPI3 - * - Determine ARG parts of FPI0/FPI2, unused args are filled - * with ARG_ZERO. 
- */ - for (i=0;i<3;i++) { - int srcpos; - - if (i >= argc) { - vswz[i] = R300_FPI0_ARGC_ZERO; - sswz[i] = R300_FPI2_ARGA_ZERO; - continue; - } - - hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); - - if (emit_vop && vop != R300_FPI0_OUTC_REPL_ALPHA) { - srcpos = add_src(rp, hwsrc[i], vpos, - v_swiz[REG_GET_VSWZ(src[i])].flags); - vswz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base + - (srcpos * - v_swiz[REG_GET_VSWZ(src[i])].stride)) | - ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) | - ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0); - } else vswz[i] = R300_FPI0_ARGC_ZERO; - - if (emit_sop) { - srcpos = add_src(rp, hwsrc[i], spos, - s_swiz[REG_GET_SSWZ(src[i])].flags); - sswz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base + - (srcpos * - s_swiz[REG_GET_SSWZ(src[i])].stride)) | - ((src[i] & REG_NEGS_MASK) ? ARG_NEG : 0) | - ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0); - } else sswz[i] = R300_FPI2_ARGA_ZERO; - } - hwdest = t_hw_dst(rp, dest, GL_FALSE); + hwdest = t_hw_dst(rp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */ if (flags & PFS_FLAG_SAT) { vop |= R300_FPI0_OUTC_SAT; @@ -1131,58 +1299,45 @@ static void emit_arith(struct r300_fragment_program *rp, } /* Throw the pieces together and get FPI0/1 */ - rp->alu.inst[vpos].inst1 = - ((cs->slot[vpos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) | - (cs->slot[vpos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) | - (cs->slot[vpos].vsrc[2] << R300_FPI1_SRC2C_SHIFT)); if (emit_vop) { - rp->alu.inst[vpos].inst0 = vop | - (vswz[0] << R300_FPI0_ARG0C_SHIFT) | - (vswz[1] << R300_FPI0_ARG1C_SHIFT) | - (vswz[2] << R300_FPI0_ARG2C_SHIFT); + rp->alu.inst[pos].inst0 |= vop; - rp->alu.inst[vpos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT; + rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT; + if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) { - rp->alu.inst[vpos].inst1 |= + rp->alu.inst[pos].inst1 |= (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT; } else assert(0); } else { - rp->alu.inst[vpos].inst1 |= + 
rp->alu.inst[pos].inst1 |= (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT; + + cs->hwtemps[hwdest].vector_valid = pos+1; } - cs->v_pos = vpos+1; - } else if (spos >= vpos) - rp->alu.inst[spos].inst0 = NOP_INST0; + } /* And now FPI2/3 */ - rp->alu.inst[spos].inst3 = - ((cs->slot[spos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) | - (cs->slot[spos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) | - (cs->slot[spos].ssrc[2] << R300_FPI3_SRC2A_SHIFT)); if (emit_sop) { - rp->alu.inst[spos].inst2 = sop | - sswz[0] << R300_FPI2_ARG0A_SHIFT | - sswz[1] << R300_FPI2_ARG1A_SHIFT | - sswz[2] << R300_FPI2_ARG2A_SHIFT; + rp->alu.inst[pos].inst2 |= sop; if (mask & WRITEMASK_W) { if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) { - rp->alu.inst[spos].inst3 |= + rp->alu.inst[pos].inst3 |= (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT; } else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) { - rp->alu.inst[spos].inst3 |= R300_FPI3_DSTA_DEPTH; + rp->alu.inst[pos].inst3 |= R300_FPI3_DSTA_DEPTH; } else assert(0); } else { - rp->alu.inst[spos].inst3 |= + rp->alu.inst[pos].inst3 |= (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG; + + cs->hwtemps[hwdest].scalar_valid = pos+1; } } - cs->s_pos = spos+1; - } else if (vpos >= spos) - rp->alu.inst[vpos].inst2 = NOP_INST2; - + } + return; } @@ -1922,7 +2077,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) for (i=0;ictx->Const.MaxTextureUnits;i++) { if (InputsRead & (FRAG_BIT_TEX0 << i)) { cs->inputs[FRAG_ATTRIB_TEX0+i].refcount = 0; - cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp); + cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp, 0); } } InputsRead &= ~FRAG_BITS_TEX_ANY; @@ -1930,7 +2085,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) /* fragment position treated as a texcoord */ if (InputsRead & FRAG_BIT_WPOS) { cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0; - cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp); + 
cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp, 0); insert_wpos(&mp->Base); } InputsRead &= ~FRAG_BIT_WPOS; @@ -1938,14 +2093,14 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) /* Then primary colour */ if (InputsRead & FRAG_BIT_COL0) { cs->inputs[FRAG_ATTRIB_COL0].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp); + cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0); } InputsRead &= ~FRAG_BIT_COL0; /* Secondary color */ if (InputsRead & FRAG_BIT_COL1) { cs->inputs[FRAG_ATTRIB_COL1].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp); + cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp, 0); } InputsRead &= ~FRAG_BIT_COL1; @@ -2030,13 +2185,12 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr } /* Finish off */ - cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos); rp->node[rp->cur_node].alu_end = - cs->v_pos - rp->node[rp->cur_node].alu_offset - 1; + cs->nrslots - rp->node[rp->cur_node].alu_offset - 1; if (rp->node[rp->cur_node].tex_end < 0) rp->node[rp->cur_node].tex_end = 0; rp->alu_offset = 0; - rp->alu_end = cs->v_pos - 1; + rp->alu_end = cs->nrslots - 1; rp->tex_offset = 0; rp->tex_end = rp->tex.length ? rp->tex.length - 1 : 0; assert(rp->node[rp->cur_node].alu_end >= 0); @@ -2053,7 +2207,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr /* just some random things... 
*/ static void dump_program(struct r300_fragment_program *rp) { - int i; + int n, i, j; static int pc = 0; fprintf(stderr, "pc=%d*************************************\n", pc++); @@ -2066,46 +2220,136 @@ static void dump_program(struct r300_fragment_program *rp) fprintf(stderr, "Hardware program\n"); fprintf(stderr, "----------------\n"); - fprintf(stderr, "tex:\n"); - - for(i=0;itex.length;i++) { - fprintf(stderr, "%08x\n", rp->tex.inst[i]); - } - - for (i=0;i<(rp->cur_node+1);i++) { + for (n = 0; n < (rp->cur_node+1); n++) { fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\ - "alu_end: %d, tex_end: %d\n", i, - rp->node[i].alu_offset, - rp->node[i].tex_offset, - rp->node[i].alu_end, - rp->node[i].tex_end); + "alu_end: %d, tex_end: %d\n", n, + rp->node[n].alu_offset, + rp->node[n].tex_offset, + rp->node[n].alu_end, + rp->node[n].tex_end); + + if (rp->tex.length) { + fprintf(stderr, " TEX:\n"); + for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_end; ++i) + fprintf(stderr, " %08x\n", rp->tex.inst[i]); + } + + for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_end; ++i) { + char srcc[3][10], dstc[20]; + char srca[3][10], dsta[20]; + char argc[3][20]; + char arga[3][20]; + + for(j = 0; j < 3; ++j) { + int regc = rp->alu.inst[i].inst1 >> (j*6); + int rega = rp->alu.inst[i].inst3 >> (j*6); + + sprintf(srcc[j], "%c%i", (regc & 32) ? 'c' : 't', regc & 31); + sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31); + } + + sprintf(dstc, "t%i.%c%c%c o%i.%c%c%c", + (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31, + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? 'x' : ' ', + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? 'y' : ' ', + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? 'z' : ' ', + (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31, + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? 'x' : ' ', + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? 'y' : ' ', + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? 
'z' : ' '); + + sprintf(dsta, "t%i.%c o%i.%c %c", + (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31, + (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) ? 'w' : ' ', + (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31, + (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) ? 'w' : ' ', + (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) ? 'Z' : ' '); + + fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n" + " w: %3s %3s %3s -> %-20s (%08x)\n", + i, + srcc[0], srcc[1], srcc[2], dstc, rp->alu.inst[i].inst1, + srca[0], srca[1], srca[2], dsta, rp->alu.inst[i].inst3); + + for(j = 0; j < 3; ++j) { + int regc = rp->alu.inst[i].inst0 >> (j*7); + int rega = rp->alu.inst[i].inst2 >> (j*7); + int d; + char buf[20]; + + d = regc & 31; + if (d < 12) { + switch(d % 4) { + case R300_FPI0_ARGC_SRC0C_XYZ: + sprintf(buf, "%s.xyz", srcc[d / 4]); + break; + case R300_FPI0_ARGC_SRC0C_XXX: + sprintf(buf, "%s.xxx", srcc[d / 4]); + break; + case R300_FPI0_ARGC_SRC0C_YYY: + sprintf(buf, "%s.yyy", srcc[d / 4]); + break; + case R300_FPI0_ARGC_SRC0C_ZZZ: + sprintf(buf, "%s.zzz", srcc[d / 4]); + break; + } + } else if (d < 15) { + sprintf(buf, "%s.www", srca[d-12]); + } else if (d == 20) { + sprintf(buf, "0.0"); + } else if (d == 21) { + sprintf(buf, "1.0"); + } else if (d == 22) { + sprintf(buf, "0.5"); + } else if (d >= 23 && d < 32) { + d -= 23; + switch(d/3) { + case 0: + sprintf(buf, "%s.yzx", srcc[d % 3]); + break; + case 1: + sprintf(buf, "%s.zxy", srcc[d % 3]); + break; + case 2: + sprintf(buf, "%s.Wzy", srcc[d % 3]); + break; + } + } else { + sprintf(buf, "%i", d); + } + + sprintf(argc[j], "%s%s%s%s", + (regc & 32) ? "-" : "", + (regc & 64) ? "|" : "", + buf, + (regc & 64) ? 
"|" : ""); + + d = rega & 31; + if (d < 9) { + sprintf(buf, "%s.%c", srcc[d / 3], 'x' + (char)(d%3)); + } else if (d < 12) { + sprintf(buf, "%s.w", srca[d-9]); + } else if (d == 16) { + sprintf(buf, "0.0"); + } else if (d == 17) { + sprintf(buf, "1.0"); + } else if (d == 18) { + sprintf(buf, "0.5"); + } else { + sprintf(buf, "%i", d); + } + + sprintf(arga[j], "%s%s%s%s", + (rega & 32) ? "-" : "", + (rega & 64) ? "|" : "", + buf, + (rega & 64) ? "|" : ""); + } + + fprintf(stderr, " xyz: %8s %8s %8s op: %08x\n" + " w: %8s %8s %8s op: %08x\n", + argc[0], argc[1], argc[2], rp->alu.inst[i].inst0, + arga[0], arga[1], arga[2], rp->alu.inst[i].inst2); + } } - - fprintf(stderr, "%08x\n", - ((rp->tex_end << 16) | (R300_PFS_TEXI_0 >> 2))); - for (i=0;i<=rp->tex_end;i++) - fprintf(stderr, "%08x\n", rp->tex.inst[i]); - - /* dump program in pretty_print_command_stream.tcl-readable format */ - fprintf(stderr, "%08x\n", - ((rp->alu_end << 16) | (R300_PFS_INSTR0_0 >> 2))); - for (i=0;i<=rp->alu_end;i++) - fprintf(stderr, "%08x\n", rp->alu.inst[i].inst0); - - fprintf(stderr, "%08x\n", - ((rp->alu_end << 16) | (R300_PFS_INSTR1_0 >> 2))); - for (i=0;i<=rp->alu_end;i++) - fprintf(stderr, "%08x\n", rp->alu.inst[i].inst1); - - fprintf(stderr, "%08x\n", - ((rp->alu_end << 16) | (R300_PFS_INSTR2_0 >> 2))); - for (i=0;i<=rp->alu_end;i++) - fprintf(stderr, "%08x\n", rp->alu.inst[i].inst2); - - fprintf(stderr, "%08x\n", - ((rp->alu_end << 16) | (R300_PFS_INSTR3_0 >> 2))); - for (i=0;i<=rp->alu_end;i++) - fprintf(stderr, "%08x\n", rp->alu.inst[i].inst3); - - fprintf(stderr, "00000000\n"); } diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h index 3de15752b1..1f4a2d2e64 100644 --- a/src/mesa/drivers/dri/r300/r300_reg.h +++ b/src/mesa/drivers/dri/r300/r300_reg.h @@ -1047,7 +1047,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. * WRT swizzling. 
If, for example, you want to load an R component into an * Alpha operand, this R component is taken from a *color* source, not from * an alpha source. The corresponding register doesn't even have to appear in - * the alpha sources list. (I hope this alll makes sense to you) + * the alpha sources list. (I hope this all makes sense to you) * * Destination selection * The destination register index is in FPI1 (color) and FPI3 (alpha) @@ -1074,6 +1074,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R300_FPI1_SRC2C_SHIFT 12 # define R300_FPI1_SRC2C_MASK (31 << 12) # define R300_FPI1_SRC2C_CONST (1 << 17) +# define R300_FPI1_SRC_MASK 0x0003ffff # define R300_FPI1_DSTC_SHIFT 18 # define R300_FPI1_DSTC_MASK (31 << 18) # define R300_FPI1_DSTC_REG_MASK_SHIFT 23 @@ -1095,6 +1096,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R300_FPI3_SRC2A_SHIFT 12 # define R300_FPI3_SRC2A_MASK (31 << 12) # define R300_FPI3_SRC2A_CONST (1 << 17) +# define R300_FPI3_SRC_MASK 0x0003ffff # define R300_FPI3_DSTA_SHIFT 18 # define R300_FPI3_DSTA_MASK (31 << 18) # define R300_FPI3_DSTA_REG (1 << 23) -- cgit v1.2.3 From a8e65a010c17444c63859c17786ecb4010bd49c1 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sun, 18 Mar 2007 12:46:53 +0100 Subject: r300: Fix hw fragment program dump Dumps of fragment programs were incorrect when the program consisted of multiple nodes. Also, improved the formatting a bit. 
--- src/mesa/drivers/dri/r300/r300_fragprog.c | 51 ++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 17 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index b2c89ccb36..c3d902a4aa 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -2230,15 +2230,16 @@ static void dump_program(struct r300_fragment_program *rp) if (rp->tex.length) { fprintf(stderr, " TEX:\n"); - for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_end; ++i) + for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i) fprintf(stderr, " %08x\n", rp->tex.inst[i]); } - for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_end; ++i) { + for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_offset+rp->node[n].alu_end; ++i) { char srcc[3][10], dstc[20]; char srca[3][10], dsta[20]; char argc[3][20]; char arga[3][20]; + char flags[5], tmp[10]; for(j = 0; j < 3; ++j) { int regc = rp->alu.inst[i].inst1 >> (j*6); @@ -2248,22 +2249,38 @@ static void dump_program(struct r300_fragment_program *rp) sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31); } - sprintf(dstc, "t%i.%c%c%c o%i.%c%c%c", - (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31, - (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? 'x' : ' ', - (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? 'y' : ' ', - (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? 'z' : ' ', - (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31, - (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? 'x' : ' ', - (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? 'y' : ' ', - (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? 'z' : ' '); + dstc[0] = 0; + sprintf(flags, "%s%s%s", + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "", + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? "y" : "", + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? 
"z" : ""); + if (flags[0] != 0) { + sprintf(dstc, "t%i.%s ", + (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31, + flags); + } + sprintf(flags, "%s%s%s", + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? "x" : "", + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? "y" : "", + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? "z" : ""); + if (flags[0] != 0) { + sprintf(tmp, "o%i.%s", + (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31, + flags); + strcat(dstc, tmp); + } - sprintf(dsta, "t%i.%c o%i.%c %c", - (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31, - (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) ? 'w' : ' ', - (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31, - (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) ? 'w' : ' ', - (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) ? 'Z' : ' '); + dsta[0] = 0; + if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) { + sprintf(dsta, "t%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31); + } + if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) { + sprintf(tmp, "o%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31); + strcat(dsta, tmp); + } + if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) { + strcat(dsta, "Z"); + } fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n" " w: %3s %3s %3s -> %-20s (%08x)\n", -- cgit v1.2.3 From ec1a77c86481d7f77542fbecda0e81b74732c90f Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sun, 18 Mar 2007 13:09:21 +0100 Subject: r300: Fragment program dumps format tex instructions --- src/mesa/drivers/dri/r300/r300_fragprog.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index c3d902a4aa..3c54830312 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -2230,8 +2230,34 @@ static void dump_program(struct 
r300_fragment_program *rp) if (rp->tex.length) { fprintf(stderr, " TEX:\n"); - for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i) - fprintf(stderr, " %08x\n", rp->tex.inst[i]); + for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i) { + const char* instr; + + switch((rp->tex.inst[i] >> R300_FPITX_OPCODE_SHIFT) & 15) { + case R300_FPITX_OP_TEX: + instr = "TEX"; + break; + case R300_FPITX_OP_KIL: + instr = "KIL"; + break; + case R300_FPITX_OP_TXP: + instr = "TXP"; + break; + case R300_FPITX_OP_TXB: + instr = "TXB"; + break; + default: + instr = "UNKNOWN"; + } + + fprintf(stderr, " %s t%i, %c%i, texture[%i] (%08x)\n", + instr, + (rp->tex.inst[i] >> R300_FPITX_DST_SHIFT) & 31, + (rp->tex.inst[i] & R300_FPITX_SRC_CONST) ? 'c': 't', + (rp->tex.inst[i] >> R300_FPITX_SRC_SHIFT) & 31, + (rp->tex.inst[i] & R300_FPITX_IMAGE_MASK) >> R300_FPITX_IMAGE_SHIFT, + rp->tex.inst[i]); + } } for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_offset+rp->node[n].alu_end; ++i) { -- cgit v1.2.3 From ff6ab9b45b180ab9bf261afa50888e6e740d7924 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sun, 18 Mar 2007 13:29:18 +0100 Subject: r300: Fix fragment program reordering Do not move an instruction that writes to a temp forward past an instruction that reads the same temporary. --- src/mesa/drivers/dri/r300/r300_context.h | 5 +++++ src/mesa/drivers/dri/r300/r300_fragprog.c | 37 ++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index bc43953ff3..29436ab9e0 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -674,6 +674,11 @@ struct reg_lifetime { emitted instruction that writes to the register */ int vector_valid; int scalar_valid; + + /* Index to the slot where the register was last read. 
+ This is also the first slot in which the register may be written again */ + int vector_lastread; + int scalar_lastread; }; diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 3c54830312..89e9f6531a 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -1026,10 +1026,11 @@ static void emit_tex(struct r300_fragment_program *rp, */ static int get_earliest_allowed_write( struct r300_fragment_program* rp, - GLuint dest) + GLuint dest, int mask) { COMPILE_STATE; int idx; + int pos; GLuint index = REG_GET_INDEX(dest); assert(REG_GET_VALID(dest)); @@ -1047,7 +1048,17 @@ static int get_earliest_allowed_write( return 0; } - return cs->hwtemps[idx].reserved; + pos = cs->hwtemps[idx].reserved; + if (mask & WRITEMASK_XYZ) { + if (pos < cs->hwtemps[idx].vector_lastread) + pos = cs->hwtemps[idx].vector_lastread; + } + if (mask & WRITEMASK_W) { + if (pos < cs->hwtemps[idx].scalar_lastread) + pos = cs->hwtemps[idx].scalar_lastread; + } + + return pos; } @@ -1070,7 +1081,8 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, GLboolean emit_sop, int argc, GLuint* src, - GLuint dest) + GLuint dest, + int mask) { COMPILE_STATE; int hwsrc[3]; @@ -1092,7 +1104,7 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, if (emit_sop) used |= SLOT_OP_SCALAR; - pos = get_earliest_allowed_write(rp, dest); + pos = get_earliest_allowed_write(rp, dest, mask); if (rp->node[rp->cur_node].alu_offset > pos) pos = rp->node[rp->cur_node].alu_offset; @@ -1191,6 +1203,21 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, cs->slot[pos].ssrc[i] = tempssrc[i]; } + for(i = 0; i < argc; ++i) { + if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) { + int regnr = hwsrc[i] & 31; + + if (used & (SLOT_SRC_VECTOR << i)) { + if (cs->hwtemps[regnr].vector_lastread < pos) + cs->hwtemps[regnr].vector_lastread = pos; + } + if (used & (SLOT_SRC_SCALAR << i)) { + if 
(cs->hwtemps[regnr].scalar_lastread < pos) + cs->hwtemps[regnr].scalar_lastread = pos; + } + } + } + // Emit the source fetch code rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK; rp->alu.inst[pos].inst1 |= @@ -1287,7 +1314,7 @@ static void emit_arith(struct r300_fragment_program *rp, if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA) emit_sop = GL_TRUE; - pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest); + pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest, mask); if (pos < 0) return; -- cgit v1.2.3 From b645e8c96dc1e3b153cf882c8931f10e0c006f04 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sun, 18 Mar 2007 18:32:32 +0100 Subject: r300: Streamlined fragment program LIT implementation Fix a bug in the LIT implementation (clamp exponent to 128, not 0.5) and change the implementation around. In theory, the new implementation needs as few as 5 instruction slots. Unfortunately, the dependency analysis in find_and_prepare_slot is not strong enough to look at individual components of a register yet. --- src/mesa/drivers/dri/r300/r300_fragprog.c | 163 ++++++++++++++++++------------ 1 file changed, 101 insertions(+), 62 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 89e9f6531a..b0681e2808 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -492,7 +492,7 @@ static GLuint emit_param4fv(struct r300_fragment_program *rp, return r; } -static GLuint emit_const4fv(struct r300_fragment_program *rp, GLfloat *cp) +static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp) { GLuint r = undef; GLuint index; @@ -1405,15 +1405,112 @@ static void make_sin_const(struct r300_fragment_program *rp) } } +/** + * Emit a LIT instruction. 
+ * \p flags may be PFS_FLAG_SAT + * + * Definition of LIT (from ARB_fragment_program): + * tmp = VectorLoad(op0); + * if (tmp.x < 0) tmp.x = 0; + * if (tmp.y < 0) tmp.y = 0; + * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon); + * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon; + * result.x = 1.0; + * result.y = tmp.x; + * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0; + * result.w = 1.0; + * + * The longest path of computation is the one leading to result.z, + * consisting of 5 operations. This implementation of LIT takes + * 5 slots. So unless there's some special undocumented opcode, + * this implementation is potentially optimal. Unfortunately, + * emit_arith is a bit too conservative because it doesn't understand + * partial writes to the vector component. + */ +static void emit_lit(struct r300_fragment_program *rp, + GLuint dest, + int mask, + GLuint src, + int flags) +{ + COMPILE_STATE; + static const GLfloat cnstv[4] = { 127.999999, 127.999999, 127.999999, -127.999999 }; + GLuint cnst; + int needTemporary; + GLuint temp; + + cnst = emit_const4fv(rp, cnstv); + + needTemporary = 0; + if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) { + needTemporary = 1; + } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { + // LIT is typically followed by DP3/DP4, so there's no point + // in creating special code for this case + needTemporary = 1; + } + + if (needTemporary) { + temp = keep(get_temp_reg(rp)); + } else { + temp = keep(dest); + } + + // Note: The order of emit_arith inside the slots is relevant, + // because emit_arith only looks at scalar vs. vector when resolving + // dependencies, and it does not consider individual vector components, + // so swizzling between the two parts can create fake dependencies. 
+ + // First slot + emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_XY, + keep(src), pfs_zero, undef, 0); + emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_W, + src, cnst, undef, 0); + + // Second slot + emit_arith(rp, PFS_OP_MIN, temp, WRITEMASK_Z, + swizzle(temp, W, W, W, W), cnst, undef, 0); + emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W, + swizzle(temp, Y, Y, Y, Y), undef, undef, 0); + + // Third slot + // If desired, we saturate the y result here. + // This does not affect the use as a condition variable in the CMP later + emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W, + temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0); + emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y, + swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags); + + // Fourth slot + emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X, + pfs_one, pfs_one, pfs_zero, 0); + emit_arith(rp, PFS_OP_EX2, temp, WRITEMASK_W, + temp, undef, undef, 0); + + // Fifth slot + emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z, + swizzle(temp, W, W, W, W), pfs_zero, swizzle(temp, Y, Y, Y, Y), flags); + emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W, + pfs_one, pfs_one, pfs_zero, 0); + + if (needTemporary) { + emit_arith(rp, PFS_OP_MAD, dest, mask, + temp, pfs_one, pfs_zero, flags); + free_temp(rp, temp); + } else { + // Decrease refcount of the destination + t_hw_dst(rp, dest, GL_FALSE, cs->nrslots); + } +} + + static GLboolean parse_program(struct r300_fragment_program *rp) { struct gl_fragment_program *mp = &rp->mesa_program; const struct prog_instruction *inst = mp->Base.Instructions; struct prog_instruction *fpi; GLuint src[3], dest, temp[2]; - GLuint cnst; int flags, mask = 0; - GLfloat cnstv[4] = {0.0, 0.0, 0.0, 0.0}; if (!inst || inst[0].Opcode == OPCODE_END) { ERROR("empty program?\n"); @@ -1612,66 +1709,8 @@ static GLboolean parse_program(struct r300_fragment_program *rp) flags); break; case OPCODE_LIT: - /* LIT - * if (s.x < 0) t.x = 0; else t.x = s.x; - * if (s.y < 0) t.y = 0; else t.y = s.y; - * if (s.w > 128.0) t.w = 128.0; else t.w 
= s.w; - * if (s.w < -128.0) t.w = -128.0; else t.w = s.w; - * r.x = 1.0 - * if (t.x > 0) r.y = pow(t.y, t.w); else r.y = 0; - * Also r.y = 0 if t.y < 0 - * For the t.x > 0 FGLRX use the CMPH opcode which - * change the compare to (t.x + 0.5) > 0.5 we may - * save one instruction by doing CMP -t.x - */ - cnstv[0] = cnstv[1] = cnstv[2] = cnstv[3] = 0.50001; src[0] = t_src(rp, fpi->SrcReg[0]); - temp[0] = get_temp_reg(rp); - cnst = emit_const4fv(rp, cnstv); - emit_arith(rp, PFS_OP_CMP, temp[0], - WRITEMASK_X | WRITEMASK_Y, - src[0], pfs_zero, src[0], flags); - emit_arith(rp, PFS_OP_MIN, temp[0], WRITEMASK_Z, - swizzle(keep(src[0]), W, W, W, W), - cnst, undef, flags); - emit_arith(rp, PFS_OP_LG2, temp[0], WRITEMASK_W, - swizzle(temp[0], Y, Y, Y, Y), - undef, undef, flags); - emit_arith(rp, PFS_OP_MAX, temp[0], WRITEMASK_Z, - temp[0], negate(cnst), undef, flags); - emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W, - temp[0], swizzle(temp[0], Z, Z, Z, Z), - pfs_zero, flags); - emit_arith(rp, PFS_OP_EX2, temp[0], WRITEMASK_W, - temp[0], undef, undef, flags); - emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Y, - swizzle(keep(temp[0]), X, X, X, X), - pfs_one, pfs_zero, flags); -#if 0 - emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X, - temp[0], pfs_one, pfs_half, flags); - emit_arith(rp, PFS_OP_CMPH, temp[0], WRITEMASK_Z, - swizzle(keep(temp[0]), W, W, W, W), - pfs_zero, swizzle(keep(temp[0]), X, X, X, X), - flags); -#else - emit_arith(rp, PFS_OP_CMP, temp[0], WRITEMASK_Z, - pfs_zero, - swizzle(keep(temp[0]), W, W, W, W), - negate(swizzle(keep(temp[0]), X, X, X, X)), - flags); -#endif - emit_arith(rp, PFS_OP_CMP, dest, WRITEMASK_Z, - pfs_zero, temp[0], - negate(swizzle(keep(temp[0]), Y, Y, Y, Y)), - flags); - emit_arith(rp, PFS_OP_MAD, dest, - WRITEMASK_X | WRITEMASK_W, - pfs_one, - pfs_one, - pfs_zero, - flags); - free_temp(rp, temp[0]); + emit_lit(rp, dest, mask, src[0], flags); break; case OPCODE_LRP: src[0] = t_src(rp, fpi->SrcReg[0]); -- cgit v1.2.3 From 
c4bf863f4cb48c2de284933bb1fc725b540ee810 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 19 Mar 2007 19:45:45 +0100 Subject: r300: Fix WRITEMASK handling when writing to result.depth This is a necessary change to emit the right instructions when writing to result.depth. However, even with this test, Z-write doesn't work properly, and I don't fully understand why. In addition to this, we'll at least have to disable early-Z, but even that doesn't seem to be enough. --- src/mesa/drivers/dri/r300/r300_fragprog.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index b0681e2808..fb559e880a 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -1304,9 +1304,14 @@ static void emit_arith(struct r300_fragment_program *rp, argc = r300_fpop[op].argc; if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT && - REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) - mask &= ~WRITEMASK_XYZ; - + REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) { + if (mask & WRITEMASK_Z) { + mask = WRITEMASK_W; + } else { + return; + } + } + emit_vop = GL_FALSE; emit_sop = GL_FALSE; if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3) -- cgit v1.2.3 From 7b992d024b20df111db007286e5a54afcb531fb1 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 19 Mar 2007 19:46:25 +0100 Subject: r300: Whitespace cleanup (remove trailing spaces) --- src/mesa/drivers/dri/r300/r300_fragprog.c | 218 +++++++++++++++--------------- 1 file changed, 109 insertions(+), 109 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index fb559e880a..93b9c39635 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -307,17 +307,17 @@ static int 
get_hw_temp(struct r300_fragment_program *rp, int slot) { COMPILE_STATE; int r; - + for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) { if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot) break; } - + if (r >= PFS_NUM_TEMP_REGS) { ERROR("Out of hardware temps\n"); return 0; } - + // Reserved is used to avoid the following scenario: // R300 temporary X is first assigned to Mesa temporary Y during vector ops // R300 temporary X is then assigned to Mesa temporary Z for further vector ops @@ -326,17 +326,17 @@ static int get_hw_temp(struct r300_fragment_program *rp, int slot) // End scenario. cs->hwtemps[r].reserved = cs->hwtemps[r].free; cs->hwtemps[r].free = -1; - + // Reset to some value that won't mess things up when the user // tries to read from a temporary that hasn't been assigned a value yet. // In the normal case, vector_valid and scalar_valid should be set to // a sane value by the first emit that writes to this temporary. cs->hwtemps[r].vector_valid = 0; cs->hwtemps[r].scalar_valid = 0; - + if (r > rp->max_temp_idx) rp->max_temp_idx = r; - + return r; } @@ -351,25 +351,25 @@ static int get_hw_temp_tex(struct r300_fragment_program *rp) for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) { if (cs->used_in_node & (1 << r)) continue; - + // Note: Be very careful here if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0) break; } - + if (r >= PFS_NUM_TEMP_REGS) return get_hw_temp(rp, 0); /* Will cause an indirection */ cs->hwtemps[r].reserved = cs->hwtemps[r].free; cs->hwtemps[r].free = -1; - + // Reset to some value that won't mess things up when the user // tries to read from a temporary that hasn't been assigned a value yet. // In the normal case, vector_valid and scalar_valid should be set to // a sane value by the first emit that writes to this temporary. 
cs->hwtemps[r].vector_valid = cs->nrslots; cs->hwtemps[r].scalar_valid = cs->nrslots; - + if (r > rp->max_temp_idx) rp->max_temp_idx = r; @@ -382,7 +382,7 @@ static int get_hw_temp_tex(struct r300_fragment_program *rp) static void free_hw_temp(struct r300_fragment_program *rp, int idx) { COMPILE_STATE; - + // Be very careful here. Consider sequences like // MAD r0, r1,r2,r3 // TEX r4, ... @@ -457,7 +457,7 @@ static void free_temp(struct r300_fragment_program *rp, GLuint r) if (!(cs->temp_in_use & (1 << index))) return; - + if (REG_GET_TYPE(r) == REG_TYPE_TEMP) { free_hw_temp(rp, cs->temps[index].reg); cs->temps[index].reg = -1; @@ -493,7 +493,7 @@ static GLuint emit_param4fv(struct r300_fragment_program *rp, } static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp) -{ +{ GLuint r = undef; GLuint index; @@ -691,7 +691,7 @@ static GLuint do_swizzle(struct r300_fragment_program *rp, GLuint offset; for(i=0; i < 4; ++i){ offset = GET_SWZ(arbswz, i); - + newswz |= (offset <= 3)?GET_SWZ(vsrcswz, offset) << i*3:offset << i*3; } @@ -800,7 +800,7 @@ static GLuint t_dst(struct r300_fragment_program *rp, struct prog_dst_register dest) { GLuint r = undef; - + switch (dest.File) { case PROGRAM_TEMPORARY: REG_SET_INDEX(r, dest.Index); @@ -910,19 +910,19 @@ static int t_hw_dst(struct r300_fragment_program *rp, ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest)); return 0; } - + return idx; } static void emit_nop(struct r300_fragment_program *rp) { COMPILE_STATE; - + if (cs->nrslots >= PFS_MAX_ALU_INST) { ERROR("Out of ALU instruction slots\n"); return; } - + rp->alu.inst[cs->nrslots].inst0 = NOP_INST0; rp->alu.inst[cs->nrslots].inst1 = NOP_INST1; rp->alu.inst[cs->nrslots].inst2 = NOP_INST2; @@ -940,7 +940,7 @@ static void emit_tex(struct r300_fragment_program *rp, GLuint din = cs->dest_in_node, uin = cs->used_in_node; int unit = fpi->TexSrcUnit; int hwsrc, hwdest; - + /* Resolve source/dest to hardware registers */ hwsrc = t_hw_src(rp, coord, GL_TRUE); 
if (opcode != R300_FPITX_OP_KIL) { @@ -952,7 +952,7 @@ static void emit_tex(struct r300_fragment_program *rp, dest = get_temp_reg_tex(rp); } hwdest = t_hw_dst(rp, dest, GL_TRUE, rp->node[rp->cur_node].alu_offset); - + /* Use a temp that hasn't been used in this node, rather * than causing an indirection */ @@ -965,17 +965,17 @@ static void emit_tex(struct r300_fragment_program *rp, hwdest = 0; unit = 0; } - + /* Indirection if source has been written in this node, or if the * dest has been read/written in this node */ if ((REG_GET_TYPE(coord) != REG_TYPE_CONST && (din & (1<node[rp->cur_node].alu_offset == cs->nrslots) emit_nop(rp); - + rp->node[rp->cur_node].alu_end = cs->nrslots - rp->node[rp->cur_node].alu_offset - 1; assert(rp->node[rp->cur_node].alu_end >= 0); @@ -989,12 +989,12 @@ static void emit_tex(struct r300_fragment_program *rp, rp->node[rp->cur_node].tex_offset = rp->tex.length; rp->node[rp->cur_node].alu_offset = cs->nrslots; rp->node[rp->cur_node].tex_end = -1; - rp->node[rp->cur_node].alu_end = -1; + rp->node[rp->cur_node].alu_end = -1; rp->node[rp->cur_node].flags = 0; cs->used_in_node = 0; cs->dest_in_node = 0; } - + if (rp->cur_node == 0) rp->first_node_has_tex = 1; @@ -1005,7 +1005,7 @@ static void emit_tex(struct r300_fragment_program *rp, /* not entirely sure about this */ | (opcode << R300_FPITX_OPCODE_SHIFT); - cs->dest_in_node |= (1 << hwdest); + cs->dest_in_node |= (1 << hwdest); if (REG_GET_TYPE(coord) != REG_TYPE_CONST) cs->used_in_node |= (1 << hwsrc); @@ -1038,7 +1038,7 @@ static int get_earliest_allowed_write( case REG_TYPE_TEMP: if (cs->temps[index].reg == -1) return 0; - + idx = cs->temps[index].reg; break; case REG_TYPE_OUTPUT: @@ -1047,7 +1047,7 @@ static int get_earliest_allowed_write( ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest)); return 0; } - + pos = cs->hwtemps[idx].reserved; if (mask & WRITEMASK_XYZ) { if (pos < cs->hwtemps[idx].vector_lastread) @@ -1057,7 +1057,7 @@ static int get_earliest_allowed_write( if (pos < 
cs->hwtemps[idx].scalar_lastread) pos = cs->hwtemps[idx].scalar_lastread; } - + return pos; } @@ -1094,7 +1094,7 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, int pos; int regnr; int i,j; - + // Determine instruction slots, whether sources are required on // vector or scalar side, and the smallest slot number where // all source registers are available @@ -1103,9 +1103,9 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, used |= SLOT_OP_VECTOR; if (emit_sop) used |= SLOT_OP_SCALAR; - + pos = get_earliest_allowed_write(rp, dest, mask); - + if (rp->node[rp->cur_node].alu_offset > pos) pos = rp->node[rp->cur_node].alu_offset; for(i = 0; i < argc; ++i) { @@ -1115,10 +1115,10 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, if (emit_sop) used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i; } - + hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */ regnr = hwsrc[i] & 31; - + if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) { if (used & (SLOT_SRC_VECTOR << i)) { if (cs->hwtemps[regnr].vector_valid > pos) @@ -1130,12 +1130,12 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, } } } - + // Find a slot that fits for(; ; ++pos) { if (cs->slot[pos].used & used & SLOT_OP_BOTH) continue; - + if (pos >= cs->nrslots) { if (cs->nrslots >= PFS_MAX_ALU_INST) { ERROR("Out of ALU instruction slots\n"); @@ -1147,7 +1147,7 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, cs->nrslots++; } - + // Note: When we need both parts (vector and scalar) of a source, // we always try to put them into the same position. This makes the // code easier to read, and it is optimal (i.e. 
one doesn't gain @@ -1158,32 +1158,32 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, tempvsrc[i] = cs->slot[pos].vsrc[i]; tempssrc[i] = cs->slot[pos].ssrc[i]; } - + for(i = 0; i < argc; ++i) { int flags = (used >> i) & SLOT_SRC_BOTH; - + if (!flags) { srcpos[i] = 0; continue; } - + for(j = 0; j < 3; ++j) { if ((tempused >> j) & flags & SLOT_SRC_VECTOR) { if (tempvsrc[j] != hwsrc[i]) continue; } - + if ((tempused >> j) & flags & SLOT_SRC_SCALAR) { if (tempssrc[j] != hwsrc[i]) continue; } - + break; } - + if (j == 3) break; - + srcpos[i] = j; tempused |= flags << j; if (flags & SLOT_SRC_VECTOR) @@ -1191,22 +1191,22 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, if (flags & SLOT_SRC_SCALAR) tempssrc[j] = hwsrc[i]; } - + if (i == argc) break; } - + // Found a slot, reserve it cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH); for(i = 0; i < 3; ++i) { cs->slot[pos].vsrc[i] = tempvsrc[i]; cs->slot[pos].ssrc[i] = tempssrc[i]; } - + for(i = 0; i < argc; ++i) { if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) { int regnr = hwsrc[i] & 31; - + if (used & (SLOT_SRC_VECTOR << i)) { if (cs->hwtemps[regnr].vector_lastread < pos) cs->hwtemps[regnr].vector_lastread = pos; @@ -1217,24 +1217,24 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, } } } - + // Emit the source fetch code rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK; rp->alu.inst[pos].inst1 |= ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) | (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) | (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT)); - + rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK; rp->alu.inst[pos].inst3 |= ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) | (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) | (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT)); - + // Emit the argument selection code if (emit_vop) { int swz[3]; - + for(i = 0; i < 3; ++i) { if (i < argc) { swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base + @@ -1245,7 +1245,7 @@ static 
int find_and_prepare_slot(struct r300_fragment_program* rp, swz[i] = R300_FPI0_ARGC_ZERO; } } - + rp->alu.inst[pos].inst0 &= ~(R300_FPI0_ARG0C_MASK|R300_FPI0_ARG1C_MASK|R300_FPI0_ARG2C_MASK); rp->alu.inst[pos].inst0 |= @@ -1253,10 +1253,10 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, (swz[1] << R300_FPI0_ARG1C_SHIFT) | (swz[2] << R300_FPI0_ARG2C_SHIFT); } - + if (emit_sop) { int swz[3]; - + for(i = 0; i < 3; ++i) { if (i < argc) { swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base + @@ -1267,7 +1267,7 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, swz[i] = R300_FPI2_ARGA_ZERO; } } - + rp->alu.inst[pos].inst2 &= ~(R300_FPI2_ARG0A_MASK|R300_FPI2_ARG1A_MASK|R300_FPI2_ARG2A_MASK); rp->alu.inst[pos].inst2 |= @@ -1322,9 +1322,9 @@ static void emit_arith(struct r300_fragment_program *rp, pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest, mask); if (pos < 0) return; - + hwdest = t_hw_dst(rp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */ - + if (flags & PFS_FLAG_SAT) { vop |= R300_FPI0_OUTC_SAT; sop |= R300_FPI2_OUTA_SAT; @@ -1335,7 +1335,7 @@ static void emit_arith(struct r300_fragment_program *rp, rp->alu.inst[pos].inst0 |= vop; rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT; - + if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) { rp->alu.inst[pos].inst1 |= @@ -1344,7 +1344,7 @@ static void emit_arith(struct r300_fragment_program *rp, } else { rp->alu.inst[pos].inst1 |= (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT; - + cs->hwtemps[hwdest].vector_valid = pos+1; } } @@ -1356,7 +1356,7 @@ static void emit_arith(struct r300_fragment_program *rp, if (mask & WRITEMASK_W) { if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) { - rp->alu.inst[pos].inst3 |= + rp->alu.inst[pos].inst3 |= (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT; } else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) { 
rp->alu.inst[pos].inst3 |= R300_FPI3_DSTA_DEPTH; @@ -1364,12 +1364,12 @@ static void emit_arith(struct r300_fragment_program *rp, } else { rp->alu.inst[pos].inst3 |= (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG; - + cs->hwtemps[hwdest].scalar_valid = pos+1; } } } - + return; } @@ -1443,9 +1443,9 @@ static void emit_lit(struct r300_fragment_program *rp, GLuint cnst; int needTemporary; GLuint temp; - + cnst = emit_const4fv(rp, cnstv); - + needTemporary = 0; if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) { needTemporary = 1; @@ -1454,30 +1454,30 @@ static void emit_lit(struct r300_fragment_program *rp, // in creating special code for this case needTemporary = 1; } - + if (needTemporary) { temp = keep(get_temp_reg(rp)); } else { temp = keep(dest); } - + // Npte: The order of emit_arith inside the slots is relevant, // because emit_arith only looks at scalar vs. vector when resolving // dependencies, and it does not consider individual vector components, // so swizzling between the two parts can create fake dependencies. - + // First slot emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_XY, keep(src), pfs_zero, undef, 0); emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0); - + // Second slot emit_arith(rp, PFS_OP_MIN, temp, WRITEMASK_Z, swizzle(temp, W, W, W, W), cnst, undef, 0); emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W, swizzle(temp, Y, Y, Y, Y), undef, undef, 0); - + // Third slot // If desired, we saturate the y result here. 
// This does not affect the use as a condition variable in the CMP later @@ -1485,19 +1485,19 @@ static void emit_lit(struct r300_fragment_program *rp, temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0); emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y, swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags); - + // Fourth slot emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X, pfs_one, pfs_one, pfs_zero, 0); emit_arith(rp, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0); - + // Fifth slot emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z, swizzle(temp, W, W, W, W), pfs_zero, swizzle(temp, Y, Y, Y, Y), flags); emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one, pfs_zero, 0); - + if (needTemporary) { emit_arith(rp, PFS_OP_MAD, dest, mask, temp, pfs_one, pfs_zero, flags); @@ -1510,7 +1510,7 @@ static void emit_lit(struct r300_fragment_program *rp, static GLboolean parse_program(struct r300_fragment_program *rp) -{ +{ struct gl_fragment_program *mp = &rp->mesa_program; const struct prog_instruction *inst = mp->Base.Instructions; struct prog_instruction *fpi; @@ -1604,7 +1604,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) absolute(swizzle(temp[0], Z, Z, Z, Z)), swizzle(temp[0], X, X, X, X), 0); - + emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y, swizzle(temp[0], X, X, X, X), absolute(swizzle(temp[0], X, X, X, X)), @@ -1648,12 +1648,12 @@ static GLboolean parse_program(struct r300_fragment_program *rp) 0); emit_arith(rp, PFS_OP_DP4, dest, mask, temp[0], src[1], undef, - flags); + flags); free_temp(rp, temp[0]); #else emit_arith(rp, PFS_OP_DP4, dest, mask, swizzle(src[0], X, Y, Z, ONE), src[1], - undef, flags); + undef, flags); #endif break; case OPCODE_DST: @@ -1684,7 +1684,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) src[0], undef, undef, flags); break; - case OPCODE_FLR: + case OPCODE_FLR: src[0] = t_src(rp, fpi->SrcReg[0]); temp[0] = get_temp_reg(rp); /* FRC temp, src0 @@ -1734,7 +1734,7 @@ static GLboolean 
parse_program(struct r300_fragment_program *rp) src[0], src[1], temp[0], flags); free_temp(rp, temp[0]); - break; + break; case OPCODE_MAD: src[0] = t_src(rp, fpi->SrcReg[0]); src[1] = t_src(rp, fpi->SrcReg[1]); @@ -1761,7 +1761,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) case OPCODE_SWZ: src[0] = t_src(rp, fpi->SrcReg[0]); emit_arith(rp, PFS_OP_MAD, dest, mask, - src[0], pfs_one, pfs_zero, + src[0], pfs_one, pfs_zero, flags); break; case OPCODE_MUL: @@ -1774,7 +1774,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) case OPCODE_POW: src[0] = t_scalar_src(rp, fpi->SrcReg[0]); src[1] = t_scalar_src(rp, fpi->SrcReg[1]); - temp[0] = get_temp_reg(rp); + temp[0] = get_temp_reg(rp); emit_arith(rp, PFS_OP_LG2, temp[0], WRITEMASK_W, src[0], undef, undef, 0); @@ -1932,7 +1932,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) absolute(swizzle(temp[0], Z, Z, Z, Z)), swizzle(temp[0], X, X, X, X), 0); - + emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y, swizzle(temp[0], X, X, X, X), absolute(swizzle(temp[0], X, X, X, X)), @@ -1989,7 +1989,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) swizzle(keep(src[1]), Y, Z, X, W), pfs_zero, 0); - /* dest.xyz = src0.yzx * src1.zxy - temp + /* dest.xyz = src0.yzx * src1.zxy - temp * dest.w = undefined * */ emit_arith(rp, PFS_OP_MAD, dest, mask & WRITEMASK_XYZ, @@ -2089,7 +2089,7 @@ static void insert_wpos(struct gl_program *prog) fpi = &prog->Instructions[prog->NumInstructions-1]; assert(fpi->Opcode == OPCODE_END); - + for(fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++){ for(i=0; i<3; i++) if( fpi->SrcReg[i].File == PROGRAM_INPUT && @@ -2106,7 +2106,7 @@ static void insert_wpos(struct gl_program *prog) static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) { struct r300_pfs_compile_state *cs = NULL; - struct gl_fragment_program *mp = &rp->mesa_program; + struct gl_fragment_program *mp = &rp->mesa_program; struct 
prog_instruction *fpi; GLuint InputsRead = mp->Base.InputsRead; GLuint temps_used = 0; /* for rp->temps[] */ @@ -2127,7 +2127,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) rp->node[0].alu_end = -1; rp->node[0].tex_end = -1; rp->const_sin[0] = -1; - + _mesa_memset(cs, 0, sizeof(*rp->cs)); for (i=0;islot[i].ssrc[j] = SRC_CONST; } } - + /* Work out what temps the Mesa inputs correspond to, this must match * what setup_rs_unit does, which shouldn't be a problem as rs_unit * configures itself based on the fragprog's InputsRead @@ -2167,7 +2167,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0); } InputsRead &= ~FRAG_BIT_COL0; - + /* Secondary color */ if (InputsRead & FRAG_BIT_COL1) { cs->inputs[FRAG_ATTRIB_COL1].refcount = 0; @@ -2194,7 +2194,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) for (fpi=mp->Base.Instructions;fpi->Opcode != OPCODE_END; fpi++) { int idx; - + for (i=0;i<3;i++) { idx = fpi->SrcReg[i].Index; switch (fpi->SrcReg[i].File) { @@ -2246,7 +2246,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr struct r300_pfs_compile_state *cs = NULL; if (!rp->translated) { - + init_program(r300, rp); cs = rp->cs; @@ -2254,7 +2254,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr dump_program(rp); return; } - + /* Finish off */ rp->node[rp->cur_node].alu_end = cs->nrslots - rp->node[rp->cur_node].alu_offset - 1; @@ -2266,9 +2266,9 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr rp->tex_end = rp->tex.length ? 
rp->tex.length - 1 : 0; assert(rp->node[rp->cur_node].alu_end >= 0); assert(rp->alu_end >= 0); - + rp->translated = GL_TRUE; - if (0) dump_program(rp); + if (1) dump_program(rp); r300UpdateStateParameters(rp->ctx, _NEW_PROGRAM); } @@ -2282,7 +2282,7 @@ static void dump_program(struct r300_fragment_program *rp) static int pc = 0; fprintf(stderr, "pc=%d*************************************\n", pc++); - + fprintf(stderr, "Mesa program:\n"); fprintf(stderr, "-------------\n"); _mesa_print_program(&rp->mesa_program.Base); @@ -2290,7 +2290,7 @@ static void dump_program(struct r300_fragment_program *rp) fprintf(stderr, "Hardware program\n"); fprintf(stderr, "----------------\n"); - + for (n = 0; n < (rp->cur_node+1); n++) { fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\ "alu_end: %d, tex_end: %d\n", n, @@ -2298,12 +2298,12 @@ static void dump_program(struct r300_fragment_program *rp) rp->node[n].tex_offset, rp->node[n].alu_end, rp->node[n].tex_end); - + if (rp->tex.length) { fprintf(stderr, " TEX:\n"); for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i) { const char* instr; - + switch((rp->tex.inst[i] >> R300_FPITX_OPCODE_SHIFT) & 15) { case R300_FPITX_OP_TEX: instr = "TEX"; @@ -2320,7 +2320,7 @@ static void dump_program(struct r300_fragment_program *rp) default: instr = "UNKNOWN"; } - + fprintf(stderr, " %s t%i, %c%i, texture[%i] (%08x)\n", instr, (rp->tex.inst[i] >> R300_FPITX_DST_SHIFT) & 31, @@ -2330,22 +2330,22 @@ static void dump_program(struct r300_fragment_program *rp) rp->tex.inst[i]); } } - + for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_offset+rp->node[n].alu_end; ++i) { char srcc[3][10], dstc[20]; char srca[3][10], dsta[20]; char argc[3][20]; char arga[3][20]; char flags[5], tmp[10]; - + for(j = 0; j < 3; ++j) { int regc = rp->alu.inst[i].inst1 >> (j*6); int rega = rp->alu.inst[i].inst3 >> (j*6); - + sprintf(srcc[j], "%c%i", (regc & 32) ? 'c' : 't', regc & 31); sprintf(srca[j], "%c%i", (rega & 32) ? 
'c' : 't', rega & 31); } - + dstc[0] = 0; sprintf(flags, "%s%s%s", (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "", @@ -2366,7 +2366,7 @@ static void dump_program(struct r300_fragment_program *rp) flags); strcat(dstc, tmp); } - + dsta[0] = 0; if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) { sprintf(dsta, "t%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31); @@ -2378,13 +2378,13 @@ static void dump_program(struct r300_fragment_program *rp) if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) { strcat(dsta, "Z"); } - + fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n" " w: %3s %3s %3s -> %-20s (%08x)\n", i, srcc[0], srcc[1], srcc[2], dstc, rp->alu.inst[i].inst1, srca[0], srca[1], srca[2], dsta, rp->alu.inst[i].inst3); - + for(j = 0; j < 3; ++j) { int regc = rp->alu.inst[i].inst0 >> (j*7); int rega = rp->alu.inst[i].inst2 >> (j*7); @@ -2431,13 +2431,13 @@ static void dump_program(struct r300_fragment_program *rp) } else { sprintf(buf, "%i", d); } - + sprintf(argc[j], "%s%s%s%s", (regc & 32) ? "-" : "", (regc & 64) ? "|" : "", buf, (regc & 64) ? "|" : ""); - + d = rega & 31; if (d < 9) { sprintf(buf, "%s.%c", srcc[d / 3], 'x' + (char)(d%3)); @@ -2452,14 +2452,14 @@ static void dump_program(struct r300_fragment_program *rp) } else { sprintf(buf, "%i", d); } - + sprintf(arga[j], "%s%s%s%s", (rega & 32) ? "-" : "", (rega & 64) ? "|" : "", buf, (rega & 64) ? 
"|" : ""); } - + fprintf(stderr, " xyz: %8s %8s %8s op: %08x\n" " w: %8s %8s %8s op: %08x\n", argc[0], argc[1], argc[2], rp->alu.inst[i].inst0, -- cgit v1.2.3 From 826815a5d27d6e79e9d0e0b0fc63bb3fd092d40d Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 19 Mar 2007 20:01:20 +0100 Subject: r300: Dump fragment program after translation if RADEON_DEBUG=pixel is set --- src/mesa/drivers/dri/r300/r300_fragprog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 93b9c39635..6262dc7a44 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -2268,7 +2268,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr assert(rp->alu_end >= 0); rp->translated = GL_TRUE; - if (1) dump_program(rp); + if (RADEON_DEBUG & DEBUG_PIXEL) dump_program(rp); r300UpdateStateParameters(rp->ctx, _NEW_PROGRAM); } -- cgit v1.2.3 From b3acba87d7f5ede486cba11db036cf36dff6c29e Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 19 Mar 2007 22:17:16 +0100 Subject: r300: Clear fragment program instruction slots on first use Make sure that instruction slots are fully initialized with NOPs during find_and_prepare_slot(). This fixes a bug when a fragment program was translated more than once (e.g. due to a second call to glProgramStringARB). This partially fixes glean/fragProg1. 
--- src/mesa/drivers/dri/r300/r300_fragprog.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 6262dc7a44..3f9d83f109 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -1143,7 +1143,9 @@ static int find_and_prepare_slot(struct r300_fragment_program* rp, } rp->alu.inst[pos].inst0 = NOP_INST0; + rp->alu.inst[pos].inst1 = NOP_INST1; rp->alu.inst[pos].inst2 = NOP_INST2; + rp->alu.inst[pos].inst3 = NOP_INST3; cs->nrslots++; } -- cgit v1.2.3 From 5a6547878373798113f8b55b912abc5bfb93add5 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 19 Mar 2007 22:26:08 +0100 Subject: r300: Fix special case (tmp.x <= 0) in fragment program LIT instruction Also, fix a typo in a related comment. --- src/mesa/drivers/dri/r300/r300_fragprog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 3f9d83f109..1d462ebec8 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -1463,7 +1463,7 @@ static void emit_lit(struct r300_fragment_program *rp, temp = keep(dest); } - // Npte: The order of emit_arith inside the slots is relevant, + // Note: The order of emit_arith inside the slots is relevant, // because emit_arith only looks at scalar vs. vector when resolving // dependencies, and it does not consider individual vector components, // so swizzling between the two parts can create fake dependencies. 
@@ -1496,7 +1496,7 @@ static void emit_lit(struct r300_fragment_program *rp, // Fifth slot emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z, - swizzle(temp, W, W, W, W), pfs_zero, swizzle(temp, Y, Y, Y, Y), flags); + pfs_zero, swizzle(temp, W, W, W, W), negate(swizzle(temp, Y, Y, Y, Y)), flags); emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one, pfs_zero, 0); -- cgit v1.2.3 From 61821a41c07b6b383a275acf31ade56af2ecfb3c Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 19 Mar 2007 23:32:36 +0100 Subject: r300: Cleanup fragment program constant allocation, share constants The constant/parameter allocation was significantly simplified, removing one unnecessary copy operation of parameters. The dirty state tracking is unchanged and far from optimal, since all state is always re-fetched. Constants and parameters are now emitted only once, which significantly reduces the resource pressure on larger programs. --- src/mesa/drivers/dri/r300/r300_context.h | 20 +++--- src/mesa/drivers/dri/r300/r300_fragprog.c | 114 ++++++++++++++---------------- 2 files changed, 61 insertions(+), 73 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index 29436ab9e0..bbe44f5e7f 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -767,23 +767,21 @@ struct r300_fragment_program { int tex_offset; int tex_end; - /* Hardware constants */ - GLfloat constant[PFS_NUM_CONST_REGS][4]; + /* Hardware constants. + * Contains a pointer to the value. The destination of the pointer + * is supposed to be updated when GL state changes. + * Typically, this is either a pointer into + * gl_program_parameter_list::ParameterValues, or a pointer to a + * global constant (e.g. 
for sin/cos-approximation) + */ + const GLfloat* constant[PFS_NUM_CONST_REGS]; int const_nr; - /* Tracked parameters */ - struct { - int idx; /* hardware index */ - GLfloat *values; /* pointer to values */ - } param[PFS_NUM_CONST_REGS]; - int param_nr; - GLboolean params_uptodate; - int max_temp_idx; /* the index of the sin constant is stored here */ GLint const_sin[2]; - + GLuint optimization; }; diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 1d462ebec8..2145c48b80 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -468,47 +468,39 @@ static void free_temp(struct r300_fragment_program *rp, GLuint r) } } -static GLuint emit_param4fv(struct r300_fragment_program *rp, - GLfloat *values) +/** + * Emit a hardware constant/parameter. + * + * \p cp Stable pointer to an array of 4 floats. + * The pointer must be stable in the sense that it remains to be valid + * and hold the contents of the constant/parameter throughout the lifetime + * of the fragment program (actually, up until the next time the fragment + * program is translated). 
+ */ +static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp) { - GLuint r = undef; - GLuint index; - int pidx; + GLuint reg = undef; + int index; - pidx = rp->param_nr++; - index = rp->const_nr++; - if (pidx >= PFS_NUM_CONST_REGS || index >= PFS_NUM_CONST_REGS) { - ERROR("Out of const/param slots!\n"); - return r; + for(index = 0; index < rp->const_nr; ++index) { + if (rp->constant[index] == cp) + break; } - rp->param[pidx].idx = index; - rp->param[pidx].values = values; - rp->params_uptodate = GL_FALSE; - - REG_SET_TYPE(r, REG_TYPE_CONST); - REG_SET_INDEX(r, index); - REG_SET_VALID(r, GL_TRUE); - return r; -} - -static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp) -{ - GLuint r = undef; - GLuint index; + if (index >= rp->const_nr) { + if (index >= PFS_NUM_CONST_REGS) { + ERROR("Out of hw constants!\n"); + return reg; + } - index = rp->const_nr++; - if (index >= PFS_NUM_CONST_REGS) { - ERROR("Out of hw constants!\n"); - return r; + rp->const_nr++; + rp->constant[index] = cp; } - COPY_4V(rp->constant[index], cp); - - REG_SET_TYPE(r, REG_TYPE_CONST); - REG_SET_INDEX(r, index); - REG_SET_VALID(r, GL_TRUE); - return r; + REG_SET_TYPE(reg, REG_TYPE_CONST); + REG_SET_INDEX(reg, index); + REG_SET_VALID(reg, GL_TRUE); + return reg; } static inline GLuint negate(GLuint r) @@ -762,16 +754,16 @@ static GLuint t_src(struct r300_fragment_program *rp, REG_SET_TYPE(r, REG_TYPE_INPUT); break; case PROGRAM_LOCAL_PARAM: - r = emit_param4fv(rp, + r = emit_const4fv(rp, rp->mesa_program.Base.LocalParams[fpsrc.Index]); break; case PROGRAM_ENV_PARAM: - r = emit_param4fv(rp, + r = emit_const4fv(rp, rp->ctx->FragmentProgram.Parameters[fpsrc.Index]); break; case PROGRAM_STATE_VAR: case PROGRAM_NAMED_PARAM: - r = emit_param4fv(rp, + r = emit_const4fv(rp, rp->mesa_program.Base.Parameters->ParameterValues[fpsrc.Index]); break; default: @@ -1393,22 +1385,27 @@ static GLuint get_attrib(struct r300_fragment_program *rp, GLuint attr) } #endif 
+static GLfloat SinCosConsts[2][4] = { + { + 1.273239545, // 4/PI + -0.405284735, // -4/(PI*PI) + 3.141592654, // PI + 0.2225 // weight + }, + { + 0.75, + 0.0, + 0.159154943, // 1/(2*PI) + 6.283185307 // 2*PI + } +}; + + static void make_sin_const(struct r300_fragment_program *rp) { - if(rp->const_sin[0] == -1){ - GLfloat cnstv[4]; - - cnstv[0] = 1.273239545; // 4/PI - cnstv[1] =-0.405284735; // -4/(PI*PI) - cnstv[2] = 3.141592654; // PI - cnstv[3] = 0.2225; // weight - rp->const_sin[0] = emit_const4fv(rp, cnstv); - - cnstv[0] = 0.75; - cnstv[1] = 0.0; - cnstv[2] = 0.159154943; // 1/(2*PI) - cnstv[3] = 6.283185307; // 2*PI - rp->const_sin[1] = emit_const4fv(rp, cnstv); + if(rp->const_sin[0] == -1) { + rp->const_sin[0] = emit_const4fv(rp, SinCosConsts[0]); + rp->const_sin[1] = emit_const4fv(rp, SinCosConsts[1]); } } @@ -1434,6 +1431,8 @@ static void make_sin_const(struct r300_fragment_program *rp) * emit_arith is a bit too conservative because it doesn't understand * partial writes to the vector component. 
*/ +static const GLfloat LitConst[4] = { 127.999999, 127.999999, 127.999999, -127.999999 }; + static void emit_lit(struct r300_fragment_program *rp, GLuint dest, int mask, @@ -1441,12 +1440,11 @@ static void emit_lit(struct r300_fragment_program *rp, int flags) { COMPILE_STATE; - static const GLfloat cnstv[4] = { 127.999999, 127.999999, 127.999999, -127.999999 }; GLuint cnst; int needTemporary; GLuint temp; - cnst = emit_const4fv(rp, cnstv); + cnst = emit_const4fv(rp, LitConst); needTemporary = 0; if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) { @@ -2123,8 +2121,6 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) rp->cur_node = 0; rp->first_node_has_tex = 0; rp->const_nr = 0; - rp->param_nr = 0; - rp->params_uptodate = GL_FALSE; rp->max_temp_idx = 0; rp->node[0].alu_end = -1; rp->node[0].tex_end = -1; @@ -2231,16 +2227,10 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) static void update_params(struct r300_fragment_program *rp) { struct gl_fragment_program *mp = &rp->mesa_program; - int i; /* Ask Mesa nicely to fill in ParameterValues for us */ - if (rp->param_nr) + if (mp->Base.Parameters) _mesa_load_state_parameters(rp->ctx, mp->Base.Parameters); - - for (i=0;i<rp->param_nr;i++) - COPY_4V(rp->constant[rp->param[i].idx], rp->param[i].values); - - rp->params_uptodate = GL_TRUE; } void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_program *rp) -- cgit v1.2.3 From 4bafc547df4af0b560dcc6b72c0a6c37d7754abb Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Wed, 21 Mar 2007 00:56:38 +0100 Subject: r300: Remove the program-global const_sin index The index is no longer necessary to share constants between multiple SIN/COS/SCS instructions inside a single fragment program, and storing a tiny implementation detail like this in the fragment_program structure itself was just nasty.
--- src/mesa/drivers/dri/r300/r300_context.h | 3 -- src/mesa/drivers/dri/r300/r300_fragprog.c | 51 ++++++++++++++----------------- 2 files changed, 23 insertions(+), 31 deletions(-) (limited to 'src/mesa/drivers/dri/r300/r300_fragprog.c') diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index 68151d865e..fe261dbbc6 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -779,9 +779,6 @@ struct r300_fragment_program { int max_temp_idx; - /* the index of the sin constant is stored here */ - GLint const_sin[2]; - GLuint optimization; }; diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 2145c48b80..0d7d1f1af2 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -1401,14 +1401,6 @@ static GLfloat SinCosConsts[2][4] = { }; -static void make_sin_const(struct r300_fragment_program *rp) -{ - if(rp->const_sin[0] == -1) { - rp->const_sin[0] = emit_const4fv(rp, SinCosConsts[0]); - rp->const_sin[1] = emit_const4fv(rp, SinCosConsts[1]); - } -} - /** * Emit a LIT instruction. 
* \p flags may be PFS_FLAG_SAT @@ -1516,6 +1508,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) struct prog_instruction *fpi; GLuint src[3], dest, temp[2]; int flags, mask = 0; + int const_sin[2]; if (!inst || inst[0].Opcode == OPCODE_END) { ERROR("empty program?\n"); @@ -1568,15 +1561,16 @@ static GLboolean parse_program(struct r300_fragment_program *rp) * result = sin(x) */ temp[0] = get_temp_reg(rp); - make_sin_const(rp); + const_sin[0] = emit_const4fv(rp, SinCosConsts[0]); + const_sin[1] = emit_const4fv(rp, SinCosConsts[1]); src[0] = t_scalar_src(rp, fpi->SrcReg[0]); /* add 0.5*PI and do range reduction */ emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X, swizzle(src[0], X, X, X, X), - swizzle(rp->const_sin[1], Z, Z, Z, Z), - swizzle(rp->const_sin[1], X, X, X, X), + swizzle(const_sin[1], Z, Z, Z, Z), + swizzle(const_sin[1], X, X, X, X), 0); emit_arith(rp, PFS_OP_FRC, temp[0], WRITEMASK_X, @@ -1587,15 +1581,15 @@ static GLboolean parse_program(struct r300_fragment_program *rp) emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), - swizzle(rp->const_sin[1], W, W, W, W), //2*PI - negate(swizzle(rp->const_sin[0], Z, Z, Z, Z)), //-PI + swizzle(const_sin[1], W, W, W, W), //2*PI + negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI 0); /* SIN */ emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0], Z, Z, Z, Z), - rp->const_sin[0], + const_sin[0], pfs_zero, 0); @@ -1614,7 +1608,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) emit_arith(rp, PFS_OP_MAD, dest, mask, swizzle(temp[0], Y, Y, Y, Y), - swizzle(rp->const_sin[0], W, W, W, W), + swizzle(const_sin[0], W, W, W, W), swizzle(temp[0], X, X, X, X), flags); @@ -1808,19 +1802,20 @@ static GLboolean parse_program(struct r300_fragment_program *rp) */ temp[0] = get_temp_reg(rp); temp[1] = get_temp_reg(rp); - make_sin_const(rp); + const_sin[0] = emit_const4fv(rp, SinCosConsts[0]); + const_sin[1] = emit_const4fv(rp, SinCosConsts[1]); 
src[0] = t_scalar_src(rp, fpi->SrcReg[0]); /* x = -abs(x)+0.5*PI */ emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z, - swizzle(rp->const_sin[0], Z, Z, Z, Z), //PI + swizzle(const_sin[0], Z, Z, Z, Z), //PI pfs_half, negate(abs(swizzle(keep(src[0]), X, X, X, X))), 0); /* C*x (sin) */ emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W, - swizzle(rp->const_sin[0], Y, Y, Y, Y), + swizzle(const_sin[0], Y, Y, Y, Y), swizzle(keep(src[0]), X, X, X, X), pfs_zero, 0); @@ -1828,13 +1823,13 @@ static GLboolean parse_program(struct r300_fragment_program *rp) /* B*x, C*x (cos) */ emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0], Z, Z, Z, Z), - rp->const_sin[0], + const_sin[0], pfs_zero, 0); /* B*x (sin) */ emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_W, - swizzle(rp->const_sin[0], X, X, X, X), + swizzle(const_sin[0], X, X, X, X), keep(src[0]), pfs_zero, 0); @@ -1864,7 +1859,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) /* dest.xy = mad(temp.xy, P, temp2.wz) */ emit_arith(rp, PFS_OP_MAD, dest, mask & (WRITEMASK_X | WRITEMASK_Y), temp[0], - swizzle(rp->const_sin[0], W, W, W, W), + swizzle(const_sin[0], W, W, W, W), swizzle(temp[1], W, Z, Y, X), flags); @@ -1895,7 +1890,8 @@ static GLboolean parse_program(struct r300_fragment_program *rp) */ temp[0] = get_temp_reg(rp); - make_sin_const(rp); + const_sin[0] = emit_const4fv(rp, SinCosConsts[0]); + const_sin[1] = emit_const4fv(rp, SinCosConsts[1]); src[0] = t_scalar_src(rp, fpi->SrcReg[0]); @@ -1903,7 +1899,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X, swizzle(keep(src[0]), X, X, X, X), - swizzle(rp->const_sin[1], Z, Z, Z, Z), + swizzle(const_sin[1], Z, Z, Z, Z), pfs_half, 0); @@ -1915,15 +1911,15 @@ static GLboolean parse_program(struct r300_fragment_program *rp) emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), - swizzle(rp->const_sin[1], W, W, W, W), //2*PI - 
negate(swizzle(rp->const_sin[0], Z, Z, Z, Z)), //PI + swizzle(const_sin[1], W, W, W, W), //2*PI + negate(swizzle(const_sin[0], Z, Z, Z, Z)), //PI 0); /* SIN */ emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0], Z, Z, Z, Z), - rp->const_sin[0], + const_sin[0], pfs_zero, 0); @@ -1942,7 +1938,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp) emit_arith(rp, PFS_OP_MAD, dest, mask, swizzle(temp[0], Y, Y, Y, Y), - swizzle(rp->const_sin[0], W, W, W, W), + swizzle(const_sin[0], W, W, W, W), swizzle(temp[0], X, X, X, X), flags); @@ -2124,7 +2120,6 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) rp->max_temp_idx = 0; rp->node[0].alu_end = -1; rp->node[0].tex_end = -1; - rp->const_sin[0] = -1; _mesa_memset(cs, 0, sizeof(*rp->cs)); for (i=0;i