diff options
-rw-r--r-- | src/mesa/drivers/dri/r300/r300_context.h | 94 | ||||
-rw-r--r-- | src/mesa/drivers/dri/r300/r300_fragprog.c | 774 | ||||
-rw-r--r-- | src/mesa/drivers/dri/r300/r300_reg.h | 4 |
3 files changed, 582 insertions, 290 deletions
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index bd9ed6f170..bc43953ff3 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -647,38 +647,84 @@ struct r300_vertex_program_cont { #define PFS_NUM_TEMP_REGS 32 #define PFS_NUM_CONST_REGS 16 -/* Tracking data for Mesa registers */ +/* Mapping Mesa registers to R300 temporaries */ struct reg_acc { int reg; /* Assigned hw temp */ unsigned int refcount; /* Number of uses by mesa program */ }; +/** + * Describe the current lifetime information for an R300 temporary + */ +struct reg_lifetime { + /* Index of the first slot where this register is free in the sense + that it can be used as a new destination register. + This is -1 if the register has been assigned to a Mesa register + and the last access to the register has not yet been emitted */ + int free; + + /* Index of the first slot where this register is currently reserved. + This is used to stop e.g. a scalar operation from being moved + before the allocation time of a register that was first allocated + for a vector operation. */ + int reserved; + + /* Index of the first slot in which the register can be used as a + source without losing the value that is written by the last + emitted instruction that writes to the register */ + int vector_valid; + int scalar_valid; +}; + + +/** + * Store usage information about an ALU instruction slot during the + * compilation of a fragment program. + */ +#define SLOT_SRC_VECTOR (1<<0) +#define SLOT_SRC_SCALAR (1<<3) +#define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR) +#define SLOT_OP_VECTOR (1<<16) +#define SLOT_OP_SCALAR (1<<17) +#define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR) + +struct r300_pfs_compile_slot { + /* Bitmask indicating which parts of the slot are used, using SLOT_ constants + defined above */ + unsigned int used; + + /* Selected sources */ + int vsrc[3]; + int ssrc[3]; +}; + +/** + * Store information during compilation of fragment programs. + */ struct r300_pfs_compile_state { - int v_pos, s_pos; /* highest ALU slots used */ - - /* Track some information gathered during opcode - * construction. - * - * NOTE: Data is only set by the code, and isn't used yet. - */ - struct { - int vsrc[3]; - int ssrc[3]; - int umask; - } slot[PFS_MAX_ALU_INST]; - - /* Used to map Mesa's inputs/temps onto hardware temps */ - int temp_in_use; - struct reg_acc temps[PFS_NUM_TEMP_REGS]; - struct reg_acc inputs[32]; /* don't actually need 32... */ - - /* Track usage of hardware temps, for register allocation, - * indirection detection, etc. */ - int hwreg_in_use; - GLuint used_in_node; - GLuint dest_in_node; + int nrslots; /* number of ALU slots used so far */ + + /* Track which (parts of) slots are already filled with instructions */ + struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST]; + + /* Track the validity of R300 temporaries */ + struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS]; + + /* Used to map Mesa's inputs/temps onto hardware temps */ + int temp_in_use; + struct reg_acc temps[PFS_NUM_TEMP_REGS]; + struct reg_acc inputs[32]; /* don't actually need 32... */ + + /* Track usage of hardware temps, for register allocation, + * indirection detection, etc. */ + GLuint used_in_node; + GLuint dest_in_node; }; +/** + * Store everything about a fragment program that is needed + * to render with that program. + */ struct r300_fragment_program { struct gl_fragment_program mesa_program; diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 251fd26082..b2c89ccb36 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -94,8 +94,9 @@ #define REG_NEGV_SHIFT 18 #define REG_NEGS_SHIFT 19 #define REG_ABS_SHIFT 20 -#define REG_NO_USE_SHIFT 21 -#define REG_VALID_SHIFT 22 +#define REG_NO_USE_SHIFT 21 // Hack for refcounting +#define REG_VALID_SHIFT 22 // Does the register contain a defined value? +#define REG_BUILTIN_SHIFT 23 // Is it a builtin (like all zero/all one)? #define REG_TYPE_MASK (0x03 << REG_TYPE_SHIFT) #define REG_INDEX_MASK (0x3F << REG_INDEX_SHIFT) @@ -106,12 +107,14 @@ #define REG_ABS_MASK (0x01 << REG_ABS_SHIFT) #define REG_NO_USE_MASK (0x01 << REG_NO_USE_SHIFT) #define REG_VALID_MASK (0x01 << REG_VALID_SHIFT) +#define REG_BUILTIN_MASK (0x01 << REG_BUILTIN_SHIFT) -#define REG(type, index, vswz, sswz, nouse, valid) \ +#define REG(type, index, vswz, sswz, nouse, valid, builtin) \ (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) | \ ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) | \ ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) | \ ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) | \ + ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) | \ ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) | \ ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)) #define REG_GET_TYPE(reg) \ @@ -126,6 +129,8 @@ ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT) #define REG_GET_VALID(reg) \ ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT) +#define REG_GET_BUILTIN(reg) \ + ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT) #define REG_SET_TYPE(reg, type) \ reg = ((reg & ~REG_TYPE_MASK) | \ ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK)) @@ -144,6 +149,9 @@ #define REG_SET_VALID(reg, valid) \ reg = ((reg & ~REG_VALID_MASK) | \ ((valid << REG_VALID_SHIFT) & REG_VALID_MASK)) +#define REG_SET_BUILTIN(reg, builtin) \ + reg = ((reg & ~REG_BUILTIN_MASK) | \ + ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)) #define REG_ABS(reg) \ reg = (reg | REG_ABS_MASK) #define REG_NEGV(reg) \ @@ -184,9 +192,6 @@ static const struct { * * REG_VSWZ/REG_SSWZ is an index into this table */ -#define SLOT_VECTOR (1<<0) -#define SLOT_SCALAR (1<<3) -#define SLOT_BOTH (SLOT_VECTOR | SLOT_SCALAR) /* mapping from SWIZZLE_* to r300 native values for scalar insns */ #define SWIZZLE_HALF 6 @@ -202,14 +207,14 @@ static const struct r300_pfs_swizzle { GLuint flags; } v_swiz[] = { /* native swizzles */ - { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_VECTOR }, - { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_VECTOR }, - { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_VECTOR }, - { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_VECTOR }, - { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SCALAR }, - { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_VECTOR }, - { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_VECTOR }, - { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_BOTH }, + { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SRC_SCALAR }, + { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR }, + { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH }, { MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0}, { MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0}, { MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0}, @@ -241,10 +246,10 @@ static const struct { int stride; /* difference between SRC0/1/2 */ GLuint flags; } s_swiz[] = { - { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_VECTOR }, - { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_VECTOR }, - { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_VECTOR }, - { R300_FPI2_ARGA_SRC0A , 1, SLOT_SCALAR }, + { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR }, + { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR }, + { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR }, + { R300_FPI2_ARGA_SRC0A , 1, SLOT_SRC_SCALAR }, { R300_FPI2_ARGA_ZERO , 0, 0 }, { R300_FPI2_ARGA_ONE , 0, 0 }, { R300_FPI2_ARGA_HALF , 0, 0 } @@ -256,6 +261,7 @@ static const GLuint undef = REG(REG_TYPE_TEMP, SWIZZLE_XYZ, SWIZZLE_W, GL_FALSE, + GL_FALSE, GL_FALSE); /* constant one source */ @@ -264,6 +270,7 @@ static const GLuint pfs_one = REG(REG_TYPE_CONST, SWIZZLE_111, SWIZZLE_ONE, GL_FALSE, + GL_TRUE, GL_TRUE); /* constant half source */ @@ -272,6 +279,7 @@ static const GLuint pfs_half = REG(REG_TYPE_CONST, SWIZZLE_HHH, SWIZZLE_HALF, GL_FALSE, + GL_TRUE, GL_TRUE); /* constant zero source */ @@ -280,6 +288,7 @@ static const GLuint pfs_zero = REG(REG_TYPE_CONST, SWIZZLE_000, SWIZZLE_ZERO, GL_FALSE, + GL_TRUE, GL_TRUE); /* @@ -291,47 +300,105 @@ static void emit_arith(struct r300_fragment_program *rp, int op, GLuint src0, GLuint src1, GLuint src2, int flags); -/* - * Helper functions prototypes +/** + * Get an R300 temporary that can be written to in the given slot. */ -static int get_hw_temp(struct r300_fragment_program *rp) +static int get_hw_temp(struct r300_fragment_program *rp, int slot) { COMPILE_STATE; - int r = ffs(~cs->hwreg_in_use); - if (!r) { + int r; + + for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) { + if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot) + break; + } + + if (r >= PFS_NUM_TEMP_REGS) { ERROR("Out of hardware temps\n"); return 0; } - - cs->hwreg_in_use |= (1 << --r); + + // Reserved is used to avoid the following scenario: + // R300 temporary X is first assigned to Mesa temporary Y during vector ops + // R300 temporary X is then assigned to Mesa temporary Z for further vector ops + // Then scalar ops on Mesa temporary Z are emitted and move back in time + // to overwrite the value of temporary Y. + // End scenario. + cs->hwtemps[r].reserved = cs->hwtemps[r].free; + cs->hwtemps[r].free = -1; + + // Reset to some value that won't mess things up when the user + // tries to read from a temporary that hasn't been assigned a value yet. + // In the normal case, vector_valid and scalar_valid should be set to + // a sane value by the first emit that writes to this temporary. + cs->hwtemps[r].vector_valid = 0; + cs->hwtemps[r].scalar_valid = 0; + if (r > rp->max_temp_idx) rp->max_temp_idx = r; - + return r; } +/** + * Get an R300 temporary that will act as a TEX destination register. + */ static int get_hw_temp_tex(struct r300_fragment_program *rp) { COMPILE_STATE; int r; - r = ffs(~(cs->hwreg_in_use | cs->used_in_node)); - if (!r) - return get_hw_temp(rp); /* Will cause an indirection */ + for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) { + if (cs->used_in_node & (1 << r)) + continue; + + // Note: Be very careful here + if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0) + break; + } + + if (r >= PFS_NUM_TEMP_REGS) + return get_hw_temp(rp, 0); /* Will cause an indirection */ - cs->hwreg_in_use |= (1 << --r); + cs->hwtemps[r].reserved = cs->hwtemps[r].free; + cs->hwtemps[r].free = -1; + + // Reset to some value that won't mess things up when the user + // tries to read from a temporary that hasn't been assigned a value yet. + // In the normal case, vector_valid and scalar_valid should be set to + // a sane value by the first emit that writes to this temporary. + cs->hwtemps[r].vector_valid = cs->nrslots; + cs->hwtemps[r].scalar_valid = cs->nrslots; + if (r > rp->max_temp_idx) rp->max_temp_idx = r; return r; } +/** + * Mark the given hardware register as free. + */ static void free_hw_temp(struct r300_fragment_program *rp, int idx) { COMPILE_STATE; - cs->hwreg_in_use &= ~(1<<idx); + + // Be very careful here. Consider sequences like + // MAD r0, r1,r2,r3 + // TEX r4, ... + // The TEX instruction may be moved in front of the MAD instruction + // due to the way nodes work. We don't want to alias r1 and r4 in + // this case. + // I'm certain the register allocation could be further sanitized, + // but it's tricky because of stuff that can happen inside emit_tex + // and emit_arith. + cs->hwtemps[idx].free = cs->nrslots+1; } + +/** + * Create a new Mesa temporary register. + */ static GLuint get_temp_reg(struct r300_fragment_program *rp) { COMPILE_STATE; @@ -354,6 +421,10 @@ static GLuint get_temp_reg(struct r300_fragment_program *rp) return r; } +/** + * Create a new Mesa temporary register that will act as the destination + * register for a texture read. + */ static GLuint get_temp_reg_tex(struct r300_fragment_program *rp) { COMPILE_STATE; @@ -376,6 +447,9 @@ static GLuint get_temp_reg_tex(struct r300_fragment_program *rp) return r; } +/** + * Free a Mesa temporary and the associated R300 temporary. + */ static void free_temp(struct r300_fragment_program *rp, GLuint r) { COMPILE_STATE; @@ -762,10 +836,10 @@ static int t_hw_src(struct r300_fragment_program *rp, switch(REG_GET_TYPE(src)) { case REG_TYPE_TEMP: /* NOTE: if reg==-1 here, a source is being read that - * hasn't been written to. Undefined results + * hasn't been written to. Undefined results. */ if (cs->temps[index].reg == -1) - cs->temps[index].reg = get_hw_temp(rp); + cs->temps[index].reg = get_hw_temp(rp, cs->nrslots); idx = cs->temps[index].reg; @@ -795,7 +869,8 @@ static int t_hw_src(struct r300_fragment_program *rp, static int t_hw_dst(struct r300_fragment_program *rp, GLuint dest, - GLboolean tex) + GLboolean tex, + int slot) { COMPILE_STATE; int idx; @@ -806,7 +881,7 @@ static int t_hw_dst(struct r300_fragment_program *rp, case REG_TYPE_TEMP: if (cs->temps[REG_GET_INDEX(dest)].reg == -1) { if (!tex) { - cs->temps[index].reg = get_hw_temp(rp); + cs->temps[index].reg = get_hw_temp(rp, slot); } else { cs->temps[index].reg = get_hw_temp_tex(rp); } @@ -839,26 +914,20 @@ static int t_hw_dst(struct r300_fragment_program *rp, return idx; } -static void emit_nop(struct r300_fragment_program *rp, - GLuint mask, - GLboolean sync) +static void emit_nop(struct r300_fragment_program *rp) { COMPILE_STATE; - if (sync) - cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos); - - if (mask & WRITEMASK_XYZ) { - rp->alu.inst[cs->v_pos].inst0 = NOP_INST0; - rp->alu.inst[cs->v_pos].inst1 = NOP_INST1; - cs->v_pos++; - } - - if (mask & WRITEMASK_W) { - rp->alu.inst[cs->s_pos].inst2 = NOP_INST2; - rp->alu.inst[cs->s_pos].inst3 = NOP_INST3; - cs->s_pos++; + if (cs->nrslots >= PFS_MAX_ALU_INST) { + ERROR("Out of ALU instruction slots\n"); + return; } + + rp->alu.inst[cs->nrslots].inst0 = NOP_INST0; + rp->alu.inst[cs->nrslots].inst1 = NOP_INST1; + rp->alu.inst[cs->nrslots].inst2 = NOP_INST2; + rp->alu.inst[cs->nrslots].inst3 = NOP_INST3; + cs->nrslots++; } static void emit_tex(struct r300_fragment_program *rp, @@ -882,7 +951,7 @@ static void emit_tex(struct r300_fragment_program *rp, rdest = dest; dest = get_temp_reg_tex(rp); } - hwdest = t_hw_dst(rp, dest, GL_TRUE); + hwdest = t_hw_dst(rp, dest, GL_TRUE, rp->node[rp->cur_node].alu_offset); /* Use a temp that hasn't been used in this node, rather * than causing an indirection @@ -904,15 +973,11 @@ static void emit_tex(struct r300_fragment_program *rp, (din & (1<<hwsrc))) || (uin & (1<<hwdest))) { /* Finish off current node */ - cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos); - if (rp->node[rp->cur_node].alu_offset == cs->v_pos) { - /* No alu instructions in the node? Emit a NOP. */ - emit_nop(rp, WRITEMASK_XYZW, GL_TRUE); - cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos); - } + if (rp->node[rp->cur_node].alu_offset == cs->nrslots) + emit_nop(rp); rp->node[rp->cur_node].alu_end = - cs->v_pos - rp->node[rp->cur_node].alu_offset - 1; + cs->nrslots - rp->node[rp->cur_node].alu_offset - 1; assert(rp->node[rp->cur_node].alu_end >= 0); if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) { @@ -922,7 +987,7 @@ static void emit_tex(struct r300_fragment_program *rp, /* Start new node */ rp->node[rp->cur_node].tex_offset = rp->tex.length; - rp->node[rp->cur_node].alu_offset = cs->v_pos; + rp->node[rp->cur_node].alu_offset = cs->nrslots; rp->node[rp->cur_node].tex_end = -1; rp->node[rp->cur_node].alu_end = -1; rp->node[rp->cur_node].flags = 0; @@ -954,84 +1019,243 @@ static void emit_tex(struct r300_fragment_program *rp, } } -/* Add sources to FPI1/FPI3 lists. If source is already on list, - * reuse the index instead of wasting a source. + +/** + * Returns the first slot where we could possibly allow writing to dest, + * according to register allocation. */ -static int add_src(struct r300_fragment_program *rp, - int reg, - int pos, - int srcmask) +static int get_earliest_allowed_write( + struct r300_fragment_program* rp, + GLuint dest) { COMPILE_STATE; - int csm, i; - - /* Look for matches */ - for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) { - /* If sources have been allocated in this position(s)... */ - if ((cs->slot[pos].umask & csm) == csm) { - /* ... and the register number(s) match, re-use the - source */ - if (srcmask == SLOT_VECTOR && - cs->slot[pos].vsrc[i] == reg) - return i; - if (srcmask == SLOT_SCALAR && - cs->slot[pos].ssrc[i] == reg) - return i; - if (srcmask == SLOT_BOTH && - cs->slot[pos].vsrc[i] == reg && - cs->slot[pos].ssrc[i] == reg) - return i; - } - } + int idx; + GLuint index = REG_GET_INDEX(dest); + assert(REG_GET_VALID(dest)); - /* Look for free spaces */ - for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) { - /* If the position(s) haven't been allocated */ - if ((cs->slot[pos].umask & csm) == 0) { - cs->slot[pos].umask |= csm; - - if (srcmask & SLOT_VECTOR) - cs->slot[pos].vsrc[i] = reg; - if (srcmask & SLOT_SCALAR) - cs->slot[pos].ssrc[i] = reg; - return i; - } + switch(REG_GET_TYPE(dest)) { + case REG_TYPE_TEMP: + if (cs->temps[index].reg == -1) + return 0; + + idx = cs->temps[index].reg; + break; + case REG_TYPE_OUTPUT: + return 0; + default: + ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest)); + return 0; } - //ERROR("Failed to allocate sources in FPI1/FPI3!\n"); - return 0; + return cs->hwtemps[idx].reserved; } -/* Determine whether or not to position opcode in the same ALU slot for both - * vector and scalar portions of an instruction. + +/** + * Allocates a slot for an ALU instruction that can consist of + * a vertex part or a scalar part or both. + * + * Sources from src (src[0] to src[argc-1]) are added to the slot in the + * appropriate position (vector and/or scalar), and their positions are + * recorded in the srcpos array. + * + * This function emits instruction code for the source fetch and the + * argument selection. It does not emit instruction code for the + * opcode or the destination selection. * - * It's not necessary to force the first case, but it makes disassembled - * shaders easier to read. + * @return the index of the slot */ -static GLboolean force_same_slot(int vop, - int sop, - GLboolean emit_vop, - GLboolean emit_sop, - int argc, - GLuint *src) +static int find_and_prepare_slot(struct r300_fragment_program* rp, + GLboolean emit_vop, + GLboolean emit_sop, + int argc, + GLuint* src, + GLuint dest) { - int i; - - if (emit_vop && emit_sop) - return GL_TRUE; + COMPILE_STATE; + int hwsrc[3]; + int srcpos[3]; + unsigned int used; + int tempused; + int tempvsrc[3]; + int tempssrc[3]; + int pos; + int regnr; + int i,j; + + // Determine instruction slots, whether sources are required on + // vector or scalar side, and the smallest slot number where + // all source registers are available + used = 0; + if (emit_vop) + used |= SLOT_OP_VECTOR; + if (emit_sop) + used |= SLOT_OP_SCALAR; + + pos = get_earliest_allowed_write(rp, dest); + + if (rp->node[rp->cur_node].alu_offset > pos) + pos = rp->node[rp->cur_node].alu_offset; + for(i = 0; i < argc; ++i) { + if (!REG_GET_BUILTIN(src[i])) { + if (emit_vop) + used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i; + if (emit_sop) + used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i; + } + + hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */ + regnr = hwsrc[i] & 31; + + if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) { + if (used & (SLOT_SRC_VECTOR << i)) { + if (cs->hwtemps[regnr].vector_valid > pos) + pos = cs->hwtemps[regnr].vector_valid; + } + if (used & (SLOT_SRC_SCALAR << i)) { + if (cs->hwtemps[regnr].scalar_valid > pos) + pos = cs->hwtemps[regnr].scalar_valid; + } + } + } + + // Find a slot that fits + for(; ; ++pos) { + if (cs->slot[pos].used & used & SLOT_OP_BOTH) + continue; + + if (pos >= cs->nrslots) { + if (cs->nrslots >= PFS_MAX_ALU_INST) { + ERROR("Out of ALU instruction slots\n"); + return -1; + } - if (emit_vop && vop == R300_FPI0_OUTC_REPL_ALPHA) - return GL_TRUE; + rp->alu.inst[pos].inst0 = NOP_INST0; + rp->alu.inst[pos].inst2 = NOP_INST2; + cs->nrslots++; + } + + // Note: When we need both parts (vector and scalar) of a source, + // we always try to put them into the same position. This makes the + // code easier to read, and it is optimal (i.e. one doesn't gain + // anything by splitting the parts). + // It also avoids headaches with swizzles that access both parts (i.e WXY) + tempused = cs->slot[pos].used; + for(i = 0; i < 3; ++i) { + tempvsrc[i] = cs->slot[pos].vsrc[i]; + tempssrc[i] = cs->slot[pos].ssrc[i]; + } + + for(i = 0; i < argc; ++i) { + int flags = (used >> i) & SLOT_SRC_BOTH; + + if (!flags) { + srcpos[i] = 0; + continue; + } + + for(j = 0; j < 3; ++j) { + if ((tempused >> j) & flags & SLOT_SRC_VECTOR) { + if (tempvsrc[j] != hwsrc[i]) + continue; + } + + if ((tempused >> j) & flags & SLOT_SRC_SCALAR) { + if (tempssrc[j] != hwsrc[i]) + continue; + } + + break; + } + + if (j == 3) + break; + + srcpos[i] = j; + tempused |= flags << j; + if (flags & SLOT_SRC_VECTOR) + tempvsrc[j] = hwsrc[i]; + if (flags & SLOT_SRC_SCALAR) + tempssrc[j] = hwsrc[i]; + } + + if (i == argc) + break; + } + + // Found a slot, reserve it + cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH); + for(i = 0; i < 3; ++i) { + cs->slot[pos].vsrc[i] = tempvsrc[i]; + cs->slot[pos].ssrc[i] = tempssrc[i]; + } + + // Emit the source fetch code + rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK; + rp->alu.inst[pos].inst1 |= + ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) | + (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) | + (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT)); + + rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK; + rp->alu.inst[pos].inst3 |= + ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) | + (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) | + (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT)); + + // Emit the argument selection code if (emit_vop) { - for (i=0;i<argc;i++) - if (REG_GET_VSWZ(src[i]) == SWIZZLE_WZY) - return GL_TRUE; + int swz[3]; + + for(i = 0; i < 3; ++i) { + if (i < argc) { + swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base + + (srcpos[i] * v_swiz[REG_GET_VSWZ(src[i])].stride)) | + ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) | + ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0); + } else { + swz[i] = R300_FPI0_ARGC_ZERO; + } + } + + rp->alu.inst[pos].inst0 &= + ~(R300_FPI0_ARG0C_MASK|R300_FPI0_ARG1C_MASK|R300_FPI0_ARG2C_MASK); + rp->alu.inst[pos].inst0 |= + (swz[0] << R300_FPI0_ARG0C_SHIFT) | + (swz[1] << R300_FPI0_ARG1C_SHIFT) | + (swz[2] << R300_FPI0_ARG2C_SHIFT); + } + + if (emit_sop) { + int swz[3]; + + for(i = 0; i < 3; ++i) { + if (i < argc) { + swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base + + (srcpos[i] * s_swiz[REG_GET_SSWZ(src[i])].stride)) | + ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) | + ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0); + } else { + swz[i] = R300_FPI2_ARGA_ZERO; + } + } + + rp->alu.inst[pos].inst2 &= + ~(R300_FPI2_ARG0A_MASK|R300_FPI2_ARG1A_MASK|R300_FPI2_ARG2A_MASK); + rp->alu.inst[pos].inst2 |= + (swz[0] << R300_FPI2_ARG0A_SHIFT) | + (swz[1] << R300_FPI2_ARG1A_SHIFT) | + (swz[2] << R300_FPI2_ARG2A_SHIFT); } - return GL_FALSE; + return pos; } + +/** + * Append an ALU instruction to the instruction list. + */ static void emit_arith(struct r300_fragment_program *rp, int op, GLuint dest, @@ -1043,87 +1267,31 @@ static void emit_arith(struct r300_fragment_program *rp, { COMPILE_STATE; GLuint src[3] = { src0, src1, src2 }; - int hwsrc[3], sswz[3], vswz[3]; int hwdest; - GLboolean emit_vop = GL_FALSE, emit_sop = GL_FALSE; + GLboolean emit_vop, emit_sop; int vop, sop, argc; - int vpos, spos; - int i; + int pos; vop = r300_fpop[op].v_op; sop = r300_fpop[op].s_op; argc = r300_fpop[op].argc; + if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT && + REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) + mask &= ~WRITEMASK_XYZ; + + emit_vop = GL_FALSE; + emit_sop = GL_FALSE; if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3) emit_vop = GL_TRUE; if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA) emit_sop = GL_TRUE; - if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT && - REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) - emit_vop = GL_FALSE; - - if (force_same_slot(vop, sop, emit_vop, emit_sop, argc, src)) { - vpos = spos = MAX2(cs->v_pos, cs->s_pos); - } else { - vpos = cs->v_pos; - spos = cs->s_pos; - /* Here is where we'd decide on where a safe place is to - * combine this instruction with a previous one. - * - * This is extremely simple for now.. if a source depends - * on the opposite stream, force the same instruction. - */ - for (i=0;i<3;i++) { - if (emit_vop && - (v_swiz[REG_GET_VSWZ(src[i])].flags & SLOT_SCALAR)) { - vpos = spos = MAX2(vpos, spos); - break; - } - if (emit_sop && - (s_swiz[REG_GET_SSWZ(src[i])].flags & SLOT_VECTOR)) { - vpos = spos = MAX2(vpos, spos); - break; - } - } - } + pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest); + if (pos < 0) + return; - /* - Convert src->hwsrc, record for FPI1/FPI3 - * - Determine ARG parts of FPI0/FPI2, unused args are filled - * with ARG_ZERO. - */ - for (i=0;i<3;i++) { - int srcpos; - - if (i >= argc) { - vswz[i] = R300_FPI0_ARGC_ZERO; - sswz[i] = R300_FPI2_ARGA_ZERO; - continue; - } - - hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); - - if (emit_vop && vop != R300_FPI0_OUTC_REPL_ALPHA) { - srcpos = add_src(rp, hwsrc[i], vpos, - v_swiz[REG_GET_VSWZ(src[i])].flags); - vswz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base + - (srcpos * - v_swiz[REG_GET_VSWZ(src[i])].stride)) | - ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) | - ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0); - } else vswz[i] = R300_FPI0_ARGC_ZERO; - - if (emit_sop) { - srcpos = add_src(rp, hwsrc[i], spos, - s_swiz[REG_GET_SSWZ(src[i])].flags); - sswz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base + - (srcpos * - s_swiz[REG_GET_SSWZ(src[i])].stride)) | - ((src[i] & REG_NEGS_MASK) ? ARG_NEG : 0) | - ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0); - } else sswz[i] = R300_FPI2_ARGA_ZERO; - } - hwdest = t_hw_dst(rp, dest, GL_FALSE); + hwdest = t_hw_dst(rp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */ if (flags & PFS_FLAG_SAT) { vop |= R300_FPI0_OUTC_SAT; @@ -1131,58 +1299,45 @@ static void emit_arith(struct r300_fragment_program *rp, } /* Throw the pieces together and get FPI0/1 */ - rp->alu.inst[vpos].inst1 = - ((cs->slot[vpos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) | - (cs->slot[vpos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) | - (cs->slot[vpos].vsrc[2] << R300_FPI1_SRC2C_SHIFT)); if (emit_vop) { - rp->alu.inst[vpos].inst0 = vop | - (vswz[0] << R300_FPI0_ARG0C_SHIFT) | - (vswz[1] << R300_FPI0_ARG1C_SHIFT) | - (vswz[2] << R300_FPI0_ARG2C_SHIFT); + rp->alu.inst[pos].inst0 |= vop; - rp->alu.inst[vpos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT; + rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT; + if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) { - rp->alu.inst[vpos].inst1 |= + rp->alu.inst[pos].inst1 |= (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT; } else assert(0); } else { - rp->alu.inst[vpos].inst1 |= + rp->alu.inst[pos].inst1 |= (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT; + + cs->hwtemps[hwdest].vector_valid = pos+1; } - cs->v_pos = vpos+1; - } else if (spos >= vpos) - rp->alu.inst[spos].inst0 = NOP_INST0; + } /* And now FPI2/3 */ - rp->alu.inst[spos].inst3 = - ((cs->slot[spos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) | - (cs->slot[spos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) | - (cs->slot[spos].ssrc[2] << R300_FPI3_SRC2A_SHIFT)); if (emit_sop) { - rp->alu.inst[spos].inst2 = sop | - sswz[0] << R300_FPI2_ARG0A_SHIFT | - sswz[1] << R300_FPI2_ARG1A_SHIFT | - sswz[2] << R300_FPI2_ARG2A_SHIFT; + rp->alu.inst[pos].inst2 |= sop; if (mask & WRITEMASK_W) { if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) { - rp->alu.inst[spos].inst3 |= + rp->alu.inst[pos].inst3 |= (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT; } else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) { - rp->alu.inst[spos].inst3 |= R300_FPI3_DSTA_DEPTH; + rp->alu.inst[pos].inst3 |= R300_FPI3_DSTA_DEPTH; } else assert(0); } else { - rp->alu.inst[spos].inst3 |= + rp->alu.inst[pos].inst3 |= (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG; + + cs->hwtemps[hwdest].scalar_valid = pos+1; } } - cs->s_pos = spos+1; - } else if (vpos >= spos) - rp->alu.inst[vpos].inst2 = NOP_INST2; - + } + return; } @@ -1922,7 +2077,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) for (i=0;i<rp->ctx->Const.MaxTextureUnits;i++) { if (InputsRead & (FRAG_BIT_TEX0 << i)) { cs->inputs[FRAG_ATTRIB_TEX0+i].refcount = 0; - cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp); + cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp, 0); } } InputsRead &= ~FRAG_BITS_TEX_ANY; @@ -1930,7 +2085,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) /* fragment position treated as a texcoord */ if (InputsRead & FRAG_BIT_WPOS) { cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0; - cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp); + cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp, 0); insert_wpos(&mp->Base); } InputsRead &= ~FRAG_BIT_WPOS; @@ -1938,14 +2093,14 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp) /* Then primary colour */ if (InputsRead & FRAG_BIT_COL0) { cs->inputs[FRAG_ATTRIB_COL0].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp); + cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0); } InputsRead &= ~FRAG_BIT_COL0; /* Secondary color */ if (InputsRead & FRAG_BIT_COL1) { cs->inputs[FRAG_ATTRIB_COL1].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp); + cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp, 0); } InputsRead &= ~FRAG_BIT_COL1; @@ -2030,13 +2185,12 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr } /* Finish off */ - cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos); rp->node[rp->cur_node].alu_end = - cs->v_pos - rp->node[rp->cur_node].alu_offset - 1; + cs->nrslots - rp->node[rp->cur_node].alu_offset - 1; if (rp->node[rp->cur_node].tex_end < 0) rp->node[rp->cur_node].tex_end = 0; rp->alu_offset = 0; - rp->alu_end = cs->v_pos - 1; + rp->alu_end = cs->nrslots - 1; rp->tex_offset = 0; rp->tex_end = rp->tex.length ? rp->tex.length - 1 : 0; assert(rp->node[rp->cur_node].alu_end >= 0); @@ -2053,7 +2207,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr /* just some random things... */ static void dump_program(struct r300_fragment_program *rp) { - int i; + int n, i, j; static int pc = 0; fprintf(stderr, "pc=%d*************************************\n", pc++); @@ -2066,46 +2220,136 @@ static void dump_program(struct r300_fragment_program *rp) fprintf(stderr, "Hardware program\n"); fprintf(stderr, "----------------\n"); - fprintf(stderr, "tex:\n"); - - for(i=0;i<rp->tex.length;i++) { - fprintf(stderr, "%08x\n", rp->tex.inst[i]); - } - - for (i=0;i<(rp->cur_node+1);i++) { + for (n = 0; n < (rp->cur_node+1); n++) { fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\ - "alu_end: %d, tex_end: %d\n", i, - rp->node[i].alu_offset, - rp->node[i].tex_offset, - rp->node[i].alu_end, - rp->node[i].tex_end); + "alu_end: %d, tex_end: %d\n", n, + rp->node[n].alu_offset, + rp->node[n].tex_offset, + rp->node[n].alu_end, + rp->node[n].tex_end); + + if (rp->tex.length) { + fprintf(stderr, " TEX:\n"); + for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_end; ++i) + fprintf(stderr, " %08x\n", rp->tex.inst[i]); + } + + for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_end; ++i) { + char srcc[3][10], dstc[20]; + char srca[3][10], dsta[20]; + char argc[3][20]; + char arga[3][20]; + + for(j = 0; j < 3; ++j) { + int regc = rp->alu.inst[i].inst1 >> (j*6); + int rega = rp->alu.inst[i].inst3 >> (j*6); + + sprintf(srcc[j], "%c%i", (regc & 32) ? 'c' : 't', regc & 31); + sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31); + } + + sprintf(dstc, "t%i.%c%c%c o%i.%c%c%c", + (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31, + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? 'x' : ' ', + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? 'y' : ' ', + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? 'z' : ' ', + (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31, + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? 'x' : ' ', + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? 'y' : ' ', + (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? 'z' : ' '); + + sprintf(dsta, "t%i.%c o%i.%c %c", + (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31, + (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) ? 'w' : ' ', + (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31, + (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) ? 'w' : ' ', + (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) ? 'Z' : ' '); + + fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n" + " w: %3s %3s %3s -> %-20s (%08x)\n", + i, + srcc[0], srcc[1], srcc[2], dstc, rp->alu.inst[i].inst1, + srca[0], srca[1], srca[2], dsta, rp->alu.inst[i].inst3); + + for(j = 0; j < 3; ++j) { + int regc = rp->alu.inst[i].inst0 >> (j*7); + int rega = rp->alu.inst[i].inst2 >> (j*7); + int d; + char buf[20]; + + d = regc & 31; + if (d < 12) { + switch(d % 4) { + case R300_FPI0_ARGC_SRC0C_XYZ: + sprintf(buf, "%s.xyz", srcc[d / 4]); + break; + case R300_FPI0_ARGC_SRC0C_XXX: + sprintf(buf, "%s.xxx", srcc[d / 4]); + break; + case R300_FPI0_ARGC_SRC0C_YYY: + sprintf(buf, "%s.yyy", srcc[d / 4]); + break; + case R300_FPI0_ARGC_SRC0C_ZZZ: + sprintf(buf, "%s.zzz", srcc[d / 4]); + break; + } + } else if (d < 15) { + sprintf(buf, "%s.www", srca[d-12]); + } else if (d == 20) { + sprintf(buf, "0.0"); + } else if (d == 21) { + sprintf(buf, "1.0"); + } else if (d == 22) { + sprintf(buf, "0.5"); + } else if (d >= 23 && d < 32) { + d -= 23; + switch(d/3) { + case 0: + sprintf(buf, "%s.yzx", srcc[d % 3]); + break; + case 1: + sprintf(buf, "%s.zxy", srcc[d % 3]); + break; + case 2: + sprintf(buf, "%s.Wzy", srcc[d % 3]); + break; + } + } else { + sprintf(buf, "%i", d); + } + + sprintf(argc[j], "%s%s%s%s", + (regc & 32) ? "-" : "", + (regc & 64) ? "|" : "", + buf, + (regc & 64) ? "|" : ""); + + d = rega & 31; + if (d < 9) { + sprintf(buf, "%s.%c", srcc[d / 3], 'x' + (char)(d%3)); + } else if (d < 12) { + sprintf(buf, "%s.w", srca[d-9]); + } else if (d == 16) { + sprintf(buf, "0.0"); + } else if (d == 17) { + sprintf(buf, "1.0"); + } else if (d == 18) { + sprintf(buf, "0.5"); + } else { + sprintf(buf, "%i", d); + } + + sprintf(arga[j], "%s%s%s%s", + (rega & 32) ? "-" : "", + (rega & 64) ? "|" : "", + buf, + (rega & 64) ? "|" : ""); + } + + fprintf(stderr, " xyz: %8s %8s %8s op: %08x\n" + " w: %8s %8s %8s op: %08x\n", + argc[0], argc[1], argc[2], rp->alu.inst[i].inst0, + arga[0], arga[1], arga[2], rp->alu.inst[i].inst2); + } } - - fprintf(stderr, "%08x\n", - ((rp->tex_end << 16) | (R300_PFS_TEXI_0 >> 2))); - for (i=0;i<=rp->tex_end;i++) - fprintf(stderr, "%08x\n", rp->tex.inst[i]); - - /* dump program in pretty_print_command_stream.tcl-readable format */ - fprintf(stderr, "%08x\n", - ((rp->alu_end << 16) | (R300_PFS_INSTR0_0 >> 2))); - for (i=0;i<=rp->alu_end;i++) - fprintf(stderr, "%08x\n", rp->alu.inst[i].inst0); - - fprintf(stderr, "%08x\n", - ((rp->alu_end << 16) | (R300_PFS_INSTR1_0 >> 2))); - for (i=0;i<=rp->alu_end;i++) - fprintf(stderr, "%08x\n", rp->alu.inst[i].inst1); - - fprintf(stderr, "%08x\n", - ((rp->alu_end << 16) | (R300_PFS_INSTR2_0 >> 2))); - for (i=0;i<=rp->alu_end;i++) - fprintf(stderr, "%08x\n", rp->alu.inst[i].inst2); - - fprintf(stderr, "%08x\n", - ((rp->alu_end << 16) | (R300_PFS_INSTR3_0 >> 2))); - for (i=0;i<=rp->alu_end;i++) - fprintf(stderr, "%08x\n", rp->alu.inst[i].inst3); - - fprintf(stderr, "00000000\n"); } diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h index 3de15752b1..1f4a2d2e64 100644 --- a/src/mesa/drivers/dri/r300/r300_reg.h +++ b/src/mesa/drivers/dri/r300/r300_reg.h @@ -1047,7 +1047,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. * WRT swizzling. If, for example, you want to load an R component into an * Alpha operand, this R component is taken from a *color* source, not from * an alpha source. The corresponding register doesn't even have to appear in - * the alpha sources list. (I hope this alll makes sense to you) + * the alpha sources list. (I hope this all makes sense to you) * * Destination selection * The destination register index is in FPI1 (color) and FPI3 (alpha) @@ -1074,6 +1074,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R300_FPI1_SRC2C_SHIFT 12 # define R300_FPI1_SRC2C_MASK (31 << 12) # define R300_FPI1_SRC2C_CONST (1 << 17) +# define R300_FPI1_SRC_MASK 0x0003ffff # define R300_FPI1_DSTC_SHIFT 18 # define R300_FPI1_DSTC_MASK (31 << 18) # define R300_FPI1_DSTC_REG_MASK_SHIFT 23 @@ -1095,6 +1096,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R300_FPI3_SRC2A_SHIFT 12 # define R300_FPI3_SRC2A_MASK (31 << 12) # define R300_FPI3_SRC2A_CONST (1 << 17) +# define R300_FPI3_SRC_MASK 0x0003ffff # define R300_FPI3_DSTA_SHIFT 18 # define R300_FPI3_DSTA_MASK (31 << 18) # define R300_FPI3_DSTA_REG (1 << 23) |