summaryrefslogtreecommitdiff
path: root/src/mesa/drivers/dri/r300/radeon_program_pair.c
diff options
context:
space:
mode:
authorNicolai Haehnle <nhaehnle@gmail.com>2008-07-12 01:14:35 +0200
committerNicolai Haehnle <nhaehnle@gmail.com>2008-07-12 09:36:02 +0200
commitb6765c34993b08bba4acf20738c8938413ed4daf (patch)
treed221e963335dc31af894485a657fb423aa4d79c5 /src/mesa/drivers/dri/r300/radeon_program_pair.c
parentd8d086c20b5a43353c4980cf234d8329900585f5 (diff)
r500_fragprog: Major refactoring of final emit
Use an abstracted instruction scheduling and register allocation algorithm that we will be able to share with r300_fragprog. Unlike the original emit code, this code tries to pair instructions that only use the RGB part of the ALU with instructions that only use the alpha part. However, the pairing algorithm still has some shortcomings; for example, it doesn't generate optimal code for the emulation of LIT.
Diffstat (limited to 'src/mesa/drivers/dri/r300/radeon_program_pair.c')
-rw-r--r--src/mesa/drivers/dri/r300/radeon_program_pair.c970
1 files changed, 970 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c
new file mode 100644
index 0000000000..86180edcb5
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c
@@ -0,0 +1,970 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * @file
+ *
+ * Perform temporary register allocation and attempt to pair off instructions
+ * in RGB and Alpha pairs. Also attempts to optimize the TEX instruction
+ * vs. ALU instruction scheduling.
+ */
+
+#include "radeon_program_pair.h"
+
+#include "radeon_context.h"
+
+#include "shader/prog_print.h"
+
+#define error(fmt, args...) do { \
+ _mesa_problem(s->Ctx, "%s::%s(): " fmt "\n", \
+ __FILE__, __FUNCTION__, ##args); \
+ s->Error = GL_TRUE; \
+} while(0)
+
+struct pair_state_instruction {
+ GLuint IsTex:1; /**< Is a texture instruction */
+ GLuint NeedRGB:1; /**< Needs the RGB ALU */
+ GLuint NeedAlpha:1; /**< Needs the Alpha ALU */
+ GLuint IsTranscendent:1; /**< Is a special transcendent instruction */
+
+ /**
+ * Number of (read and write) dependencies that must be resolved before
+ * this instruction can be scheduled.
+ */
+ GLuint NumDependencies:5;
+
+ /**
+ * Next instruction in the linked list of ready instructions.
+ */
+ struct pair_state_instruction *NextReady;
+
+ /**
+ * Values that this instruction writes
+ */
+ struct reg_value *Values[4];
+};
+
+
+/**
+ * Used to keep track of which instructions read a value.
+ */
+struct reg_value_reader {
+ GLuint IP; /**< IP of the instruction that performs this access */
+ struct reg_value_reader *Next;
+};
+
+/**
+ * Used to keep track which values are stored in each component of a
+ * PROGRAM_TEMPORARY.
+ */
+struct reg_value {
+ GLuint IP; /**< IP of the instruction that writes this value */
+ struct reg_value *Next; /**< Pointer to the next value to be written to the same PROGRAM_TEMPORARY component */
+
+ /**
+ * Unordered linked list of instructions that read from this value.
+ */
+ struct reg_value_reader *Readers;
+
+ /**
+ * Number of readers of this value. This is calculated during @ref scan_instructions
+ * and continually decremented during code emission.
+ * When this count reaches zero, the instruction that writes the @ref Next value
+ * can be scheduled.
+ */
+ GLuint NumReaders;
+};
+
+/**
+ * Used to translate a PROGRAM_INPUT or PROGRAM_TEMPORARY Mesa register
+ * to the proper hardware temporary.
+ */
+struct pair_register_translation {
+ GLuint Allocated:1;
+ GLuint HwIndex:8;
+ GLuint RefCount:23; /**< # of times this occurs in an unscheduled instruction SrcReg or DstReg */
+
+ /**
+ * Notes the value that is currently contained in each component
+ * (only used for PROGRAM_TEMPORARY registers).
+ */
+ struct reg_value *Value[4];
+};
+
+struct pair_state {
+ GLcontext *Ctx;
+ struct gl_program *Program;
+ const struct radeon_pair_handler *Handler;
+ GLboolean Error;
+ GLboolean Debug;
+ GLboolean Verbose;
+ void *UserData;
+
+ /**
+ * Translate Mesa registers to hardware registers
+ */
+ struct pair_register_translation Inputs[FRAG_ATTRIB_MAX];
+ struct pair_register_translation Temps[MAX_PROGRAM_TEMPS];
+
+ /**
+ * Derived information about program instructions.
+ */
+ struct pair_state_instruction *Instructions;
+
+ struct {
+ GLuint RefCount; /**< # of times this occurs in an unscheduled SrcReg or DstReg */
+ } HwTemps[128];
+
+ /**
+ * Linked list of instructions that can be scheduled right now,
+ * based on which ALU/TEX resources they require.
+ */
+ struct pair_state_instruction *ReadyFullALU;
+ struct pair_state_instruction *ReadyRGB;
+ struct pair_state_instruction *ReadyAlpha;
+ struct pair_state_instruction *ReadyTEX;
+
+ /**
+ * Pool of @ref reg_value structures for fast allocation.
+ */
+ struct reg_value *ValuePool;
+ GLuint ValuePoolUsed;
+ struct reg_value_reader *ReaderPool;
+ GLuint ReaderPoolUsed;
+};
+
+
+static struct pair_register_translation *get_register(struct pair_state *s, GLuint file, GLuint index)
+{
+ switch(file) {
+ case PROGRAM_TEMPORARY: return &s->Temps[index];
+ case PROGRAM_INPUT: return &s->Inputs[index];
+ default: return 0;
+ }
+}
+
+
+static GLuint get_hw_reg(struct pair_state *s, GLuint file, GLuint index)
+{
+ GLuint hwindex;
+
+ struct pair_register_translation *t = get_register(s, file, index);
+ if (!t) {
+ _mesa_problem(s->Ctx, "get_hw_reg: %i[%i]\n", file, index);
+ return 0;
+ }
+
+ if (t->Allocated)
+ return t->HwIndex;
+
+ for(hwindex = 0; hwindex < s->Handler->MaxHwTemps; ++hwindex)
+ if (!s->HwTemps[hwindex].RefCount)
+ break;
+
+ if (hwindex >= s->Handler->MaxHwTemps) {
+ error("Ran out of hardware temporaries");
+ return 0;
+ }
+
+ s->HwTemps[hwindex].RefCount = t->RefCount;
+ t->Allocated = 1;
+ t->HwIndex = hwindex;
+ return hwindex;
+}
+
+
+static void deref_hw_reg(struct pair_state *s, GLuint hwindex)
+{
+ if (!s->HwTemps[hwindex].RefCount) {
+ error("Hwindex %i refcount error", hwindex);
+ return;
+ }
+
+ s->HwTemps[hwindex].RefCount--;
+}
+
+static void add_pairinst_to_list(struct pair_state_instruction **list, struct pair_state_instruction *pairinst)
+{
+ pairinst->NextReady = *list;
+ *list = pairinst;
+}
+
+/**
+ * The instruction at the given IP has become ready. Link it into the ready
+ * instructions.
+ */
+static void instruction_ready(struct pair_state *s, int ip)
+{
+ struct pair_state_instruction *pairinst = s->Instructions + ip;
+
+ if (s->Verbose)
+ _mesa_printf("instruction_ready(%i)\n", ip);
+
+ if (pairinst->IsTex)
+ add_pairinst_to_list(&s->ReadyTEX, pairinst);
+ else if (!pairinst->NeedAlpha)
+ add_pairinst_to_list(&s->ReadyRGB, pairinst);
+ else if (!pairinst->NeedRGB)
+ add_pairinst_to_list(&s->ReadyAlpha, pairinst);
+ else
+ add_pairinst_to_list(&s->ReadyFullALU, pairinst);
+}
+
+
+/**
+ * Finally rewrite ADD, MOV, MUL as the appropriate native instruction
+ * and reverse the order of arguments for CMP.
+ */
+static void final_rewrite(struct pair_state *s, struct prog_instruction *inst)
+{
+ struct prog_src_register tmp;
+
+ switch(inst->Opcode) {
+ case OPCODE_ADD:
+ inst->SrcReg[2] = inst->SrcReg[1];
+ inst->SrcReg[1].File = PROGRAM_BUILTIN;
+ inst->SrcReg[1].Swizzle = SWIZZLE_1111;
+ inst->SrcReg[1].NegateBase = 0;
+ inst->SrcReg[1].NegateAbs = 0;
+ inst->Opcode = OPCODE_MAD;
+ break;
+ case OPCODE_CMP:
+ tmp = inst->SrcReg[2];
+ inst->SrcReg[2] = inst->SrcReg[0];
+ inst->SrcReg[0] = tmp;
+ break;
+ case OPCODE_MOV:
+ inst->SrcReg[1] = inst->SrcReg[0];
+ inst->SrcReg[2].File = PROGRAM_BUILTIN;
+ inst->SrcReg[2].Swizzle = SWIZZLE_0000;
+ inst->Opcode = OPCODE_CMP;
+ // TODO: disable output modifiers on R500
+ break;
+ case OPCODE_MUL:
+ inst->SrcReg[2].File = PROGRAM_BUILTIN;
+ inst->SrcReg[2].Swizzle = SWIZZLE_0000;
+ inst->Opcode = OPCODE_MAD;
+ break;
+ default:
+ /* nothing to do */
+ break;
+ }
+}
+
+
+/**
+ * Classify an instruction according to which ALUs etc. it needs
+ */
+static void classify_instruction(struct pair_state *s,
+ struct prog_instruction *inst, struct pair_state_instruction *pairinst)
+{
+ pairinst->NeedRGB = (inst->DstReg.WriteMask & WRITEMASK_XYZ) ? 1 : 0;
+ pairinst->NeedAlpha = (inst->DstReg.WriteMask & WRITEMASK_W) ? 1 : 0;
+
+ switch(inst->Opcode) {
+ case OPCODE_ADD:
+ case OPCODE_CMP:
+ case OPCODE_FRC:
+ case OPCODE_MAD:
+ case OPCODE_MAX:
+ case OPCODE_MIN:
+ case OPCODE_MOV:
+ case OPCODE_MUL:
+ break;
+ case OPCODE_COS:
+ case OPCODE_EX2:
+ case OPCODE_LG2:
+ case OPCODE_RCP:
+ case OPCODE_RSQ:
+ case OPCODE_SIN:
+ pairinst->IsTranscendent = 1;
+ pairinst->NeedAlpha = 1;
+ break;
+ case OPCODE_DP4:
+ pairinst->NeedAlpha = 1;
+ /* fall through */
+ case OPCODE_DP3:
+ pairinst->NeedRGB = 1;
+ break;
+ case OPCODE_KIL:
+ case OPCODE_TEX:
+ case OPCODE_TXB:
+ case OPCODE_TXP:
+ case OPCODE_END:
+ pairinst->IsTex = 1;
+ break;
+ default:
+ error("Unknown opcode %d\n", inst->Opcode);
+ break;
+ }
+}
+
+
+/**
+ * Count which (input, temporary) register is read and written how often,
+ * and scan the instruction stream to find dependencies.
+ */
+static void scan_instructions(struct pair_state *s)
+{
+ struct prog_instruction *inst;
+ struct pair_state_instruction *pairinst;
+ GLuint ip;
+
+ for(inst = s->Program->Instructions, pairinst = s->Instructions, ip = 0;
+ inst->Opcode != OPCODE_END;
+ ++inst, ++pairinst, ++ip) {
+ final_rewrite(s, inst);
+ classify_instruction(s, inst, pairinst);
+
+ int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
+ int j;
+ for(j = 0; j < nsrc; j++) {
+ struct pair_register_translation *t =
+ get_register(s, inst->SrcReg[j].File, inst->SrcReg[j].Index);
+ if (!t)
+ continue;
+
+ t->RefCount++;
+
+ if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) {
+ int i;
+ for(i = 0; i < 4; ++i) {
+ GLuint swz = GET_SWZ(inst->SrcReg[j].Swizzle, i);
+ if (swz >= 4)
+ continue; /* constant or NIL swizzle */
+ if (!t->Value[swz])
+ continue; /* this is an undefined read */
+
+ /* Do not add a dependency if this instruction
+ * also rewrites the value. The code below adds
+ * a dependency for the DstReg, which is a superset
+ * of the SrcReg dependency. */
+ if (inst->DstReg.File == PROGRAM_TEMPORARY &&
+ inst->DstReg.Index == inst->SrcReg[j].Index &&
+ GET_BIT(inst->DstReg.WriteMask, swz))
+ continue;
+
+ struct reg_value_reader* r = &s->ReaderPool[s->ReaderPoolUsed++];
+ pairinst->NumDependencies++;
+ t->Value[swz]->NumReaders++;
+ r->IP = ip;
+ r->Next = t->Value[swz]->Readers;
+ t->Value[swz]->Readers = r;
+ }
+ }
+ }
+
+ int ndst = _mesa_num_inst_dst_regs(inst->Opcode);
+ if (ndst) {
+ struct pair_register_translation *t =
+ get_register(s, inst->DstReg.File, inst->DstReg.Index);
+ if (t) {
+ t->RefCount++;
+
+ if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+ int j;
+ for(j = 0; j < 4; ++j) {
+ if (!GET_BIT(inst->DstReg.WriteMask, j))
+ continue;
+
+ struct reg_value* v = &s->ValuePool[s->ValuePoolUsed++];
+ v->IP = ip;
+ if (t->Value[j]) {
+ pairinst->NumDependencies++;
+ t->Value[j]->Next = v;
+ }
+ t->Value[j] = v;
+ pairinst->Values[j] = v;
+ }
+ }
+ }
+ }
+
+ if (s->Verbose)
+ _mesa_printf("scan(%i): NumDeps = %i\n", ip, pairinst->NumDependencies);
+
+ if (!pairinst->NumDependencies)
+ instruction_ready(s, ip);
+ }
+
+ /* Clear the PROGRAM_TEMPORARY state */
+ int i, j;
+ for(i = 0; i < MAX_PROGRAM_TEMPS; ++i) {
+ for(j = 0; j < 4; ++j)
+ s->Temps[i].Value[j] = 0;
+ }
+}
+
+
+/**
+ * Reserve hardware temporary registers for the program inputs.
+ *
+ * @note This allocation is performed explicitly, because the order of inputs
+ * is determined by the RS hardware.
+ */
+static void allocate_input_registers(struct pair_state *s)
+{
+ GLuint InputsRead = s->Program->InputsRead;
+ int i;
+
+ /* Texcoords come first */
+ for (i = 0; i < s->Ctx->Const.MaxTextureUnits; i++) {
+ if (InputsRead & (FRAG_BIT_TEX0 << i))
+ get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_TEX0+i);
+ }
+ InputsRead &= ~FRAG_BITS_TEX_ANY;
+
+ /* fragment position treated as a texcoord */
+ if (InputsRead & FRAG_BIT_WPOS)
+ get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_WPOS);
+ InputsRead &= ~FRAG_BIT_WPOS;
+
+ /* Then primary colour */
+ if (InputsRead & FRAG_BIT_COL0)
+ get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL0);
+ InputsRead &= ~FRAG_BIT_COL0;
+
+ /* Secondary color */
+ if (InputsRead & FRAG_BIT_COL1)
+ get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL1);
+ InputsRead &= ~FRAG_BIT_COL1;
+
+ /* Anything else */
+ if (InputsRead)
+ error("Don't know how to handle inputs 0x%x\n", InputsRead);
+}
+
+
+static void decrement_dependencies(struct pair_state *s, int ip)
+{
+ struct pair_state_instruction *pairinst = s->Instructions + ip;
+ ASSERT(pairinst->NumDependencies > 0);
+ if (!--pairinst->NumDependencies)
+ instruction_ready(s, ip);
+}
+
+/**
+ * Update the dependency tracking state based on what the instruction
+ * at the given IP does.
+ */
+static void commit_instruction(struct pair_state *s, int ip)
+{
+ struct prog_instruction *inst = s->Program->Instructions + ip;
+ struct pair_state_instruction *pairinst = s->Instructions + ip;
+
+ if (s->Verbose)
+ _mesa_printf("commit_instruction(%i)\n", ip);
+
+ if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+ struct pair_register_translation *t = &s->Temps[inst->DstReg.Index];
+ deref_hw_reg(s, t->HwIndex);
+
+ int i;
+ for(i = 0; i < 4; ++i) {
+ if (!GET_BIT(inst->DstReg.WriteMask, i))
+ continue;
+
+ t->Value[i] = pairinst->Values[i];
+ if (t->Value[i]->NumReaders) {
+ struct reg_value_reader *r;
+ for(r = pairinst->Values[i]->Readers; r; r = r->Next)
+ decrement_dependencies(s, r->IP);
+ } else if (t->Value[i]->Next) {
+ /* This happens when the only reader writes
+ * the register at the same time */
+ decrement_dependencies(s, t->Value[i]->Next->IP);
+ }
+ }
+ }
+
+ int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
+ int i;
+ for(i = 0; i < nsrc; i++) {
+ struct pair_register_translation *t = get_register(s, inst->SrcReg[i].File, inst->SrcReg[i].Index);
+ if (!t)
+ continue;
+
+ deref_hw_reg(s, get_hw_reg(s, inst->SrcReg[i].File, inst->SrcReg[i].Index));
+
+ if (inst->SrcReg[i].File != PROGRAM_TEMPORARY)
+ continue;
+
+ int j;
+ for(j = 0; j < 4; ++j) {
+ GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
+ if (swz >= 4)
+ continue;
+ if (!t->Value[swz])
+ continue;
+
+ /* Do not free a dependency if this instruction
+ * also rewrites the value. See scan_instructions. */
+ if (inst->DstReg.File == PROGRAM_TEMPORARY &&
+ inst->DstReg.Index == inst->SrcReg[i].Index &&
+ GET_BIT(inst->DstReg.WriteMask, swz))
+ continue;
+
+ if (!--t->Value[swz]->NumReaders) {
+ if (t->Value[swz]->Next)
+ decrement_dependencies(s, t->Value[swz]->Next->IP);
+ }
+ }
+ }
+}
+
+
+/**
+ * Emit all ready texture instructions in a single block.
+ *
+ * Emit as a single block to (hopefully) sample many textures in parallel,
+ * and to avoid hardware indirections on R300.
+ *
+ * In R500, we don't really know when the result of a texture instruction
+ * arrives. So allocate all destinations first, to make sure they do not
+ * arrive early and overwrite a texture coordinate we're going to use later
+ * in the block.
+ */
+static void emit_all_tex(struct pair_state *s)
+{
+ struct pair_state_instruction *readytex;
+ struct pair_state_instruction *pairinst;
+
+ ASSERT(s->ReadyTEX);
+
+ // Don't let the ready list change under us!
+ readytex = s->ReadyTEX;
+ s->ReadyTEX = 0;
+
+ // Allocate destination hardware registers in one block to avoid conflicts.
+ for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
+ int ip = pairinst - s->Instructions;
+ struct prog_instruction *inst = s->Program->Instructions + ip;
+ if (inst->Opcode != OPCODE_KIL)
+ get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
+ }
+
+ if (s->Debug)
+ _mesa_printf(" BEGIN_TEX\n");
+
+ for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
+ int ip = pairinst - s->Instructions;
+ struct prog_instruction *inst = s->Program->Instructions + ip;
+ commit_instruction(s, ip);
+
+ if (inst->Opcode != OPCODE_KIL)
+ inst->DstReg.Index = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
+ inst->SrcReg[0].Index = get_hw_reg(s, inst->SrcReg[0].File, inst->SrcReg[0].Index);
+
+ if (s->Debug) {
+ _mesa_printf(" ");
+ _mesa_print_instruction(inst);
+ }
+ s->Error = s->Error || !s->Handler->EmitTex(s->UserData, inst);
+ }
+
+ if (s->Handler->EndTexBlock)
+ s->Handler->EndTexBlock(s->UserData);
+
+ if (s->Debug)
+ _mesa_printf(" END_TEX\n");
+}
+
+
+static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instruction *pair,
+ struct prog_src_register src, GLboolean rgb, GLboolean alpha)
+{
+ int candidate = -1;
+ int candidate_quality = -1;
+ int i;
+
+ if (!rgb && !alpha)
+ return 0;
+
+ GLuint constant;
+ GLuint index;
+
+ if (src.File == PROGRAM_TEMPORARY || src.File == PROGRAM_INPUT) {
+ constant = 0;
+ index = get_hw_reg(s, src.File, src.Index);
+ } else {
+ constant = 1;
+ s->Error |= !s->Handler->EmitConst(s->UserData, src.File, src.Index, &index);
+ }
+
+ for(i = 0; i < 3; ++i) {
+ int q = 0;
+ if (rgb) {
+ if (pair->RGB.Src[i].Used) {
+ if (pair->RGB.Src[i].Constant != constant ||
+ pair->RGB.Src[i].Index != index)
+ continue;
+ q++;
+ }
+ }
+ if (alpha) {
+ if (pair->Alpha.Src[i].Used) {
+ if (pair->Alpha.Src[i].Constant != constant ||
+ pair->Alpha.Src[i].Index != index)
+ continue;
+ q++;
+ }
+ }
+ if (q > candidate_quality) {
+ candidate_quality = q;
+ candidate = i;
+ }
+ }
+
+ if (candidate >= 0) {
+ if (rgb) {
+ pair->RGB.Src[candidate].Used = 1;
+ pair->RGB.Src[candidate].Constant = constant;
+ pair->RGB.Src[candidate].Index = index;
+ }
+ if (alpha) {
+ pair->Alpha.Src[candidate].Used = 1;
+ pair->Alpha.Src[candidate].Constant = constant;
+ pair->Alpha.Src[candidate].Index = index;
+ }
+ }
+
+ return candidate;
+}
+
+
+
+/**
+ * Fill the given ALU instruction's opcodes and source operands into the given pair,
+ * if possible.
+ */
+static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip)
+{
+ struct pair_state_instruction *pairinst = s->Instructions + ip;
+ struct prog_instruction *inst = s->Program->Instructions + ip;
+
+ ASSERT(!pairinst->NeedRGB || pair->RGB.Opcode == OPCODE_NOP);
+ ASSERT(!pairinst->NeedAlpha || pair->Alpha.Opcode == OPCODE_NOP);
+
+ if (pairinst->NeedRGB) {
+ if (pairinst->IsTranscendent)
+ pair->RGB.Opcode = OPCODE_REPL_ALPHA;
+ else
+ pair->RGB.Opcode = inst->Opcode;
+ }
+ if (pairinst->NeedAlpha)
+ pair->Alpha.Opcode = inst->Opcode;
+
+ int nargs = _mesa_num_inst_src_regs(inst->Opcode);
+ int i;
+
+ for(i = 0; i < nargs; ++i) {
+ int source;
+ if (pairinst->NeedRGB && !pairinst->IsTranscendent) {
+ GLboolean srcrgb = GL_FALSE;
+ GLboolean srcalpha = GL_FALSE;
+ GLuint negatebase = 0;
+ int j;
+ for(j = 0; j < 3; ++j) {
+ GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
+ if (swz < 3)
+ srcrgb = GL_TRUE;
+ else if (swz < 4)
+ srcalpha = GL_TRUE;
+ if (swz != SWIZZLE_NIL && GET_BIT(inst->SrcReg[i].NegateBase, j))
+ negatebase = 1;
+ }
+ source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
+ if (source < 0)
+ return GL_FALSE;
+ pair->RGB.Arg[i].Source = source;
+ pair->RGB.Arg[i].Swizzle = inst->SrcReg[i].Swizzle & 0x1ff;
+ pair->RGB.Arg[i].Abs = inst->SrcReg[i].Abs;
+ pair->RGB.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs;
+ }
+ if (pairinst->NeedAlpha) {
+ GLboolean srcrgb = GL_FALSE;
+ GLboolean srcalpha = GL_FALSE;
+ GLuint negatebase = GET_BIT(inst->SrcReg[i].NegateBase, pairinst->IsTranscendent ? 0 : 3);
+ GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, pairinst->IsTranscendent ? 0 : 3);
+ if (swz < 3)
+ srcrgb = GL_TRUE;
+ else if (swz < 4)
+ srcalpha = GL_TRUE;
+ source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
+ if (source < 0)
+ return GL_FALSE;
+ pair->Alpha.Arg[i].Source = source;
+ pair->Alpha.Arg[i].Swizzle = swz;
+ pair->Alpha.Arg[i].Abs = inst->SrcReg[i].Abs;
+ pair->Alpha.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs;
+ }
+ }
+
+ return GL_TRUE;
+}
+
+
+/**
+ * Fill in the destination register information.
+ *
+ * This is split from filling in source registers because we want
+ * to avoid allocating hardware temporaries for destinations until
+ * we are absolutely certain that we're going to emit a certain
+ * instruction pairing.
+ */
+static void fill_dest_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip)
+{
+ struct pair_state_instruction *pairinst = s->Instructions + ip;
+ struct prog_instruction *inst = s->Program->Instructions + ip;
+
+ if (inst->DstReg.File == PROGRAM_OUTPUT) {
+ if (inst->DstReg.Index == FRAG_RESULT_COLR) {
+ pair->RGB.OutputWriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ;
+ pair->Alpha.OutputWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+ } else if (inst->DstReg.Index == FRAG_RESULT_DEPR) {
+ pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+ }
+ } else {
+ GLuint hwindex = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
+ if (pairinst->NeedRGB) {
+ pair->RGB.DestIndex = hwindex;
+ pair->RGB.WriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ;
+ }
+ if (pairinst->NeedAlpha) {
+ pair->Alpha.DestIndex = hwindex;
+ pair->Alpha.WriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+ }
+ }
+}
+
+
+/**
+ * Find a good ALU instruction or pair of ALU instruction and emit it.
+ *
+ * Prefer emitting full ALU instructions, so that when we reach a point
+ * where no full ALU instruction can be emitted, we have more candidates
+ * for RGB/Alpha pairing.
+ */
+static void emit_alu(struct pair_state *s)
+{
+ struct radeon_pair_instruction pair;
+
+ if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) {
+ int ip;
+ if (s->ReadyFullALU) {
+ ip = s->ReadyFullALU - s->Instructions;
+ s->ReadyFullALU = s->ReadyFullALU->NextReady;
+ } else if (s->ReadyRGB) {
+ ip = s->ReadyRGB - s->Instructions;
+ s->ReadyRGB = s->ReadyRGB->NextReady;
+ } else {
+ ip = s->ReadyAlpha - s->Instructions;
+ s->ReadyAlpha = s->ReadyAlpha->NextReady;
+ }
+
+ _mesa_bzero(&pair, sizeof(pair));
+ fill_instruction_into_pair(s, &pair, ip);
+ fill_dest_into_pair(s, &pair, ip);
+ commit_instruction(s, ip);
+ } else {
+ struct pair_state_instruction **prgb;
+ struct pair_state_instruction **palpha;
+
+ /* Some pairings might fail because they require too
+ * many source slots; try all possible pairings if necessary */
+ for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
+ for(palpha = &s->ReadyAlpha; *palpha; palpha = &(*palpha)->NextReady) {
+ int rgbip = *prgb - s->Instructions;
+ int alphaip = *palpha - s->Instructions;
+ _mesa_bzero(&pair, sizeof(pair));
+ fill_instruction_into_pair(s, &pair, rgbip);
+ if (!fill_instruction_into_pair(s, &pair, alphaip))
+ continue;
+ *prgb = (*prgb)->NextReady;
+ *palpha = (*palpha)->NextReady;
+ fill_dest_into_pair(s, &pair, rgbip);
+ fill_dest_into_pair(s, &pair, alphaip);
+ commit_instruction(s, rgbip);
+ commit_instruction(s, alphaip);
+ goto success;
+ }
+ }
+
+ /* No success in pairing; just take the first RGB instruction */
+ int ip = s->ReadyRGB - s->Instructions;
+ s->ReadyRGB = s->ReadyRGB->NextReady;
+ _mesa_bzero(&pair, sizeof(pair));
+ fill_instruction_into_pair(s, &pair, ip);
+ fill_dest_into_pair(s, &pair, ip);
+ commit_instruction(s, ip);
+ success: ;
+ }
+
+ if (s->Debug)
+ radeonPrintPairInstruction(&pair);
+
+ s->Error = s->Error || !s->Handler->EmitPaired(s->UserData, &pair);
+}
+
+
+GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
+ const struct radeon_pair_handler* handler, void *userdata)
+{
+ struct pair_state s;
+
+ _mesa_bzero(&s, sizeof(s));
+ s.Ctx = ctx;
+ s.Program = program;
+ s.Handler = handler;
+ s.UserData = userdata;
+ s.Debug = (RADEON_DEBUG & DEBUG_PIXEL) ? GL_TRUE : GL_FALSE;
+ s.Verbose = GL_FALSE && s.Debug;
+
+ s.Instructions = (struct pair_state_instruction*)_mesa_calloc(
+ sizeof(struct pair_state_instruction)*s.Program->NumInstructions);
+ s.ValuePool = (struct reg_value*)_mesa_calloc(sizeof(struct reg_value)*s.Program->NumInstructions*4);
+ s.ReaderPool = (struct reg_value_reader*)_mesa_calloc(
+ sizeof(struct reg_value_reader)*s.Program->NumInstructions*12);
+
+ if (s.Debug)
+ _mesa_printf("Emit paired program\n");
+
+ scan_instructions(&s);
+ allocate_input_registers(&s);
+
+ while(!s.Error &&
+ (s.ReadyTEX || s.ReadyRGB || s.ReadyAlpha || s.ReadyFullALU)) {
+ if (s.ReadyTEX)
+ emit_all_tex(&s);
+
+ while(s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha)
+ emit_alu(&s);
+ }
+
+ if (s.Debug)
+ _mesa_printf(" END\n");
+
+ _mesa_free(s.Instructions);
+ _mesa_free(s.ValuePool);
+ _mesa_free(s.ReaderPool);
+
+ return !s.Error;
+}
+
+
+static void print_pair_src(int i, struct radeon_pair_instruction_source* src)
+{
+ _mesa_printf(" Src%i = %s[%i]", i, src->Constant ? "CNST" : "TEMP", src->Index);
+}
+
+static const char* opcode_string(GLuint opcode)
+{
+ if (opcode == OPCODE_REPL_ALPHA)
+ return "SOP";
+ else
+ return _mesa_opcode_string(opcode);
+}
+
+static int num_pairinst_args(GLuint opcode)
+{
+ if (opcode == OPCODE_REPL_ALPHA)
+ return 0;
+ else
+ return _mesa_num_inst_src_regs(opcode);
+}
+
+static char swizzle_char(GLuint swz)
+{
+ switch(swz) {
+ case SWIZZLE_X: return 'x';
+ case SWIZZLE_Y: return 'y';
+ case SWIZZLE_Z: return 'z';
+ case SWIZZLE_W: return 'w';
+ case SWIZZLE_ZERO: return '0';
+ case SWIZZLE_ONE: return '1';
+ case SWIZZLE_NIL: return '_';
+ default: return '?';
+ }
+}
+
+void radeonPrintPairInstruction(struct radeon_pair_instruction *inst)
+{
+ int nargs;
+ int i;
+
+ _mesa_printf(" RGB: ");
+ for(i = 0; i < 3; ++i) {
+ if (inst->RGB.Src[i].Used)
+ print_pair_src(i, inst->RGB.Src + i);
+ }
+ _mesa_printf("\n");
+ _mesa_printf(" Alpha:");
+ for(i = 0; i < 3; ++i) {
+ if (inst->Alpha.Src[i].Used)
+ print_pair_src(i, inst->Alpha.Src + i);
+ }
+ _mesa_printf("\n");
+
+ _mesa_printf(" %s%s", opcode_string(inst->RGB.Opcode), inst->RGB.Saturate ? "_SAT" : "");
+ if (inst->RGB.WriteMask)
+ _mesa_printf(" TEMP[%i].%s%s%s", inst->RGB.DestIndex,
+ (inst->RGB.WriteMask & 1) ? "x" : "",
+ (inst->RGB.WriteMask & 2) ? "y" : "",
+ (inst->RGB.WriteMask & 4) ? "z" : "");
+ if (inst->RGB.OutputWriteMask)
+ _mesa_printf(" COLOR.%s%s%s",
+ (inst->RGB.OutputWriteMask & 1) ? "x" : "",
+ (inst->RGB.OutputWriteMask & 2) ? "y" : "",
+ (inst->RGB.OutputWriteMask & 4) ? "z" : "");
+ nargs = num_pairinst_args(inst->RGB.Opcode);
+ for(i = 0; i < nargs; ++i) {
+ const char* abs = inst->RGB.Arg[i].Abs ? "|" : "";
+ const char* neg = inst->RGB.Arg[i].Negate ? "-" : "";
+ _mesa_printf(", %s%sSrc%i.%c%c%c%s", neg, abs, inst->RGB.Arg[i].Source,
+ swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 0)),
+ swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 1)),
+ swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 2)),
+ abs);
+ }
+ _mesa_printf("\n");
+
+ _mesa_printf(" %s%s", opcode_string(inst->Alpha.Opcode), inst->Alpha.Saturate ? "_SAT" : "");
+ if (inst->Alpha.WriteMask)
+ _mesa_printf(" TEMP[%i].w", inst->Alpha.DestIndex);
+ if (inst->Alpha.OutputWriteMask)
+ _mesa_printf(" COLOR.w");
+ if (inst->Alpha.DepthWriteMask)
+ _mesa_printf(" DEPTH.w");
+ nargs = num_pairinst_args(inst->Alpha.Opcode);
+ for(i = 0; i < nargs; ++i) {
+ const char* abs = inst->Alpha.Arg[i].Abs ? "|" : "";
+ const char* neg = inst->Alpha.Arg[i].Negate ? "-" : "";
+ _mesa_printf(", %s%sSrc%i.%c%s", neg, abs, inst->Alpha.Arg[i].Source,
+ swizzle_char(inst->Alpha.Arg[i].Swizzle), abs);
+ }
+ _mesa_printf("\n");
+}