summaryrefslogtreecommitdiff
path: root/src/gallium/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--src/gallium/drivers/cell/common.h10
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.c5
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fp.c970
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fragment.c728
-rw-r--r--src/gallium/drivers/cell/ppu/cell_pipe_state.c4
-rw-r--r--src/gallium/drivers/cell/ppu/cell_screen.c14
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_emit.c5
-rw-r--r--src/gallium/drivers/cell/spu/.gitignore1
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.c127
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.h8
-rw-r--r--src/gallium/drivers/cell/spu/spu_per_fragment_op.c164
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.c114
-rw-r--r--src/gallium/drivers/softpipe/sp_fs_exec.c29
13 files changed, 1833 insertions, 346 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index cb0631baf5..f0ff96eb47 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -104,11 +104,11 @@
#define CELL_BUFFER_STATUS_FREE 10
#define CELL_BUFFER_STATUS_USED 20
-
-#define CELL_DEBUG_CHECKER (1 << 0)
-#define CELL_DEBUG_SYNC (1 << 1)
-
-
+#define CELL_DEBUG_CHECKER (1 << 0)
+#define CELL_DEBUG_ASM (1 << 1)
+#define CELL_DEBUG_SYNC (1 << 2)
+#define CELL_DEBUG_FRAGMENT_OPS (1 << 3)
+#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
/** Max instructions for doing per-fragment operations */
#define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 71f1a3049d..62e213ea35 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -85,13 +85,14 @@ cell_draw_create(struct cell_context *cell)
}
-#ifdef DEBUG
static const struct debug_named_value cell_debug_flags[] = {
{"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+ {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */
{"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */
+ {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
+ {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
{NULL, 0}
};
-#endif
struct pipe_context *
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 6ffe94eb14..1bc803d590 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -51,9 +51,13 @@
#include "cell_gen_fp.h"
-/** Set to 1 to enable debug/disassembly printfs */
-#define DISASSEM 01
+#define MAX_TEMPS 16
+#define MAX_IMMED 8
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
/**
* Context needed during code generation.
@@ -63,13 +67,21 @@ struct codegen
int inputs_reg; /**< 1st function parameter */
int outputs_reg; /**< 2nd function parameter */
int constants_reg; /**< 3rd function parameter */
- int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */
+ int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
+ int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */
+
+ int num_imm; /**< number of immediates */
int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */
/** Per-instruction temps / intermediate temps */
int num_itemps;
- int itemps[3];
+ int itemps[10];
+
+ /** Current IF/ELSE/ENDIF nesting level */
+ int if_nesting;
+ /** Index of execution mask register */
+ int exec_mask_reg;
struct spe_function *f;
boolean error;
@@ -112,19 +124,47 @@ get_const_one_reg(struct codegen *gen)
{
if (gen->one_reg <= 0) {
gen->one_reg = spe_allocate_available_register(gen->f);
- }
- /* one = {1.0, 1.0, 1.0, 1.0} */
- spe_load_float(gen->f, gen->one_reg, 1.0f);
-#if DISASSEM
- printf("il\tr%d, 1.0f\n", gen->one_reg);
-#endif
+ spe_indent(gen->f, 4);
+ spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
+
+ /* one = {1.0, 1.0, 1.0, 1.0} */
+ spe_load_float(gen->f, gen->one_reg, 1.0f);
+
+ spe_indent(gen->f, -4);
+ }
return gen->one_reg;
}
/**
+ * Return index of the pixel execution mask.
+ * The register is allocated an initialized upon the first call.
+ *
+ * The pixel execution mask controls which pixels in a quad are
+ * modified, according to surrounding conditionals, loops, etc.
+ */
+static int
+get_exec_mask_reg(struct codegen *gen)
+{
+ if (gen->exec_mask_reg <= 0) {
+ gen->exec_mask_reg = spe_allocate_available_register(gen->f);
+
+ spe_indent(gen->f, 4);
+ spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
+
+ /* exec_mask = {~0, ~0, ~0, ~0} */
+ spe_load_int(gen->f, gen->exec_mask_reg, ~0);
+
+ spe_indent(gen->f, -4);
+ }
+
+ return gen->exec_mask_reg;
+}
+
+
+/**
* Return the index of the SPU temporary containing the named TGSI
* source register. If the TGSI register is a TGSI_FILE_TEMPORARY we
* just return the corresponding SPE register. If the TGIS register
@@ -136,12 +176,15 @@ get_src_reg(struct codegen *gen,
int channel,
const struct tgsi_full_src_register *src)
{
- int reg;
+ int reg = -1;
+ int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+ boolean reg_is_itemp = FALSE;
+ uint sign_op;
- /* XXX need to examine src swizzle info here.
- * That will involve changing the channel var...
- */
+ assert(swizzle >= 0);
+ assert(swizzle <= 3);
+ channel = swizzle;
switch (src->SrcRegister.File) {
case TGSI_FILE_TEMPORARY:
@@ -152,21 +195,57 @@ get_src_reg(struct codegen *gen,
/* offset is measured in quadwords, not bytes */
int offset = src->SrcRegister.Index * 4 + channel;
reg = get_itemp(gen);
+ reg_is_itemp = TRUE;
/* Load: reg = memory[(machine_reg) + offset] */
spe_lqd(gen->f, reg, gen->inputs_reg, offset);
-#if DISASSEM
- printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
-#endif
}
break;
case TGSI_FILE_IMMEDIATE:
- /* xxx fall-through for now / fix */
+ reg = gen->imm_regs[src->SrcRegister.Index][channel];
+ break;
case TGSI_FILE_CONSTANT:
/* xxx fall-through for now / fix */
default:
assert(0);
}
+ /*
+ * Handle absolute value, negate or set-negative of src register.
+ */
+ sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+ if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+ /*
+ * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
+ */
+ const int bit31mask_reg = get_itemp(gen);
+ int result_reg;
+
+ if (reg_is_itemp) {
+ /* re-use 'reg' for the result */
+ result_reg = reg;
+ }
+ else {
+ /* alloc a new reg for the result */
+ result_reg = get_itemp(gen);
+ }
+
+ /* mask with bit 31 set, the rest cleared */
+ spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+
+ if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
+ spe_andc(gen->f, result_reg, reg, bit31mask_reg);
+ }
+ else if (sign_op == TGSI_UTIL_SIGN_SET) {
+ spe_and(gen->f, result_reg, reg, bit31mask_reg);
+ }
+ else {
+ assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
+ spe_xor(gen->f, result_reg, reg, bit31mask_reg);
+ }
+
+ reg = result_reg;
+ }
+
return reg;
}
@@ -183,11 +262,14 @@ get_dst_reg(struct codegen *gen,
int channel,
const struct tgsi_full_dst_register *dest)
{
- int reg;
+ int reg = -1;
switch (dest->DstRegister.File) {
case TGSI_FILE_TEMPORARY:
- reg = gen->temp_regs[dest->DstRegister.Index][channel];
+ if (gen->if_nesting > 0)
+ reg = get_itemp(gen);
+ else
+ reg = gen->temp_regs[dest->DstRegister.Index][channel];
break;
case TGSI_FILE_OUTPUT:
reg = get_itemp(gen);
@@ -213,17 +295,40 @@ store_dest_reg(struct codegen *gen,
{
switch (dest->DstRegister.File) {
case TGSI_FILE_TEMPORARY:
- /* no-op */
+ if (gen->if_nesting > 0) {
+ int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
+ int exec_reg = get_exec_mask_reg(gen);
+ /* Mix d with new value according to exec mask:
+ * d[i] = mask_reg[i] ? value_reg : d_reg
+ */
+ spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
+ }
+ else {
+ /* we're not inside a condition or loop: do nothing special */
+ }
break;
case TGSI_FILE_OUTPUT:
{
/* offset is measured in quadwords, not bytes */
int offset = dest->DstRegister.Index * 4 + channel;
- /* Store: memory[(machine_reg) + offset] = reg */
- spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
-#if DISASSEM
- printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset);
-#endif
+ if (gen->if_nesting > 0) {
+ int exec_reg = get_exec_mask_reg(gen);
+ int curval_reg = get_itemp(gen);
+ /* First read the current value from memory:
+ * Load: curval = memory[(machine_reg) + offset]
+ */
+ spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset);
+ /* Mix curval with newvalue according to exec mask:
+ * d[i] = mask_reg[i] ? value_reg : d_reg
+ */
+ spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
+ /* Store: memory[(machine_reg) + offset] = curval */
+ spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
+ }
+ else {
+ /* Store: memory[(machine_reg) + offset] = reg */
+ spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+ }
}
break;
default:
@@ -236,15 +341,13 @@ static boolean
emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
+ spe_comment(gen->f, -4, "MOV:");
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
/* XXX we don't always need to actually emit a mov instruction here */
spe_move(gen->f, dst_reg, src_reg);
-#if DISASSEM
- printf("mov\tr%d, r%d\n", dst_reg, src_reg);
-#endif
store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
@@ -253,6 +356,7 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
+
/**
* Emit addition instructions. Recall that a single TGSI_OPCODE_ADD
* becomes (up to) four SPU "fa" instructions because we're doing SOA
@@ -262,6 +366,7 @@ static boolean
emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
+ spe_comment(gen->f, -4, "ADD:");
/* Loop over Red/Green/Blue/Alpha channels */
for (ch = 0; ch < 4; ch++) {
/* If the dest R, G, B or A writemask is enabled... */
@@ -273,9 +378,6 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* Emit actual SPE instruction: d = s1 + s2 */
spe_fa(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
- printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
/* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
@@ -286,6 +388,82 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
return true;
}
+/**
+ * Emit subtract. See emit_ADD for comments.
+ */
+static boolean
+emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "SUB:");
+ /* Loop over Red/Green/Blue/Alpha channels */
+ for (ch = 0; ch < 4; ch++) {
+ /* If the dest R, G, B or A writemask is enabled... */
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ /* get indexes of the two src, one dest SPE registers */
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* Emit actual SPE instruction: d = s1 - s2 */
+ spe_fs(gen->f, d_reg, s1_reg, s2_reg);
+
+ /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ /* Free any intermediate temps we allocated */
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
+
+/**
+ * Emit multiply add. See emit_ADD for comments.
+ */
+static boolean
+emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "MAD:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ /* d = s1 * s2 + s3 */
+ spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
+
+
+/**
+ * Emit linear interpolate. See emit_ADD for comments.
+ */
+static boolean
+emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "LERP:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ /* d = s3 + s1(s2 - s3) */
+ spe_fs(gen->f, d_reg, s2_reg, s3_reg);
+ spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
/**
* Emit multiply. See emit_ADD for comments.
@@ -294,6 +472,7 @@ static boolean
emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
+ spe_comment(gen->f, -4, "MUL:");
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -301,9 +480,6 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
/* d = s1 * s2 */
spe_fm(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
- printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
@@ -311,6 +487,151 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
return true;
}
+/**
+ * Emit reciprocal. See emit_ADD for comments.
+ */
+static boolean
+emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "RCP:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ /* d = 1/s1 */
+ spe_frest(gen->f, d_reg, s1_reg);
+ spe_fi(gen->f, d_reg, s1_reg, d_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
+
+/**
+ * Emit reciprocal sqrt. See emit_ADD for comments.
+ */
+static boolean
+emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "RSQ:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ /* d = 1/s1 */
+ spe_frsqest(gen->f, d_reg, s1_reg);
+ spe_fi(gen->f, d_reg, s1_reg, d_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
+
+/**
+ * Emit absolute value. See emit_ADD for comments.
+ */
+static boolean
+emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "ABS:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ const int bit31mask_reg = get_itemp(gen);
+
+ /* mask with bit 31 set, the rest cleared */
+ spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+
+ /* d = sign bit cleared in s1 */
+ spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
+
+/**
+ * Emit 3 component dot product. See emit_ADD for comments.
+ */
+static boolean
+emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "DP3:");
+
+ int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
+ /* d = x * x */
+ spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ /* d = y * y + d */
+ spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ /* d = z * z + d */
+ spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+
+ free_itemps(gen);
+ return true;
+}
+
+/**
+ * Emit 4 component dot product. See emit_ADD for comments.
+ */
+static boolean
+emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "DP3:");
+
+ int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
+ /* d = x * x */
+ spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ /* d = y * y + d */
+ spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ /* d = z * z + d */
+ spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+ /* d = w * w + d */
+ spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+
+ free_itemps(gen);
+ return true;
+}
/**
* Emit set-if-greater-than.
@@ -323,6 +644,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
+ spe_comment(gen->f, -4, "SGT:");
+
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -331,16 +654,10 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* d = (s1 > s2) */
spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
- printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
/* convert d from 0x0/0xffffffff to 0.0/1.0 */
/* d = d & one_reg */
spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-#if DISASSEM
- printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
-#endif
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
@@ -350,6 +667,421 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
return true;
}
+/**
+ * Emit set-if_less-then. See emit_SGT for comments.
+ */
+static boolean
+emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SLT:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 < s2) */
+ spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = d & one_reg */
+ spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if_greater-then-or-equal. See emit_SGT for comments.
+ */
+static boolean
+emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SGE:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 >= s2) */
+ spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = ~d & one_reg */
+ spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if_less-then-or-equal. See emit_SGT for comments.
+ */
+static boolean
+emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SLE:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 <= s2) */
+ spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = ~d & one_reg */
+ spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if_equal. See emit_SGT for comments.
+ */
+static boolean
+emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SEQ:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 == s2) */
+ spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = d & one_reg */
+ spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if_not_equal. See emit_SGT for comments.
+ */
+static boolean
+emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SNE:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 != s2) */
+ spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+ spe_nor(gen->f, d_reg, d_reg, d_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = d & one_reg */
+ spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit compare. See emit_SGT for comments.
+ */
+static boolean
+emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "CMP:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int zero_reg = get_itemp(gen);
+
+ spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+ /* d = (s1 < 0) ? s2 : s3 */
+ spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+ spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit floor.
+ * If negative int subtract one
+ * Convert float to signed int
+ * Convert signed int to float
+ */
+static boolean
+emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "FLR:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int tmp_reg = get_itemp(gen);
+
+ /* If negative, subtract 1.0 */
+ spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg);
+ spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg);
+ spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg);
+ spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+ /* Convert float to int */
+ spe_cflts(gen->f, d_reg, d_reg, 0);
+
+ /* Convert int to float */
+ spe_csflt(gen->f, d_reg, d_reg, 0);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit frac.
+ * Input - FLR(Input)
+ */
+static boolean
+emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "FLR:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int tmp_reg = get_itemp(gen);
+
+ /* If negative, subtract 1.0 */
+ spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg);
+ spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg);
+ spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg);
+ spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+ /* Convert float to int */
+ spe_cflts(gen->f, d_reg, d_reg, 0);
+
+ /* Convert int to float */
+ spe_csflt(gen->f, d_reg, d_reg, 0);
+
+ /* d = s1 - FLR(s1) */
+ spe_fs(gen->f, d_reg, s1_reg, d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+
+/**
+ * Emit max. See emit_SGT for comments.
+ */
+static boolean
+emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "MAX:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 > s2) ? s1 : s2 */
+ spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+ spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit max. See emit_SGT for comments.
+ */
+static boolean
+emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "MIN:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s2 > s1) ? s1 : s2 */
+ spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+ spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+static boolean
+emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ const int channel = 0;
+ const int exec_reg = get_exec_mask_reg(gen);
+
+ spe_comment(gen->f, -4, "IF:");
+
+ /* update execution mask with the predicate register */
+ int tmp_reg = get_itemp(gen);
+ int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
+
+ /* tmp = (s1_reg == 0) */
+ spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
+ /* tmp = !tmp */
+ spe_complement(gen->f, tmp_reg, tmp_reg);
+ /* exec_mask = exec_mask & tmp */
+ spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
+
+ gen->if_nesting++;
+
+ free_itemps(gen);
+
+ return true;
+}
+
+
+static boolean
+emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ const int exec_reg = get_exec_mask_reg(gen);
+
+ spe_comment(gen->f, -4, "ELSE:");
+
+ /* exec_mask = !exec_mask */
+ spe_complement(gen->f, exec_reg, exec_reg);
+
+ return true;
+}
+
+
+static boolean
+emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ const int exec_reg = get_exec_mask_reg(gen);
+
+ spe_comment(gen->f, -4, "ENDIF:");
+
+ /* XXX todo: pop execution mask */
+
+ spe_load_int(gen->f, exec_reg, ~0x0);
+
+ gen->if_nesting--;
+ return true;
+}
+
+
+static boolean
+emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
+ boolean ddx)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ int t1_reg = get_itemp(gen);
+ int t2_reg = get_itemp(gen);
+
+ spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
+ if (ddx) {
+ spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
+ }
+ else {
+ spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
+ }
+ spe_fs(gen->f, d_reg, t2_reg, t1_reg);
+
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+
+
/**
* Emit END instruction.
@@ -361,11 +1093,9 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_END(struct codegen *gen)
{
+ spe_comment(gen->f, -4, "END:");
/* return from function call */
spe_bi(gen->f, SPE_REG_RA, 0, 0);
-#if DISASSEM
- printf("bi\trRA\n");
-#endif
return true;
}
@@ -384,14 +1114,64 @@ emit_instruction(struct codegen *gen,
return emit_MUL(gen, inst);
case TGSI_OPCODE_ADD:
return emit_ADD(gen, inst);
+ case TGSI_OPCODE_SUB:
+ return emit_SUB(gen, inst);
+ case TGSI_OPCODE_MAD:
+ return emit_MAD(gen, inst);
+ case TGSI_OPCODE_LERP:
+ return emit_LERP(gen, inst);
+ case TGSI_OPCODE_DP3:
+ return emit_DP3(gen, inst);
+ case TGSI_OPCODE_DP4:
+ return emit_DP4(gen, inst);
+ case TGSI_OPCODE_RCP:
+ return emit_RCP(gen, inst);
+ case TGSI_OPCODE_RSQ:
+ return emit_RSQ(gen, inst);
+ case TGSI_OPCODE_ABS:
+ return emit_ABS(gen, inst);
case TGSI_OPCODE_SGT:
return emit_SGT(gen, inst);
+ case TGSI_OPCODE_SLT:
+ return emit_SLT(gen, inst);
+ case TGSI_OPCODE_SGE:
+ return emit_SGE(gen, inst);
+ case TGSI_OPCODE_SLE:
+ return emit_SLE(gen, inst);
+ case TGSI_OPCODE_SEQ:
+ return emit_SEQ(gen, inst);
+ case TGSI_OPCODE_SNE:
+ return emit_SNE(gen, inst);
+ case TGSI_OPCODE_CMP:
+ return emit_CMP(gen, inst);
+ case TGSI_OPCODE_MAX:
+ return emit_MAX(gen, inst);
+ case TGSI_OPCODE_MIN:
+ return emit_MIN(gen, inst);
+ case TGSI_OPCODE_FLR:
+ return emit_FLR(gen, inst);
+ case TGSI_OPCODE_FRC:
+ return emit_FRC(gen, inst);
case TGSI_OPCODE_END:
return emit_END(gen);
+ case TGSI_OPCODE_IF:
+ return emit_IF(gen, inst);
+ case TGSI_OPCODE_ELSE:
+ return emit_ELSE(gen, inst);
+ case TGSI_OPCODE_ENDIF:
+ return emit_ENDIF(gen, inst);
+
+ case TGSI_OPCODE_DDX:
+ return emit_DDX_DDY(gen, inst, true);
+ case TGSI_OPCODE_DDY:
+ return emit_DDX_DDY(gen, inst, false);
+
/* XXX lots more cases to do... */
default:
+ fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
+ inst->Instruction.Opcode);
return false;
}
@@ -401,45 +1181,88 @@ emit_instruction(struct codegen *gen,
/**
+ * Emit code for a TGSI immediate value (vector of four floats).
+ * This involves register allocation and initialization.
+ * XXX the initialization should be done by a "prepare" stage, not
+ * per quad execution!
+ */
+static boolean
+emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
+{
+ int ch;
+
+ assert(gen->num_imm < MAX_TEMPS);
+
+ spe_comment(gen->f, -4, "IMMEDIATE:");
+
+ for (ch = 0; ch < 4; ch++) {
+ float val = immed->u.ImmediateFloat32[ch].Float;
+ int reg = spe_allocate_available_register(gen->f);
+
+ if (reg < 0)
+ return false;
+
+ /* update immediate map */
+ gen->imm_regs[gen->num_imm][ch] = reg;
+
+ /* emit initializer instruction */
+ spe_load_float(gen->f, reg, val);
+ }
+
+ gen->num_imm++;
+
+ return true;
+}
+
+
+
+/**
* Emit "code" for a TGSI declaration.
* We only care about TGSI TEMPORARY register declarations at this time.
* For each TGSI TEMPORARY we allocate four SPE registers.
*/
-static void
-emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
+static boolean
+emit_declaration(struct cell_context *cell,
+ struct codegen *gen, const struct tgsi_full_declaration *decl)
{
int i, ch;
switch (decl->Declaration.File) {
case TGSI_FILE_TEMPORARY:
-#if DISASSEM
- printf("Declare temp reg %d .. %d\n",
- decl->DeclarationRange.First,
- decl->DeclarationRange.Last);
-#endif
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ printf("Declare temp reg %d .. %d\n",
+ decl->DeclarationRange.First,
+ decl->DeclarationRange.Last);
+ }
+
for (i = decl->DeclarationRange.First;
i <= decl->DeclarationRange.Last;
i++) {
+ assert(i < MAX_TEMPS);
for (ch = 0; ch < 4; ch++) {
gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+ if (gen->temp_regs[i][ch] < 0)
+ return false; /* out of regs */
}
/* XXX if we run out of SPE registers, we need to spill
* to SPU memory. someday...
*/
-#if DISASSEM
- printf(" SPE regs: %d %d %d %d\n",
- gen->temp_regs[i][0],
- gen->temp_regs[i][1],
- gen->temp_regs[i][2],
- gen->temp_regs[i][3]);
-#endif
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ printf(" SPE regs: %d %d %d %d\n",
+ gen->temp_regs[i][0],
+ gen->temp_regs[i][1],
+ gen->temp_regs[i][2],
+ gen->temp_regs[i][3]);
+ }
}
break;
default:
; /* ignore */
}
+
+ return true;
}
@@ -472,10 +1295,12 @@ cell_gen_fragment_program(struct cell_context *cell,
spe_allocate_register(f, gen.outputs_reg);
spe_allocate_register(f, gen.constants_reg);
-#if DISASSEM
- printf("Begin %s\n", __FUNCTION__);
- tgsi_dump(tokens, 0);
-#endif
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ spe_print_code(f, true);
+ spe_indent(f, 8);
+ printf("Begin %s\n", __FUNCTION__);
+ tgsi_dump(tokens, 0);
+ }
tgsi_parse_init(&parse, tokens);
@@ -484,25 +1309,22 @@ cell_gen_fragment_program(struct cell_context *cell,
switch (parse.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE:
-#if 0
- if (!note_immediate(&gen, &parse.FullToken.FullImmediate ))
- goto fail;
-#endif
+ if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
+ gen.error = true;
break;
case TGSI_TOKEN_TYPE_DECLARATION:
- emit_declaration(&gen, &parse.FullToken.FullDeclaration);
+ if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
+ gen.error = true;
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
- if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) {
+ if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
gen.error = true;
- }
break;
default:
assert(0);
-
}
}
@@ -512,10 +1334,10 @@ cell_gen_fragment_program(struct cell_context *cell,
return emit_END(&gen);
}
-#if DISASSEM
- printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
- printf("End %s\n", __FUNCTION__);
-#endif
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+ printf("End %s\n", __FUNCTION__);
+ }
tgsi_parse_free( &parse );
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 06219d4e98..1837b4c79b 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -60,6 +60,9 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
struct spe_function *f,
int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
{
+ /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
+ * quantities. This only makes a difference for 32-bit Z values though.
+ */
ASSERT(dsa->depth.enabled);
switch (dsa->depth.func) {
@@ -79,28 +82,28 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
case PIPE_FUNC_GREATER:
/* zmask = (ifragZ > ref) */
- spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+ spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
/* mask = (mask & zmask) */
spe_and(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_LESS:
/* zmask = (ref > ifragZ) */
- spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+ spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
/* mask = (mask & zmask) */
spe_and(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_LEQUAL:
/* zmask = (ifragZ > ref) */
- spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+ spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
/* mask = (mask & ~zmask) */
spe_andc(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_GEQUAL:
/* zmask = (ref > ifragZ) */
- spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+ spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
/* mask = (mask & ~zmask) */
spe_andc(f, mask_reg, mask_reg, zmask_reg);
break;
@@ -229,7 +232,27 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
spe_release_register(f, amask_reg);
}
+/* This pair of functions is used inline to allocate and deallocate
+ * optional constant registers. Once a constant is discovered to be
+ * needed, we will likely need it again, so we don't want to deallocate
+ * it and have to allocate and load it again unnecessarily.
+ */
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+ if (*is_already_set) return;
+ *r = spe_allocate_available_register(f);
+ spe_load_float(f, *r, value);
+ *is_already_set = true;
+}
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+ if (!*is_already_set) return;
+ spe_release_register(f, r);
+ *is_already_set = false;
+}
/**
* Generate SPE code to implement the given blend mode for a quad of pixels.
@@ -242,6 +265,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
*/
static void
gen_blend(const struct pipe_blend_state *blend,
+ const struct pipe_blend_color *blend_color,
struct spe_function *f,
enum pipe_format color_format,
int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
@@ -262,10 +286,17 @@ gen_blend(const struct pipe_blend_state *blend,
int fbB_reg = spe_allocate_available_register(f);
int fbA_reg = spe_allocate_available_register(f);
- int one_reg = spe_allocate_available_register(f);
int tmp_reg = spe_allocate_available_register(f);
- boolean one_reg_set = false; /* avoid setting one_reg more than once */
+ /* Optional constant registers we might or might not end up using;
+ * if we do use them, make sure we only allocate them once by
+ * keeping a flag on each one.
+ */
+ boolean one_reg_set = false;
+ unsigned int one_reg;
+ boolean constR_reg_set = false, constG_reg_set = false,
+ constB_reg_set = false, constA_reg_set = false;
+ unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
ASSERT(blend->blend_enable);
@@ -346,127 +377,449 @@ gen_blend(const struct pipe_blend_state *blend,
spe_release_register(f, mask_reg);
}
-
/*
- * Compute Src RGB terms
+ * Compute Src RGB terms. We're actually looking for the value
+ * of (the appropriate RGB factors) * (the incoming source RGB color),
+ * because in some cases (like PIPE_BLENDFACTOR_ONE and
+ * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
*/
switch (blend->rgb_src_factor) {
case PIPE_BLENDFACTOR_ONE:
+ /* factors = (1,1,1), so term = (R,G,B) */
spe_move(f, term1R_reg, fragR_reg);
spe_move(f, term1G_reg, fragG_reg);
spe_move(f, term1B_reg, fragB_reg);
break;
case PIPE_BLENDFACTOR_ZERO:
- spe_zero(f, term1R_reg);
- spe_zero(f, term1G_reg);
- spe_zero(f, term1B_reg);
+ /* factors = (0,0,0), so term = (0,0,0) */
+ spe_load_float(f, term1R_reg, 0.0f);
+ spe_load_float(f, term1G_reg, 0.0f);
+ spe_load_float(f, term1B_reg, 0.0f);
break;
case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
break;
case PIPE_BLENDFACTOR_SRC_ALPHA:
+ /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
break;
- /* XXX more cases */
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
+ * or in other words term = (R-R*R, G-G*G, B-B*B)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
+ spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
+ spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
+ spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+ * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+ /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+ * or term = (R-R*A,G-G*A,B-B*A)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
+ spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
+ spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
+ spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
+ * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+ spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+ spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+ spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ /* we'll need the optional constant alpha register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+ spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+ spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+ spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
+ * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+ * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+ /* We'll need the optional {1,1,1,1} register */
+ setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
+ /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
+ * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+ * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+ * as long as a is positive), but then we'd have to do three
+ * spe_float_min() functions instead of one, so this is simpler.
+ */
+ /* tmp = 1 - Afb */
+ spe_fs(f, tmp_reg, one_reg, fbA_reg);
+ /* tmp = min(A,tmp) */
+ spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
+ /* term = R*tmp */
+ spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+ spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+ spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
default:
ASSERT(0);
}
/*
- * Compute Src Alpha term
+ * Compute Src Alpha term. Like the above, we're looking for
+ * the full term A*factor, not just the factor itself, because
+ * in many cases we can avoid doing unnecessary multiplies.
*/
switch (blend->alpha_src_factor) {
+ case PIPE_BLENDFACTOR_ZERO:
+ /* factor = 0, so term = 0 */
+ spe_load_float(f, term1A_reg, 0.0f);
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
case PIPE_BLENDFACTOR_ONE:
+ /* factor = 1, so term = A */
spe_move(f, term1A_reg, fragA_reg);
break;
+
case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factor = A, so term = A*A */
spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
break;
case PIPE_BLENDFACTOR_SRC_ALPHA:
spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
break;
- /* XXX more cases */
+
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factor = 1-A, so term = A*(1-A) = A-A*A */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factor = Afb, so term = A*Afb */
+ spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = Ac, so term = A*Ac */
+ spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
default:
ASSERT(0);
}
/*
- * Compute Dest RGB terms
+ * Compute Dest RGB term. Like the above, we're looking for
+ * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+ * in many cases we can avoid doing unnecessary multiplies.
*/
switch (blend->rgb_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
+ /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
spe_move(f, term2R_reg, fbR_reg);
spe_move(f, term2G_reg, fbG_reg);
spe_move(f, term2B_reg, fbB_reg);
break;
case PIPE_BLENDFACTOR_ZERO:
- spe_zero(f, term2R_reg);
- spe_zero(f, term2G_reg);
- spe_zero(f, term2B_reg);
+ /* factor s= (0,0,0), so term = (0,0,0) */
+ spe_load_float(f, term2R_reg, 0.0f);
+ spe_load_float(f, term2G_reg, 0.0f);
+ spe_load_float(f, term2B_reg, 0.0f);
break;
case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
break;
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
+ * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+ break;
case PIPE_BLENDFACTOR_SRC_ALPHA:
+ /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
break;
case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- /* one = {1.0, 1.0, 1.0, 1.0} */
- if (!one_reg_set) {
- spe_load_float(f, one_reg, 1.0f);
- one_reg_set = true;
- }
- /* tmp = one - fragA */
- spe_fs(f, tmp_reg, one_reg, fragA_reg);
- /* term = fb * tmp */
- spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
- spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
- spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
- break;
- /* XXX more cases */
+ /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+ spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+ spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+ spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+ * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+ spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+ spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+ spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
+ * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+ spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+ spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+ spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ /* we'll need the optional constant alpha register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+ spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+ spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+ spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
+ * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+ * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+ ASSERT(0);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
default:
ASSERT(0);
}
/*
- * Compute Dest Alpha term
+ * Compute Dest Alpha term. Like the above, we're looking for
+ * the full term Afb*factor, not just the factor itself, because
+ * in many cases we can avoid doing unnecessary multiplies.
*/
switch (blend->alpha_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
+ /* factor = 1, so term = Afb */
spe_move(f, term2A_reg, fbA_reg);
break;
case PIPE_BLENDFACTOR_ZERO:
- spe_zero(f, term2A_reg);
+ /* factor = 0, so term = 0 */
+ spe_load_float(f, term2A_reg, 0.0f);
break;
- case PIPE_BLENDFACTOR_SRC_ALPHA:
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factor = A, so term = Afb*A */
spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
break;
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- /* one = {1.0, 1.0, 1.0, 1.0} */
- if (!one_reg_set) {
- spe_load_float(f, one_reg, 1.0f);
- one_reg_set = true;
- }
- /* tmp = one - fragA */
- spe_fs(f, tmp_reg, one_reg, fragA_reg);
- /* termA = fbA * tmp */
- spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factor = Afb, so term = Afb*Afb */
+ spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = Ac, so term = Afb*Ac */
+ spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
break;
- /* XXX more cases */
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+ ASSERT(0);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
default:
ASSERT(0);
}
/*
- * Combine Src/Dest RGB terms
+ * Combine Src/Dest RGB terms as per the blend equation.
*/
switch (blend->rgb_func) {
case PIPE_BLEND_ADD:
@@ -479,7 +832,21 @@ gen_blend(const struct pipe_blend_state *blend,
spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
+ spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
+ spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
+ break;
+ case PIPE_BLEND_MIN:
+ spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+ spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+ spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
+ break;
+ case PIPE_BLEND_MAX:
+ spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+ spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+ spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
+ break;
default:
ASSERT(0);
}
@@ -494,7 +861,15 @@ gen_blend(const struct pipe_blend_state *blend,
case PIPE_BLEND_SUBTRACT:
spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
+ break;
+ case PIPE_BLEND_MIN:
+ spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
+ break;
+ case PIPE_BLEND_MAX:
+ spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
+ break;
default:
ASSERT(0);
}
@@ -514,8 +889,14 @@ gen_blend(const struct pipe_blend_state *blend,
spe_release_register(f, fbB_reg);
spe_release_register(f, fbA_reg);
- spe_release_register(f, one_reg);
spe_release_register(f, tmp_reg);
+
+ /* Free any optional registers that actually got used */
+ release_const_register(f, &one_reg_set, one_reg);
+ release_const_register(f, &constR_reg_set, constR_reg);
+ release_const_register(f, &constG_reg_set, constG_reg);
+ release_const_register(f, &constB_reg_set, constB_reg);
+ release_const_register(f, &constA_reg_set, constA_reg);
}
@@ -524,8 +905,69 @@ gen_logicop(const struct pipe_blend_state *blend,
struct spe_function *f,
int fragRGBA_reg, int fbRGBA_reg)
{
- /* XXX to-do */
- /* operate on 32-bit packed pixels, not float colors */
+ /* We've got four 32-bit RGBA packed pixels in each of
+ * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+ * reds, greens, blues, and alphas.
+ * */
+ ASSERT(blend->logicop_enable);
+
+ switch(blend->logicop_func) {
+ case PIPE_LOGICOP_CLEAR: /* 0 */
+ spe_zero(f, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_NOR: /* ~(s | d) */
+ spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+ /* andc R, A, B computes R = A & ~B */
+ spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+ spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+ /* andc R, A, B computes R = A & ~B */
+ spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_INVERT: /* ~d */
+ /* Note that (A nor A) == ~(A|A) == ~A */
+ spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_XOR: /* s ^ d */
+ spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_NAND: /* ~(s & d) */
+ spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_AND: /* s & d */
+ spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
+ spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_NOOP: /* d */
+ spe_move(f, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+ /* orc R, A, B computes R = A | ~B */
+ spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_COPY: /* s */
+ break;
+ case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+ /* orc R, A, B computes R = A | ~B */
+ spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_OR: /* s | d */
+ spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_SET: /* 1 */
+ spe_load_int(f, fragRGBA_reg, 0xffffffff);
+ break;
+ default:
+ ASSERT(0);
+ }
}
@@ -534,14 +976,84 @@ gen_colormask(uint colormask,
struct spe_function *f,
int fragRGBA_reg, int fbRGBA_reg)
{
- /* XXX to-do */
- /* operate on 32-bit packed pixels, not float colors */
-}
+ /* We've got four 32-bit RGBA packed pixels in each of
+ * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+ * reds, greens, blues, and alphas.
+ * */
+
+ /* The color mask operation can prevent any set of color
+ * components in the incoming fragment from being written to the frame
+ * buffer; we do this by replacing the masked components of the
+ * fragment with the frame buffer values.
+ *
+ * There are only 16 possibilities, with a unique mask for
+ * each of the possibilities. (Technically, there are only 15
+ * possibilities, since we shouldn't be called for the one mask
+ * that does nothing, but the complete implementation is here
+ * anyway to avoid confusion.)
+ *
+ * We implement this via a constant static array which we'll index
+ * into to get the correct mask.
+ *
+ * We're dependent on the mask values being low-order bits,
+ * with particular values for each bit; so we start with a
+ * few assertions, which will fail if any of the values were
+ * to change.
+ */
+ ASSERT(PIPE_MASK_R == 0x1);
+ ASSERT(PIPE_MASK_G == 0x2);
+ ASSERT(PIPE_MASK_B == 0x4);
+ ASSERT(PIPE_MASK_A == 0x8);
+ /* Here's the list of all possible colormasks, indexed by the
+ * value of the combined mask specifier.
+ */
+ static const unsigned int colormasks[16] = {
+ 0x00000000, /* 0: all colors masked */
+ 0xff000000, /* 1: PIPE_MASK_R */
+ 0x00ff0000, /* 2: PIPE_MASK_G */
+ 0xffff0000, /* 3: PIPE_MASK_R | PIPE_MASK_G */
+ 0x0000ff00, /* 4: PIPE_MASK_B */
+ 0xff00ff00, /* 5: PIPE_MASK_R | PIPE_MASK_B */
+ 0x00ffff00, /* 6: PIPE_MASK_G | PIPE_MASK_B */
+ 0xffffff00, /* 7: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B */
+ 0x000000ff, /* 8: PIPE_MASK_A */
+ 0xff0000ff, /* 9: PIPE_MASK_R | PIPE_MASK_A */
+ 0x00ff00ff, /* 10: PIPE_MASK_G | PIPE_MASK_A */
+ 0xffff00ff, /* 11: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_A */
+ 0x0000ffff, /* 12: PIPE_MASK_B | PIPE_MASK_A */
+ 0xff00ffff, /* 13: PIPE_MASK_R | PIPE_MASK_B | PIPE_MASK_A */
+ 0x00ffffff, /* 14: PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+ 0xffffffff /* 15: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+ };
+
+ /* Get a temporary register to hold the mask */
+ int colormask_reg = spe_allocate_available_register(f);
+
+ /* Look up the desired mask directly and load it into the mask register.
+ * This will load the same mask into each of the four words in the
+ * mask register.
+ */
+ spe_load_uint(f, colormask_reg, colormasks[colormask]);
+
+ /* Use the mask register to select between the fragment color
+ * values and the frame buffer color values. Wherever the
+ * mask has a 0 bit, the current frame buffer color should override
+ * the fragment color. Wherever the mask has a 1 bit, the
+ * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
+ * instruction will select bits from its first operand rA wherever the
+ * the mask bits rM are 0, and from its second operand rB wherever the
+ * mask bits rM are 1. That means that the frame buffer color is the
+ * first operand, and the fragment color the second.
+ */
+ spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
+ /* Release the temporary register and we're done */
+ spe_release_register(f, colormask_reg);
+}
/**
- * Generate code to pack a quad of float colors into a four 32-bit integers.
+ * Generate code to pack a quad of float colors into four 32-bit integers.
*
* \param f SPE function to append instruction onto.
* \param color_format the dest color packing format
@@ -557,13 +1069,16 @@ gen_pack_colors(struct spe_function *f,
int r_reg, int g_reg, int b_reg, int a_reg,
int rgba_reg)
{
+ int rg_reg = spe_allocate_available_register(f);
+ int ba_reg = spe_allocate_available_register(f);
+
/* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
spe_cfltu(f, r_reg, r_reg, 32);
spe_cfltu(f, g_reg, g_reg, 32);
spe_cfltu(f, b_reg, b_reg, 32);
spe_cfltu(f, a_reg, a_reg, 32);
- /* Shift the most significant bytes to least the significant positions.
+ /* Shift the most significant bytes to the least significant positions.
* I.e.: reg = reg >> 24
*/
spe_rotmi(f, r_reg, r_reg, -24);
@@ -595,9 +1110,12 @@ gen_pack_colors(struct spe_function *f,
* OR-ing all those together gives us four packed colors:
* RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
*/
- spe_or(f, rgba_reg, r_reg, g_reg);
- spe_or(f, rgba_reg, rgba_reg, b_reg);
- spe_or(f, rgba_reg, rgba_reg, a_reg);
+ spe_or(f, rg_reg, r_reg, g_reg);
+ spe_or(f, ba_reg, a_reg, b_reg);
+ spe_or(f, rgba_reg, rg_reg, ba_reg);
+
+ spe_release_register(f, rg_reg);
+ spe_release_register(f, ba_reg);
}
@@ -629,6 +1147,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
const struct pipe_depth_stencil_alpha_state *dsa =
&cell->depth_stencil->base;
const struct pipe_blend_state *blend = &cell->blend->base;
+ const struct pipe_blend_color *blend_color = &cell->blend_color;
const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
/* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -652,6 +1171,13 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */
spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ spe_print_code(f, true);
+ spe_indent(f, 8);
+ spe_comment(f, -4, "Begin per-fragment ops");
+ }
+
spe_allocate_register(f, x_reg);
spe_allocate_register(f, y_reg);
spe_allocate_register(f, color_tile_reg);
@@ -710,34 +1236,50 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_release_register(f, mask_reg);
/* OK, fbZ_reg has four 24-bit Z values now */
}
+ else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+ zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+ spe_rotmi(f, fbZ_reg, fbZS_reg, -8); /* fbZ = fbZS >> 8 */
+ /* OK, fbZ_reg has four 24-bit Z values now */
+ }
+ else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+ spe_move(f, fbZ_reg, fbZS_reg);
+ /* OK, fbZ_reg has four 32-bit Z values now */
+ }
+ else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+ spe_move(f, fbZ_reg, fbZS_reg);
+ /* OK, fbZ_reg has four 16-bit Z values now */
+ }
else {
- /* XXX handle other z/stencil formats */
- ASSERT(0);
+ ASSERT(0); /* invalid format */
}
- /* Convert fragZ values from float[4] to uint[4] */
+ /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
zs_format == PIPE_FORMAT_X8Z24_UNORM ||
zs_format == PIPE_FORMAT_Z24S8_UNORM ||
zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- /* 24-bit Z values */
- int scale_reg = spe_allocate_available_register(f);
-
- /* scale_reg[0,1,2,3] = float(2^24-1) */
- spe_load_float(f, scale_reg, (float) 0xffffff);
-
- /* XXX these two instructions might be combined */
- spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 0); /* fragZ = (int) fragZ */
-
- spe_release_register(f, scale_reg);
+ /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ /* fragZ = fragZ >> 8 */
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
}
- else {
- /* XXX handle 16-bit Z format */
- ASSERT(0);
+ else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+ /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
}
+ else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+ /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ /* fragZ = fragZ >> 16 */
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+ }
+ }
+ else {
+ /* no Z test, but set Z to zero so we don't OR-in garbage below */
+ spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
}
+
if (dsa->stencil[0].enabled) {
/* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
@@ -751,7 +1293,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
ASSERT(0);
}
}
-
+ else {
+ /* no stencil test, but set to zero so we don't OR-in garbage below */
+ spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
+ }
if (dsa->stencil[0].enabled) {
/* XXX this may involve depth testing too */
@@ -779,22 +1324,22 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
}
- else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- /* XXX to do */
- ASSERT(0);
+ else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+ zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+ spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ }
+ else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- /* XXX to do */
- ASSERT(0);
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_S8_UNORM) {
- /* XXX to do */
- ASSERT(0);
+ ASSERT(0); /* XXX to do */
}
else {
- /* bad zs_format */
- ASSERT(0);
+ ASSERT(0); /* bad zs_format */
}
/* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
@@ -816,7 +1361,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
if (blend->blend_enable) {
- gen_blend(blend, f, color_format,
+ gen_blend(blend, blend_color, f, color_format,
fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
}
@@ -837,7 +1382,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
}
- if (blend->colormask != 0xf) {
+ if (blend->colormask != PIPE_MASK_RGBA) {
gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
}
@@ -866,5 +1411,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_release_register(f, fbRGBA_reg);
spe_release_register(f, fbZS_reg);
spe_release_register(f, quad_offset_reg);
-}
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ spe_comment(f, -4, "End per-fragment ops");
+ }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 475c6ef0ce..ea820aca74 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -72,7 +72,9 @@ cell_delete_blend_state(struct pipe_context *pipe, void *blend)
{
struct cell_blend_state *cb = (struct cell_blend_state *) blend;
+#if 0
spe_release_func(& cb->code);
+#endif
FREE(cb);
}
@@ -128,7 +130,9 @@ cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
struct cell_depth_stencil_alpha_state *cdsa =
(struct cell_depth_stencil_alpha_state *) depth;
+#if 0
spe_release_func(& cdsa->code);
+#endif
FREE(cdsa);
}
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 139b3719b6..47ba6fa290 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -58,9 +58,9 @@ cell_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
return CELL_MAX_SAMPLERS;
case PIPE_CAP_NPOT_TEXTURES:
- return 0;
+ return 1;
case PIPE_CAP_TWO_SIDED_STENCIL:
- return 0;
+ return 1;
case PIPE_CAP_GLSL:
return 1;
case PIPE_CAP_S3TC:
@@ -68,13 +68,13 @@ cell_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_ANISOTROPIC_FILTER:
return 0;
case PIPE_CAP_POINT_SPRITE:
- return 0;
+ return 1;
case PIPE_CAP_MAX_RENDER_TARGETS:
return 1;
case PIPE_CAP_OCCLUSION_QUERY:
- return 0;
+ return 1;
case PIPE_CAP_TEXTURE_SHADOW_MAP:
- return 0;
+ return 10;
case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
return 12; /* max 2Kx2K */
case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
@@ -82,7 +82,7 @@ cell_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
return 12; /* max 2Kx2K */
default:
- return 0;
+ return 10;
}
}
@@ -108,7 +108,7 @@ cell_get_paramf(struct pipe_screen *screen, int param)
return 16.0; /* arbitrary */
default:
- return 0;
+ return 10;
}
}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 2da3097983..8a389cd6aa 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -100,14 +100,19 @@ cell_emit_state(struct cell_context *cell)
= cell_batch_alloc(cell, sizeof(*fops));
struct spe_function spe_code;
+ /* Prepare the buffer that will hold the generated code. */
+ spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
/* generate new code */
cell_gen_fragment_function(cell, &spe_code);
+
/* put the new code into the batch buffer */
fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
memcpy(&fops->code, spe_code.store,
SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
fops->dsa = cell->depth_stencil->base;
fops->blend = cell->blend->base;
+
/* free codegen buffer */
spe_release_func(&spe_code);
}
diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore
new file mode 100644
index 0000000000..2be9a2d324
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/.gitignore
@@ -0,0 +1 @@
+g3d_spu
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 78260c4259..b4d30228f7 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -50,7 +50,31 @@ helpful headers:
/opt/cell/sdk/usr/include/libmisc.h
*/
+/* Set to 0 to disable all extraneous debugging code */
+#define DEBUG 1
+
+#if DEBUG
boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
+
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define DEBUG_PRINTF(format,...) \
+ if (Debug) \
+ printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#define D_PRINTF(flag, format,...) \
+ if (spu.init.debug_flags & (flag)) \
+ printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+
+#else
+
+#define DEBUG_PRINTF(...)
+#define D_PRINTF(...)
+
+#endif
struct spu_global spu;
@@ -133,9 +157,7 @@ really_clear_tiles(uint surfaceIndex)
static void
cmd_clear_surface(const struct cell_command_clear_surface *clear)
{
- if (Debug)
- printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
- clear->surface, clear->value);
+ DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
if (clear->surface == 0) {
spu.fb.color_clear_value = clear->value;
@@ -203,17 +225,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
#endif /* CLEAR_OPT */
- if (Debug)
- printf("SPU %u: CLEAR SURF done\n", spu.init.id);
+ DEBUG_PRINTF("CLEAR SURF done\n");
}
static void
cmd_release_verts(const struct cell_command_release_verts *release)
{
- if (Debug)
- printf("SPU %u: RELEASE VERTS %u\n",
- spu.init.id, release->vertex_buf);
+ DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
ASSERT(release->vertex_buf != ~0U);
release_buffer(release->vertex_buf);
}
@@ -228,16 +247,38 @@ cmd_release_verts(const struct cell_command_release_verts *release)
static void
cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
{
- if (Debug)
- printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
+ static int warned = 0;
+
+ DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
/* Copy SPU code from batch buffer to spu buffer */
memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
/* Copy state info (for fallback case only) */
memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
- /* Point function pointer at new code */
- spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+ /* Parity twist! For now, always use the fallback code by default,
+ * only switching to codegen when specifically requested. This
+ * allows us to develop freely without risking taking down the
+ * branch.
+ *
+ * Later, the parity of this check will be reversed, so that
+ * codegen is *always* used, unless we specifically indicate that
+ * we don't want it.
+ *
+ * Eventually, the option will be removed completely, because in
+ * final code we'll always use codegen and won't even provide the
+ * raw state records that the fallback code requires.
+ */
+ if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
+ spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+ }
+ else {
+ /* otherwise, the default fallback code remains in place */
+ if (!warned) {
+ fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+ warned = 1;
+ }
+ }
spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
@@ -247,8 +288,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
static void
cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
{
- if (Debug)
- printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
+ DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
/* Copy SPU code from batch buffer to spu buffer */
memcpy(spu.fragment_program_code, fp->code,
SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
@@ -262,9 +302,7 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
static void
cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
{
- if (Debug)
- printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n",
- spu.init.id,
+ DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n",
cmd->width,
cmd->height,
cmd->color_start,
@@ -309,9 +347,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
static void
cmd_state_sampler(const struct cell_command_sampler *sampler)
{
- if (Debug)
- printf("SPU %u: SAMPLER [%u]\n",
- spu.init.id, sampler->unit);
+ DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
spu.sampler[sampler->unit] = sampler->state;
if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
@@ -328,11 +364,9 @@ cmd_state_texture(const struct cell_command_texture *texture)
const uint width = texture->width;
const uint height = texture->height;
- if (Debug) {
- printf("SPU %u: TEXTURE [%u] at %p size %u x %u\n", spu.init.id,
+ DEBUG_PRINTF("TEXTURE [%u] at %p size %u x %u\n",
texture->unit, texture->start,
texture->width, texture->height);
- }
spu.texture[unit].start = texture->start;
spu.texture[unit].width = width;
@@ -351,10 +385,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
static void
cmd_state_vertex_info(const struct vertex_info *vinfo)
{
- if (Debug) {
- printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
- vinfo->num_attribs);
- }
+ DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
ASSERT(vinfo->num_attribs >= 1);
ASSERT(vinfo->num_attribs <= 8);
memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
@@ -393,8 +424,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
static void
cmd_finish(void)
{
- if (Debug)
- printf("SPU %u: FINISH\n", spu.init.id);
+ DEBUG_PRINTF("FINISH\n");
really_clear_tiles(0);
/* wait for all outstanding DMAs to finish */
mfc_write_tag_mask(~0);
@@ -419,9 +449,8 @@ cmd_batch(uint opcode)
const unsigned usize = size / sizeof(buffer[0]);
uint pos;
- if (Debug)
- printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
- spu.init.id, buf, size, spu.init.buffers[buf]);
+ DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+ buf, size, spu.init.buffers[buf]);
ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
@@ -440,8 +469,7 @@ cmd_batch(uint opcode)
wait_on_mask(1 << TAG_BATCH_BUFFER);
/* Tell PPU we're done copying the buffer to local store */
- if (Debug)
- printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
+ DEBUG_PRINTF("release batch buf %u\n", buf);
release_buffer(buf);
/*
@@ -571,8 +599,7 @@ cmd_batch(uint opcode)
}
}
- if (Debug)
- printf("SPU %u: BATCH complete\n", spu.init.id);
+ DEBUG_PRINTF("BATCH complete\n");
}
@@ -585,8 +612,7 @@ main_loop(void)
struct cell_command cmd;
int exitFlag = 0;
- if (Debug)
- printf("SPU %u: Enter main loop\n", spu.init.id);
+ DEBUG_PRINTF("Enter main loop\n");
ASSERT((sizeof(struct cell_command) & 0xf) == 0);
ASSERT_ALIGN16(&cmd);
@@ -595,14 +621,12 @@ main_loop(void)
unsigned opcode;
int tag = 0;
- if (Debug)
- printf("SPU %u: Wait for cmd...\n", spu.init.id);
+ DEBUG_PRINTF("Wait for cmd...\n");
/* read/wait from mailbox */
opcode = (unsigned int) spu_read_in_mbox();
- if (Debug)
- printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode);
+ DEBUG_PRINTF("got cmd 0x%x\n", opcode);
/* command payload */
mfc_get(&cmd, /* dest */
@@ -619,8 +643,7 @@ main_loop(void)
switch (opcode & CELL_CMD_OPCODE_MASK) {
case CELL_CMD_EXIT:
- if (Debug)
- printf("SPU %u: EXIT\n", spu.init.id);
+ DEBUG_PRINTF("EXIT\n");
exitFlag = 1;
break;
case CELL_CMD_VS_EXECUTE:
@@ -632,13 +655,12 @@ main_loop(void)
cmd_batch(opcode);
break;
default:
- printf("Bad opcode!\n");
+ printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
}
}
- if (Debug)
- printf("SPU %u: Exit main loop\n", spu.init.id);
+ DEBUG_PRINTF("Exit main loop\n");
spu_dcache_report();
}
@@ -653,7 +675,8 @@ one_time_init(void)
invalidate_tex_cache();
/* Install default/fallback fragment processing function.
- * This will normally be overriden by a code-gen'd function.
+ * This will normally be overriden by a code-gen'd function
+ * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
*/
spu.fragment_ops = spu_fallback_fragment_ops;
}
@@ -682,11 +705,13 @@ main(main_param_t speid, main_param_t argp)
ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+ ASSERT(((unsigned long) &spu.fragment_ops_code) % 32 == 0);
+ ASSERT(((unsigned long) &spu.fragment_program_code) % 32 == 0);
one_time_init();
- if (Debug)
- printf("SPU: main() speid=%lu\n", (unsigned long) speid);
+ DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+ D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
mfc_get(&spu.init, /* dest */
(unsigned int) argp, /* src */
@@ -698,7 +723,7 @@ main(main_param_t speid, main_param_t argp)
#if 0
if (spu.init.id==0)
- spu_test_misc();
+ spu_test_misc(spu.init.id);
#endif
main_loop();
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2c7b625840..72e540fcff 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -143,13 +143,13 @@ struct spu_global
ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
- /** Current fragment ops machine code */
- uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+ /** Current fragment ops machine code, at 32-byte boundary */
+ uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN32_ATTRIB;
/** Current fragment ops function */
spu_fragment_ops_func fragment_ops;
- /** Current fragment program machine code */
- uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+ /** Current fragment program machine code, at 32-byte boundary */
+ uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN32_ATTRIB;
/** Current fragment ops function */
spu_fragment_program_func fragment_program;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 03dd547845..f107764fb2 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -60,9 +60,12 @@ spu_fallback_fragment_ops(uint x, uint y,
vector unsigned int mask)
{
vector float frag_aos[4];
- unsigned int c0, c1, c2, c3;
+ unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
+ unsigned int fragc0, fragc1, fragc2, fragc3; /* fragment colors */
- /* do alpha test */
+ /*
+ * Do alpha test
+ */
if (spu.depth_stencil_alpha.alpha.enabled) {
vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
vector unsigned int amask;
@@ -102,7 +105,10 @@ spu_fallback_fragment_ops(uint x, uint y,
mask = spu_and(mask, amask);
}
- /* Z and/or stencil testing... */
+
+ /*
+ * Z and/or stencil testing...
+ */
if (spu.depth_stencil_alpha.depth.enabled ||
spu.depth_stencil_alpha.stencil[0].enabled) {
@@ -178,6 +184,32 @@ spu_fallback_fragment_ops(uint x, uint y,
}
}
+
+ /*
+ * If we'll need the current framebuffer/tile colors for blending
+ * or logicop or colormask, fetch them now.
+ */
+ if (spu.blend.blend_enable ||
+ spu.blend.logicop_enable ||
+ spu.blend.colormask != 0xf) {
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+ fbc0 = colorTile->ui[y][x*2+0];
+ fbc1 = colorTile->ui[y][x*2+1];
+ fbc2 = colorTile->ui[y][x*2+2];
+ fbc3 = colorTile->ui[y][x*2+3];
+#else
+ fbc0 = colorTile->ui[y+0][x+0];
+ fbc1 = colorTile->ui[y+0][x+1];
+ fbc2 = colorTile->ui[y+1][x+0];
+ fbc3 = colorTile->ui[y+1][x+1];
+#endif
+ }
+
+
+ /*
+ * Do blending
+ */
if (spu.blend.blend_enable) {
/* blending terms, misc regs */
vector float term1r, term1g, term1b, term1a;
@@ -186,39 +218,26 @@ spu_fallback_fragment_ops(uint x, uint y,
vector float fbRGBA[4]; /* current framebuffer colors */
- /* get colors from framebuffer/tile */
+ /* convert framebuffer colors from packed int to vector float */
{
- vector float fc[4];
- uint c0, c1, c2, c3;
-
-#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
- c0 = colorTile->ui[y][x*2+0];
- c1 = colorTile->ui[y][x*2+1];
- c2 = colorTile->ui[y][x*2+2];
- c3 = colorTile->ui[y][x*2+3];
-#else
- c0 = colorTile->ui[y+0][x+0];
- c1 = colorTile->ui[y+0][x+1];
- c2 = colorTile->ui[y+1][x+0];
- c3 = colorTile->ui[y+1][x+1];
-#endif
+ vector float temp[4]; /* float colors in AOS form */
switch (spu.fb.color_format) {
case PIPE_FORMAT_B8G8R8A8_UNORM:
- fc[0] = spu_unpack_B8G8R8A8(c0);
- fc[1] = spu_unpack_B8G8R8A8(c1);
- fc[2] = spu_unpack_B8G8R8A8(c2);
- fc[3] = spu_unpack_B8G8R8A8(c3);
+ temp[0] = spu_unpack_B8G8R8A8(fbc0);
+ temp[1] = spu_unpack_B8G8R8A8(fbc1);
+ temp[2] = spu_unpack_B8G8R8A8(fbc2);
+ temp[3] = spu_unpack_B8G8R8A8(fbc3);
break;
case PIPE_FORMAT_A8R8G8B8_UNORM:
- fc[0] = spu_unpack_A8R8G8B8(c0);
- fc[1] = spu_unpack_A8R8G8B8(c1);
- fc[2] = spu_unpack_A8R8G8B8(c2);
- fc[3] = spu_unpack_A8R8G8B8(c3);
+ temp[0] = spu_unpack_A8R8G8B8(fbc0);
+ temp[1] = spu_unpack_A8R8G8B8(fbc1);
+ temp[2] = spu_unpack_A8R8G8B8(fbc2);
+ temp[3] = spu_unpack_A8R8G8B8(fbc3);
break;
default:
ASSERT(0);
}
- _transpose_matrix4x4(fbRGBA, fc);
+ _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
}
/*
@@ -384,21 +403,20 @@ spu_fallback_fragment_ops(uint x, uint y,
#endif
/*
- * Pack float colors into 32-bit RGBA words.
+ * Pack fragment float colors into 32-bit RGBA words.
*/
switch (spu.fb.color_format) {
case PIPE_FORMAT_A8R8G8B8_UNORM:
- c0 = spu_pack_A8R8G8B8(frag_aos[0]);
- c1 = spu_pack_A8R8G8B8(frag_aos[1]);
- c2 = spu_pack_A8R8G8B8(frag_aos[2]);
- c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+ fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
+ fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
+ fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
+ fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
break;
-
case PIPE_FORMAT_B8G8R8A8_UNORM:
- c0 = spu_pack_B8G8R8A8(frag_aos[0]);
- c1 = spu_pack_B8G8R8A8(frag_aos[1]);
- c2 = spu_pack_B8G8R8A8(frag_aos[2]);
- c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+ fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
+ fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
+ fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
+ fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
break;
default:
fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
@@ -407,20 +425,57 @@ spu_fallback_fragment_ops(uint x, uint y,
/*
- * Color masking
+ * Do color masking
*/
if (spu.blend.colormask != 0xf) {
- /* XXX to do */
- /* apply color mask to 32-bit packed colors */
+ uint cmask = 0x0; /* each byte corresponds to a color channel */
+
+ /* Form bitmask depending on color buffer format and colormask bits */
+ switch (spu.fb.color_format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ if (spu.blend.colormask & (1<<0))
+ cmask |= 0x00ff0000; /* red */
+ if (spu.blend.colormask & (1<<1))
+ cmask |= 0x0000ff00; /* green */
+ if (spu.blend.colormask & (1<<2))
+ cmask |= 0x000000ff; /* blue */
+ if (spu.blend.colormask & (1<<3))
+ cmask |= 0xff000000; /* alpha */
+ break;
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ if (spu.blend.colormask & (1<<0))
+ cmask |= 0x0000ff00; /* red */
+ if (spu.blend.colormask & (1<<1))
+ cmask |= 0x00ff0000; /* green */
+ if (spu.blend.colormask & (1<<2))
+ cmask |= 0xff000000; /* blue */
+ if (spu.blend.colormask & (1<<3))
+ cmask |= 0x000000ff; /* alpha */
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ /*
+ * Apply color mask to the 32-bit packed colors.
+ * if (cmask[i])
+ * frag color[i] = frag color[i];
+ * else
+ * frag color[i] = framebuffer color[i];
+ */
+ fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
+ fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
+ fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
+ fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
}
/*
- * Logic Ops
+ * Do logic ops
*/
if (spu.blend.logicop_enable) {
/* XXX to do */
- /* apply logicop to 32-bit packed colors */
+ /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
}
@@ -431,45 +486,46 @@ spu_fallback_fragment_ops(uint x, uint y,
spu.cur_ctile_status = TILE_STATUS_DIRTY;
}
else {
+ /* write no fragments */
return;
}
/*
- * Write new quad colors to the framebuffer/tile.
+ * Write new fragment/quad colors to the framebuffer/tile.
* Only write pixels where the corresponding mask word is set.
*/
#if LINEAR_QUAD_LAYOUT
/*
* Quad layout:
* +--+--+--+--+
- * |p0|p1|p2|p3|
+ * |p0|p1|p2|p3|...
* +--+--+--+--+
*/
if (spu_extract(mask, 0))
- colorTile->ui[y][x*2] = c0;
+ colorTile->ui[y][x*2] = fragc0;
if (spu_extract(mask, 1))
- colorTile->ui[y][x*2+1] = c1;
+ colorTile->ui[y][x*2+1] = fragc1;
if (spu_extract(mask, 2))
- colorTile->ui[y][x*2+2] = c2;
+ colorTile->ui[y][x*2+2] = fragc2;
if (spu_extract(mask, 3))
- colorTile->ui[y][x*2+3] = c3;
+ colorTile->ui[y][x*2+3] = fragc3;
#else
/*
* Quad layout:
* +--+--+
- * |p0|p1|
+ * |p0|p1|...
* +--+--+
- * |p2|p3|
+ * |p2|p3|...
* +--+--+
*/
if (spu_extract(mask, 0))
- colorTile->ui[y+0][x+0] = c0;
+ colorTile->ui[y+0][x+0] = fragc0;
if (spu_extract(mask, 1))
- colorTile->ui[y+0][x+1] = c1;
+ colorTile->ui[y+0][x+1] = fragc1;
if (spu_extract(mask, 2))
- colorTile->ui[y+1][x+0] = c2;
+ colorTile->ui[y+1][x+0] = fragc2;
if (spu_extract(mask, 3))
- colorTile->ui[y+1][x+1] = c3;
+ colorTile->ui[y+1][x+1] = fragc3;
#endif
}
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 8b93878192..0a8fb56a62 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -241,6 +241,19 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
}
+/**
+ * As above, but return 4 vectors in SOA format.
+ * XXX this will all be re-written someday.
+ */
+static INLINE void
+eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+{
+ eval_coeff(slot, x, y, result);
+ _transpose_matrix4x4(result, result);
+}
+
+
+
static INLINE vector float
eval_z(float x, float y)
{
@@ -267,14 +280,17 @@ emit_quad( int x, int y, mask_t mask )
if (spu_extract(spu_orx(mask), 0)) {
const int ix = x - setup.cliprect_minx;
const int iy = y - setup.cliprect_miny;
- vector float colors[4];
spu.cur_ctile_status = TILE_STATUS_DIRTY;
spu.cur_ztile_status = TILE_STATUS_DIRTY;
if (spu.texture[0].start) {
- /* texture mapping */
+ /*
+ * Temporary texture mapping path
+ * This will go away when fragment programs support TEX inst.
+ */
const uint unit = 0;
+ vector float colors[4];
vector float texcoords[4];
eval_coeff(2, (float) x, (float) y, texcoords);
@@ -311,70 +327,60 @@ emit_quad( int x, int y, mask_t mask )
colors[3] = spu_mul(colors[3], colors1[3]);
}
+ {
+ /* Convert fragment data from AoS to SoA format.
+ * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+ * This is temporary!
+ */
+ vector float soa_frag[4];
+ _transpose_matrix4x4(soa_frag, colors);
+
+ vector float fragZ = eval_z((float) x, (float) y);
+
+ /* Do all per-fragment/quad operations here, including:
+ * alpha test, z test, stencil test, blend and framebuffer writing.
+ */
+ spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+ fragZ,
+ soa_frag[0], soa_frag[1],
+ soa_frag[2], soa_frag[3],
+ mask);
+ }
+
}
else {
- /* simple shading */
-#if 0
- eval_coeff(1, (float) x, (float) y, colors);
+ /*
+ * Run fragment shader, execute per-fragment ops, update fb/tile.
+ */
+ vector float inputs[4*4], outputs[2*4];
+ vector float fragZ = eval_z((float) x, (float) y);
+ /* setup inputs */
+#if 0
+ eval_coeff_soa(1, (float) x, (float) y, inputs);
#else
- /* XXX new fragment program code */
-
- if (spu.fragment_program) {
- vector float inputs[4*4], outputs[2*4];
-
- /* setup inputs */
- eval_coeff(1, (float) x, (float) y, inputs);
-
- /* Execute the current fragment program */
- spu.fragment_program(inputs, outputs, spu.constants);
-
- /* Copy outputs */
- colors[0] = outputs[0*4+0];
- colors[1] = outputs[0*4+1];
- colors[2] = outputs[0*4+2];
- colors[3] = outputs[0*4+3];
-
- if (0 && spu.init.id==0 && y == 48) {
- printf("colors[0] = %f %f %f %f\n",
- spu_extract(colors[0], 0),
- spu_extract(colors[0], 1),
- spu_extract(colors[0], 2),
- spu_extract(colors[0], 3));
- printf("colors[1] = %f %f %f %f\n",
- spu_extract(colors[1], 0),
- spu_extract(colors[1], 1),
- spu_extract(colors[1], 2),
- spu_extract(colors[1], 3));
- }
-
+ uint i;
+ for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+ eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
}
#endif
- }
+ ASSERT(spu.fragment_program);
+ ASSERT(spu.fragment_ops);
+ /* Execute the current fragment program */
+ spu.fragment_program(inputs, outputs, spu.constants);
- {
- /* Convert fragment data from AoS to SoA format.
- * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
- * This is temporary!
- */
- vector float soa_frag[4];
- _transpose_matrix4x4(soa_frag, colors);
-
- float4 fragZ;
-
- fragZ.v = eval_z((float) x, (float) y);
-
- /* Do all per-fragment/quad operations here, including:
- * alpha test, z test, stencil test, blend and framebuffer writing.
+ /* Execute per-fragment/quad operations, including:
+ * alpha test, z test, stencil test, blend and framebuffer writing.
*/
spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
- fragZ.v,
- soa_frag[0], soa_frag[1],
- soa_frag[2], soa_frag[3],
+ fragZ,
+ outputs[0*4+0],
+ outputs[0*4+1],
+ outputs[0*4+2],
+ outputs[0*4+3],
mask);
}
-
}
}
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 701ee4c72f..4fea71c314 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -39,11 +39,20 @@
#include "tgsi/tgsi_exec.h"
#include "tgsi/tgsi_parse.h"
-struct sp_exec_fragment_shader {
+struct sp_exec_fragment_shader
+{
struct sp_fragment_shader base;
+ const struct tgsi_token *machine_tokens;
};
+/** cast wrapper */
+static INLINE struct sp_exec_fragment_shader *
+sp_exec_fragment_shader(const struct sp_fragment_shader *base)
+{
+ return (struct sp_exec_fragment_shader *) base;
+}
+
/**
* Compute quad X,Y,Z,W for the four fragments in a quad.
@@ -86,10 +95,20 @@ exec_prepare( const struct sp_fragment_shader *base,
struct tgsi_exec_machine *machine,
struct tgsi_sampler *samplers )
{
- tgsi_exec_machine_bind_shader( machine,
- base->shader.tokens,
- PIPE_MAX_SAMPLERS,
- samplers );
+ struct sp_exec_fragment_shader *spefs =
+ sp_exec_fragment_shader(base);
+
+ /*
+ * Bind tokens/shader to the interpreter's machine state.
+ * Avoid redundant binding.
+ */
+ if (spefs->machine_tokens != base->shader.tokens) {
+ tgsi_exec_machine_bind_shader( machine,
+ base->shader.tokens,
+ PIPE_MAX_SAMPLERS,
+ samplers );
+ spefs->machine_tokens = base->shader.tokens;
+ }
}