summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/cell
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/cell')
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.c2
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fp.c1327
-rw-r--r--src/gallium/drivers/cell/ppu/cell_screen.c8
-rw-r--r--src/gallium/drivers/cell/spu/spu_shuffle.h186
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.c209
5 files changed, 987 insertions, 745 deletions
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 22d552d8e3..8f502823f9 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -162,7 +162,7 @@ cell_create_context(struct pipe_screen *screen,
*/
/* This call only works with SDK 3.0. Anyone still using 2.1??? */
cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1);
- cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0);
+ cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
if (cell->debug_flags) {
printf("Cell: found %d Cell(s) with %u SPUs\n",
cell->num_cells, cell->num_spus);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 96a1743fc1..8f3deb482e 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -2,6 +2,7 @@
*
* Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
+ * Copyright 2009 VMware, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
@@ -75,13 +76,25 @@ struct codegen
int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */
+ int addr_reg; /**< address register, integer values */
+
/** Per-instruction temps / intermediate temps */
int num_itemps;
int itemps[12];
/** Current IF/ELSE/ENDIF nesting level */
int if_nesting;
- /** Index of execution mask register */
+ /** Current BGNLOOP/ENDLOOP nesting level */
+ int loop_nesting;
+ /** Location of start of current loop */
+ int loop_start;
+
+ /** Index of if/conditional mask register */
+ int cond_mask_reg;
+ /** Index of loop mask register */
+ int loop_mask_reg;
+
+ /** Index of master execution mask register */
int exec_mask_reg;
/** KIL mask: indicates which fragments have been killed */
@@ -145,10 +158,33 @@ get_const_one_reg(struct codegen *gen)
/**
- * Return index of the pixel execution mask.
+ * Return index of the address register.
+ * Used for indirect register loads/stores.
+ */
+static int
+get_address_reg(struct codegen *gen)
+{
+ if (gen->addr_reg <= 0) {
+ gen->addr_reg = spe_allocate_available_register(gen->f);
+
+ spe_indent(gen->f, 4);
+ spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
+
+ /* init addr = {0, 0, 0, 0} */
+ spe_zero(gen->f, gen->addr_reg);
+
+ spe_indent(gen->f, -4);
+ }
+
+ return gen->addr_reg;
+}
+
+
+/**
+ * Return index of the master execution mask.
* The register is allocated an initialized upon the first call.
*
- * The pixel execution mask controls which pixels in a quad are
+ * The master execution mask controls which pixels in a quad are
* modified, according to surrounding conditionals, loops, etc.
*/
static int
@@ -157,19 +193,40 @@ get_exec_mask_reg(struct codegen *gen)
if (gen->exec_mask_reg <= 0) {
gen->exec_mask_reg = spe_allocate_available_register(gen->f);
- spe_indent(gen->f, 4);
- spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
-
- /* exec_mask = {~0, ~0, ~0, ~0} */
+ /* XXX this may not be needed */
+ spe_comment(gen->f, 0*-4, "initialize master execution mask = ~0");
spe_load_int(gen->f, gen->exec_mask_reg, ~0);
-
- spe_indent(gen->f, -4);
}
return gen->exec_mask_reg;
}
+/** Return index of the conditional (if/else) execution mask register */
+static int
+get_cond_mask_reg(struct codegen *gen)
+{
+ if (gen->cond_mask_reg <= 0) {
+ gen->cond_mask_reg = spe_allocate_available_register(gen->f);
+ }
+
+ return gen->cond_mask_reg;
+}
+
+
+/** Return index of the loop execution mask register */
+static int
+get_loop_mask_reg(struct codegen *gen)
+{
+ if (gen->loop_mask_reg <= 0) {
+ gen->loop_mask_reg = spe_allocate_available_register(gen->f);
+ }
+
+ return gen->loop_mask_reg;
+}
+
+
+
static boolean
is_register_src(struct codegen *gen, int channel,
const struct tgsi_full_src_register *src)
@@ -231,16 +288,22 @@ get_src_reg(struct codegen *gen,
spe_xor(gen->f, reg, reg, reg);
}
else {
+ int index = src->SrcRegister.Index;
+
assert(swizzle < 4);
+ if (src->SrcRegister.Indirect) {
+ /* XXX unfinished */
+ }
+
switch (src->SrcRegister.File) {
case TGSI_FILE_TEMPORARY:
- reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
+ reg = gen->temp_regs[index][swizzle];
break;
case TGSI_FILE_INPUT:
{
/* offset is measured in quadwords, not bytes */
- int offset = src->SrcRegister.Index * 4 + swizzle;
+ int offset = index * 4 + swizzle;
reg = get_itemp(gen);
reg_is_itemp = TRUE;
/* Load: reg = memory[(machine_reg) + offset] */
@@ -248,12 +311,12 @@ get_src_reg(struct codegen *gen,
}
break;
case TGSI_FILE_IMMEDIATE:
- reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
+ reg = gen->imm_regs[index][swizzle];
break;
case TGSI_FILE_CONSTANT:
{
/* offset is measured in quadwords, not bytes */
- int offset = src->SrcRegister.Index * 4 + swizzle;
+ int offset = index * 4 + swizzle;
reg = get_itemp(gen);
reg_is_itemp = TRUE;
/* Load: reg = memory[(machine_reg) + offset] */
@@ -322,7 +385,7 @@ get_dst_reg(struct codegen *gen,
switch (dest->DstRegister.File) {
case TGSI_FILE_TEMPORARY:
- if (gen->if_nesting > 0)
+ if (gen->if_nesting > 0 || gen->loop_nesting > 0)
reg = get_itemp(gen);
else
reg = gen->temp_regs[dest->DstRegister.Index][channel];
@@ -367,7 +430,7 @@ store_dest_reg(struct codegen *gen,
switch (dest->DstRegister.File) {
case TGSI_FILE_TEMPORARY:
- if (gen->if_nesting > 0) {
+ if (gen->if_nesting > 0 || gen->loop_nesting > 0) {
int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
int exec_reg = get_exec_mask_reg(gen);
/* Mix d with new value according to exec mask:
@@ -384,7 +447,7 @@ store_dest_reg(struct codegen *gen,
{
/* offset is measured in quadwords, not bytes */
int offset = dest->DstRegister.Index * 4 + channel;
- if (gen->if_nesting > 0) {
+ if (gen->if_nesting > 0 || gen->loop_nesting > 0) {
int exec_reg = get_exec_mask_reg(gen);
int curval_reg = get_itemp(gen);
/* First read the current value from memory:
@@ -488,96 +551,118 @@ emit_epilogue(struct codegen *gen)
}
+#define FOR_EACH_ENABLED_CHANNEL(inst, ch) \
+ for (ch = 0; ch < 4; ch++) \
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch))
+
+
+static boolean
+emit_ARL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch = 0, src_reg, addr_reg;
+
+ spe_comment(gen->f, -4, "ARL:");
+
+ src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ addr_reg = get_address_reg(gen);
+
+ /* convert float to int */
+ spe_cflts(gen->f, addr_reg, src_reg, 0);
+
+ free_itemps(gen);
+
+ return TRUE;
+}
+
+
static boolean
emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch, src_reg[4], dst_reg[4];
spe_comment(gen->f, -4, "MOV:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- }
+
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) &&
- is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) {
- /* special-case: register to memory store */
- store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]);
- }
- else {
- spe_move(gen->f, dst_reg[ch], src_reg[ch]);
- store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
- }
- free_itemps(gen);
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) &&
+ is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) {
+ /* special-case: register to memory store */
+ store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]);
+ }
+ else {
+ spe_move(gen->f, dst_reg[ch], src_reg[ch]);
+ store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
}
}
- return true;
+
+ free_itemps(gen);
+
+ return TRUE;
}
/**
- * Emit addition instructions. Recall that a single TGSI_OPCODE_ADD
- * becomes (up to) four SPU "fa" instructions because we're doing SOA
- * processing.
+ * Emit binary operation
*/
static boolean
-emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_binop(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch, s1_reg[4], s2_reg[4], d_reg[4];
- spe_comment(gen->f, -4, "ADD:");
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_ADD:
+ spe_comment(gen->f, -4, "ADD:");
+ break;
+ case TGSI_OPCODE_SUB:
+ spe_comment(gen->f, -4, "SUB:");
+ break;
+ case TGSI_OPCODE_MUL:
+ spe_comment(gen->f, -4, "MUL:");
+ break;
+ default:
+ assert(0);
+ }
+
/* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
- for (ch = 0; ch < 4; ch++) {
- /* If the dest R, G, B or A writemask is enabled... */
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
}
- /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- /* Emit actual SPE instruction: d = s1 + s2 */
+
+ /* Loop over Red/Green/Blue/Alpha channels, do the op, store results */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ /* Emit actual SPE instruction: d = s1 + s2 */
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_ADD:
spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
- /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
- /* Free any intermediate temps we allocated */
- free_itemps(gen);
+ break;
+ case TGSI_OPCODE_SUB:
+ spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ break;
+ case TGSI_OPCODE_MUL:
+ spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ break;
+ default:
+ ;
}
}
- return true;
-}
-/**
- * Emit subtract. See emit_ADD for comments.
- */
-static boolean
-emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
-{
- int ch, s1_reg[4], s2_reg[4], d_reg[4];
- spe_comment(gen->f, -4, "SUB:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- }
+ /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- /* d = s1 - s2 */
- spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
- }
- return true;
+
+ /* Free any intermediate temps we allocated */
+ free_itemps(gen);
+
+ return TRUE;
}
+
/**
* Emit multiply add. See emit_ADD for comments.
*/
@@ -586,23 +671,20 @@ emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4];
spe_comment(gen->f, -4, "MAD:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- /* d = s1 * s2 + s3 */
- spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
- return true;
+ free_itemps(gen);
+ return TRUE;
}
@@ -615,132 +697,108 @@ emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4];
spe_comment(gen->f, -4, "LERP:");
/* setup/get src/dst/temp regs */
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- tmp_reg[ch] = get_itemp(gen);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ tmp_reg[ch] = get_itemp(gen);
}
/* d = s3 + s1(s2 - s3) */
/* do all subtracts, then all fma, then all stores to better pipeline */
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]);
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]);
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
free_itemps(gen);
- return true;
+ return TRUE;
}
+
+
/**
- * Emit multiply. See emit_ADD for comments.
+ * Emit reciprocal or recip sqrt.
*/
static boolean
-emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_RCP_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch, s1_reg[4], s2_reg[4], d_reg[4];
- spe_comment(gen->f, -4, "MUL:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- }
+ int ch, s1_reg[4], d_reg[4], tmp_reg[4];
+
+ if (inst->Instruction.Opcode == TGSI_OPCODE_RCP) {
+ spe_comment(gen->f, -4, "RCP:");
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- /* d = s1 * s2 */
- spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ else {
+ assert(inst->Instruction.Opcode == TGSI_OPCODE_RSQ);
+ spe_comment(gen->f, -4, "RSQ:");
}
- return true;
-}
-/**
- * Emit reciprocal. See emit_ADD for comments.
- */
-static boolean
-emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst)
-{
- int ch;
- spe_comment(gen->f, -4, "RCP:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* d = 1/s1 */
- spe_frest(gen->f, d_reg, s1_reg);
- spe_fi(gen->f, d_reg, s1_reg, d_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ tmp_reg[ch] = get_itemp(gen);
}
- return true;
-}
-/**
- * Emit reciprocal sqrt. See emit_ADD for comments.
- */
-static boolean
-emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
-{
- int ch;
- spe_comment(gen->f, -4, "RSQ:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* d = 1/s1 */
- spe_frsqest(gen->f, d_reg, s1_reg);
- spe_fi(gen->f, d_reg, s1_reg, d_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ if (inst->Instruction.Opcode == TGSI_OPCODE_RCP) {
+ /* tmp = 1/s1 */
+ spe_frest(gen->f, tmp_reg[ch], s1_reg[ch]);
+ }
+ else {
+ /* tmp = 1/sqrt(s1) */
+ spe_frsqest(gen->f, tmp_reg[ch], s1_reg[ch]);
}
}
- return true;
+
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ /* d = float_interp(s1, tmp) */
+ spe_fi(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]);
+ }
+
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ }
+
+ free_itemps(gen);
+ return TRUE;
}
+
/**
* Emit absolute value. See emit_ADD for comments.
*/
static boolean
emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s1_reg[4], d_reg[4];
+ const int bit31mask_reg = get_itemp(gen);
+
spe_comment(gen->f, -4, "ABS:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- const int bit31mask_reg = get_itemp(gen);
- /* mask with bit 31 set, the rest cleared */
- spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
+ /* mask with bit 31 set, the rest cleared */
+ spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
- /* d = sign bit cleared in s1 */
- spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ /* d = sign bit cleared in s1 */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_andc(gen->f, d_reg[ch], s1_reg[ch], bit31mask_reg);
}
- return true;
+
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ }
+
+ free_itemps(gen);
+ return TRUE;
}
/**
@@ -775,16 +833,14 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* t0 = t0 + t1 */
spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- spe_move(gen->f, d_reg, t0_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, d_reg, t0_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
}
free_itemps(gen);
- return true;
+ return TRUE;
}
/**
@@ -824,16 +880,14 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* t0 = t0 + t1 */
spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- spe_move(gen->f, d_reg, t0_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, d_reg, t0_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
}
free_itemps(gen);
- return true;
+ return TRUE;
}
/**
@@ -867,16 +921,14 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* t = w1 + t */
spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg);
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- spe_move(gen->f, d_reg, tmp_reg);
- store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, d_reg, tmp_reg);
+ store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
}
free_itemps(gen);
- return true;
+ return TRUE;
}
/**
@@ -911,17 +963,15 @@ emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_frsqest(gen->f, t1_reg, t0_reg);
spe_fi(gen->f, t1_reg, t0_reg, t1_reg);
- for (ch = 0; ch < 3; ch++) { /* NOTE: omit W channel */
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* dst = src[ch] * t1 */
- spe_fm(gen->f, d_reg, src_reg[ch], t1_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ /* dst = src[ch] * t1 */
+ spe_fm(gen->f, d_reg, src_reg[ch], t1_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
}
free_itemps(gen);
- return true;
+ return TRUE;
}
@@ -978,201 +1028,101 @@ emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
free_itemps(gen);
- return true;
+ return TRUE;
}
+
/**
- * Emit set-if-greater-than.
+ * Emit inequality instruction.
* Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
* the result but OpenGL/TGSI needs 0.0 and 1.0 results.
* We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND.
*/
static boolean
-emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_inequality(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s1_reg[4], s2_reg[4], d_reg[4], one_reg;
+ bool complement = FALSE;
- spe_comment(gen->f, -4, "SGT:");
-
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
- /* d = (s1 > s2) */
- spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-
- /* convert d from 0x0/0xffffffff to 0.0/1.0 */
- /* d = d & one_reg */
- spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_SGT:
+ spe_comment(gen->f, -4, "SGT:");
+ break;
+ case TGSI_OPCODE_SLT:
+ spe_comment(gen->f, -4, "SLT:");
+ break;
+ case TGSI_OPCODE_SGE:
+ spe_comment(gen->f, -4, "SGE:");
+ complement = TRUE;
+ break;
+ case TGSI_OPCODE_SLE:
+ spe_comment(gen->f, -4, "SLE:");
+ complement = TRUE;
+ break;
+ case TGSI_OPCODE_SEQ:
+ spe_comment(gen->f, -4, "SEQ:");
+ break;
+ case TGSI_OPCODE_SNE:
+ spe_comment(gen->f, -4, "SNE:");
+ complement = TRUE;
+ break;
+ default:
+ ;
}
- return true;
-}
+ one_reg = get_const_one_reg(gen);
-/**
- * Emit set-if_less-then. See emit_SGT for comments.
- */
-static boolean
-emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
-{
- int ch;
-
- spe_comment(gen->f, -4, "SLT:");
-
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
- /* d = (s1 < s2) */
- spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
-
- /* convert d from 0x0/0xffffffff to 0.0/1.0 */
- /* d = d & one_reg */
- spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
}
- return true;
-}
-
-/**
- * Emit set-if_greater-then-or-equal. See emit_SGT for comments.
- */
-static boolean
-emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
-{
- int ch;
-
- spe_comment(gen->f, -4, "SGE:");
-
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
- /* d = (s1 >= s2) */
- spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
-
- /* convert d from 0x0/0xffffffff to 0.0/1.0 */
- /* d = ~d & one_reg */
- spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
-
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_SGT:
+ spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ break;
+ case TGSI_OPCODE_SLT:
+ spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]);
+ break;
+ case TGSI_OPCODE_SGE:
+ spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]);
+ break;
+ case TGSI_OPCODE_SLE:
+ spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ break;
+ case TGSI_OPCODE_SEQ:
+ spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ break;
+ case TGSI_OPCODE_SNE:
+ spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ break;
+ default:
+ assert(0);
}
}
- return true;
-}
-
-/**
- * Emit set-if_less-then-or-equal. See emit_SGT for comments.
- */
-static boolean
-emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
-{
- int ch;
-
- spe_comment(gen->f, -4, "SLE:");
-
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
- /* d = (s1 <= s2) */
- spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-
- /* convert d from 0x0/0xffffffff to 0.0/1.0 */
- /* d = ~d & one_reg */
- spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
-
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ /* d = d & one_reg */
+ if (complement)
+ spe_andc(gen->f, d_reg[ch], one_reg, d_reg[ch]);
+ else
+ spe_and(gen->f, d_reg[ch], one_reg, d_reg[ch]);
}
- return true;
-}
-
-/**
- * Emit set-if_equal. See emit_SGT for comments.
- */
-static boolean
-emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
-{
- int ch;
-
- spe_comment(gen->f, -4, "SEQ:");
-
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
- /* d = (s1 == s2) */
- spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
-
- /* convert d from 0x0/0xffffffff to 0.0/1.0 */
- /* d = d & one_reg */
- spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
- return true;
+ free_itemps(gen);
+ return TRUE;
}
-/**
- * Emit set-if_not_equal. See emit_SGT for comments.
- */
-static boolean
-emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
-{
- int ch;
-
- spe_comment(gen->f, -4, "SNE:");
-
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
- /* d = (s1 != s2) */
- spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
- spe_nor(gen->f, d_reg, d_reg, d_reg);
-
- /* convert d from 0x0/0xffffffff to 0.0/1.0 */
- /* d = d & one_reg */
- spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
- }
-
- return true;
-}
/**
- * Emit compare. See emit_SGT for comments.
+ * Emit compare.
*/
static boolean
emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
@@ -1181,26 +1131,24 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_comment(gen->f, -4, "CMP:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- int zero_reg = get_itemp(gen);
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int zero_reg = get_itemp(gen);
- spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+ spe_zero(gen->f, zero_reg);
- /* d = (s1 < 0) ? s2 : s3 */
- spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
- spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
+ /* d = (s1 < 0) ? s2 : s3 */
+ spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+ spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
}
- return true;
+ return TRUE;
}
/**
@@ -1211,29 +1159,34 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s1_reg[4], d_reg[4];
spe_comment(gen->f, -4, "TRUNC:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
- /* Convert float to int */
- spe_cflts(gen->f, d_reg, s1_reg, 0);
+ /* Convert float to int */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_cflts(gen->f, d_reg[ch], s1_reg[ch], 0);
+ }
- /* Convert int to float */
- spe_csflt(gen->f, d_reg, d_reg, 0);
+ /* Convert int to float */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_csflt(gen->f, d_reg[ch], d_reg[ch], 0);
+ }
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
- return true;
+ free_itemps(gen);
+ return TRUE;
}
+
/**
* Emit floor.
* If negative int subtract one
@@ -1243,77 +1196,103 @@ emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg;
spe_comment(gen->f, -4, "FLR:");
- int zero_reg = get_itemp(gen);
- spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+ zero_reg = get_itemp(gen);
+ spe_zero(gen->f, zero_reg);
+ one_reg = get_const_one_reg(gen);
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- int tmp_reg = get_itemp(gen);
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ tmp_reg[ch] = get_itemp(gen);
+ }
- /* If negative, subtract 1.0 */
- spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
- spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
- spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
+ /* If negative, subtract 1.0 */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]);
+ }
- /* Convert float to int */
- spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
+ /* Convert float to int */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0);
+ }
- /* Convert int to float */
- spe_csflt(gen->f, d_reg, tmp_reg, 0);
+ /* Convert int to float */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_csflt(gen->f, d_reg[ch], tmp_reg[ch], 0);
+ }
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
- return true;
+ free_itemps(gen);
+ return TRUE;
}
+
/**
* Compute frac = Input - FLR(Input)
*/
static boolean
emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg;
spe_comment(gen->f, -4, "FRC:");
- int zero_reg = get_itemp(gen);
- spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+ zero_reg = get_itemp(gen);
+ spe_zero(gen->f, zero_reg);
+ one_reg = get_const_one_reg(gen);
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- int tmp_reg = get_itemp(gen);
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ tmp_reg[ch] = get_itemp(gen);
+ }
- /* If negative, subtract 1.0 */
- spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
- spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
- spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
+ /* If negative, subtract 1.0 */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]);
+ }
- /* Convert float to int */
- spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
+ /* Convert float to int */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0);
+ }
- /* Convert int to float */
- spe_csflt(gen->f, tmp_reg, tmp_reg, 0);
+ /* Convert int to float */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_csflt(gen->f, tmp_reg[ch], tmp_reg[ch], 0);
+ }
- /* d = s1 - FLR(s1) */
- spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+ /* d = s1 - FLR(s1) */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_fs(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]);
+ }
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ /* store result */
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
- return true;
+ free_itemps(gen);
+ return TRUE;
}
@@ -1379,73 +1358,71 @@ emit_function_call(struct codegen *gen,
retval_reg = spe_allocate_available_register(gen->f);
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int d_reg;
- ubyte usedRegs[SPE_NUM_REGS];
- uint i, numUsed;
-
- if (!scalar) {
- for (a = 0; a < num_args; a++) {
- s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ int d_reg;
+ ubyte usedRegs[SPE_NUM_REGS];
+ uint i, numUsed;
+
+ if (!scalar) {
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
}
+ }
- d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- if (!scalar || !func_called) {
- /* for a scalar function, we'll really only call the function once */
+ if (!scalar || !func_called) {
+ /* for a scalar function, we'll really only call the function once */
- numUsed = spe_get_registers_used(gen->f, usedRegs);
- assert(numUsed < gen->frame_size / 16 - 2);
+ numUsed = spe_get_registers_used(gen->f, usedRegs);
+ assert(numUsed < gen->frame_size / 16 - 2);
- /* save registers to stack */
- for (i = 0; i < numUsed; i++) {
- uint reg = usedRegs[i];
- int offset = 2 + i;
- spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
- }
+ /* save registers to stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ int offset = 2 + i;
+ spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
- /* setup function arguments */
- for (a = 0; a < num_args; a++) {
- spe_move(gen->f, 3 + a, s_regs[a]);
- }
+ /* setup function arguments */
+ for (a = 0; a < num_args; a++) {
+ spe_move(gen->f, 3 + a, s_regs[a]);
+ }
- /* branch to function, save return addr */
- spe_brasl(gen->f, SPE_REG_RA, addr);
-
- /* save function's return value */
- if (scalar)
- spe_move(gen->f, retval_reg, 3);
- else
- spe_move(gen->f, d_reg, 3);
-
- /* restore registers from stack */
- for (i = 0; i < numUsed; i++) {
- uint reg = usedRegs[i];
- if (reg != d_reg && reg != retval_reg) {
- int offset = 2 + i;
- spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
- }
- }
+ /* branch to function, save return addr */
+ spe_brasl(gen->f, SPE_REG_RA, addr);
- func_called = TRUE;
- }
+ /* save function's return value */
+ if (scalar)
+ spe_move(gen->f, retval_reg, 3);
+ else
+ spe_move(gen->f, d_reg, 3);
- if (scalar) {
- spe_move(gen->f, d_reg, retval_reg);
+ /* restore registers from stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ if (reg != d_reg && reg != retval_reg) {
+ int offset = 2 + i;
+ spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
}
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
+ func_called = TRUE;
+ }
+
+ if (scalar) {
+ spe_move(gen->f, d_reg, retval_reg);
}
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
}
if (scalar) {
spe_release_register(gen->f, retval_reg);
}
- return true;
+ return TRUE;
}
@@ -1525,11 +1502,9 @@ emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
}
return TRUE;
@@ -1549,35 +1524,31 @@ emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* zero = {0,0,0,0} */
zero_reg = get_itemp(gen);
- spe_load_uint(gen->f, zero_reg, 0);
+ spe_zero(gen->f, zero_reg);
cmp_reg = get_itemp(gen);
/* get src regs */
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
}
/* test if any src regs are < 0 */
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- if (kil_reg >= 0) {
- /* cmp = 0 > src ? : ~0 : 0 */
- spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]);
- /* kil = kil | cmp */
- spe_or(gen->f, kil_reg, kil_reg, cmp_reg);
- }
- else {
- kil_reg = get_itemp(gen);
- /* kil = 0 > src ? : ~0 : 0 */
- spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ if (kil_reg >= 0) {
+ /* cmp = 0 > src ? : ~0 : 0 */
+ spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]);
+ /* kil = kil | cmp */
+ spe_or(gen->f, kil_reg, kil_reg, cmp_reg);
+ }
+ else {
+ kil_reg = get_itemp(gen);
+ /* kil = 0 > src ? : ~0 : 0 */
+ spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]);
}
}
- if (gen->if_nesting) {
+ if (gen->if_nesting || gen->loop_nesting) {
/* may have been a conditional kil */
spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg);
}
@@ -1599,96 +1570,92 @@ emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
/**
- * Emit max. See emit_SGT for comments.
+ * Emit min or max.
*/
static boolean
-emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_MIN_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
spe_comment(gen->f, -4, "MAX:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- tmp_reg[ch] = get_itemp(gen);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ tmp_reg[ch] = get_itemp(gen);
}
/* d = (s0 > s1) ? s0 : s1 */
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ if (inst->Instruction.Opcode == TGSI_OPCODE_MAX)
spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]);
- }
+ else
+ spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]);
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
- }
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
free_itemps(gen);
- return true;
+ return TRUE;
}
+
/**
- * Emit max. See emit_SGT for comments.
+ * Emit code to update the execution mask.
+ * This needs to be done whenever the execution status of a conditional
+ * or loop is changed.
*/
-static boolean
-emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
+static void
+emit_update_exec_mask(struct codegen *gen)
{
- int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
+ const int exec_reg = get_exec_mask_reg(gen);
+ const int cond_reg = gen->cond_mask_reg;
+ const int loop_reg = gen->loop_mask_reg;
- spe_comment(gen->f, -4, "MIN:");
+ spe_comment(gen->f, 0, "Update master execution mask");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- tmp_reg[ch] = get_itemp(gen);
- }
+ if (gen->if_nesting > 0 && gen->loop_nesting > 0) {
+ /* exec_mask = cond_mask & loop_mask */
+ assert(cond_reg > 0);
+ assert(loop_reg > 0);
+ spe_and(gen->f, exec_reg, cond_reg, loop_reg);
}
-
- /* d = (s1 > s0) ? s0 : s1 */
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]);
- }
+ else if (gen->if_nesting > 0) {
+ assert(cond_reg > 0);
+ spe_move(gen->f, exec_reg, cond_reg);
}
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
- }
+ else if (gen->loop_nesting > 0) {
+ assert(loop_reg > 0);
+ spe_move(gen->f, exec_reg, loop_reg);
}
-
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
- }
+ else {
+ spe_load_int(gen->f, exec_reg, ~0x0);
}
-
- free_itemps(gen);
- return true;
}
+
static boolean
emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
const int channel = 0;
- const int exec_reg = get_exec_mask_reg(gen);
+ int cond_reg;
spe_comment(gen->f, -4, "IF:");
- /* update execution mask with the predicate register */
+ cond_reg = get_cond_mask_reg(gen);
+
+ /* XXX push cond exec mask */
+
+ spe_comment(gen->f, 0, "init conditional exec mask = ~0:");
+ spe_load_int(gen->f, cond_reg, ~0);
+
+ /* update conditional execution mask with the predicate register */
int tmp_reg = get_itemp(gen);
int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
@@ -1696,44 +1663,126 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
/* tmp = !tmp */
spe_complement(gen->f, tmp_reg, tmp_reg);
- /* exec_mask = exec_mask & tmp */
- spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
+ /* cond_mask = cond_mask & tmp */
+ spe_and(gen->f, cond_reg, cond_reg, tmp_reg);
gen->if_nesting++;
+ /* update the master execution mask */
+ emit_update_exec_mask(gen);
+
free_itemps(gen);
- return true;
+ return TRUE;
}
static boolean
emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- const int exec_reg = get_exec_mask_reg(gen);
+ const int cond_reg = get_cond_mask_reg(gen);
spe_comment(gen->f, -4, "ELSE:");
- /* exec_mask = !exec_mask */
- spe_complement(gen->f, exec_reg, exec_reg);
+ spe_comment(gen->f, 0, "cond exec mask = !cond exec mask");
+ spe_complement(gen->f, cond_reg, cond_reg);
+ emit_update_exec_mask(gen);
- return true;
+ return TRUE;
}
static boolean
emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
+ spe_comment(gen->f, -4, "ENDIF:");
+
+ /* XXX todo: pop cond exec mask */
+
+ gen->if_nesting--;
+
+ emit_update_exec_mask(gen);
+
+ return TRUE;
+}
+
+
+static boolean
+emit_BGNLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int exec_reg, loop_reg;
+
+ spe_comment(gen->f, -4, "BGNLOOP:");
+
+ exec_reg = get_exec_mask_reg(gen);
+ loop_reg = get_loop_mask_reg(gen);
+
+ /* XXX push loop_exec mask */
+
+ spe_comment(gen->f, 0*-4, "initialize loop exec mask = ~0");
+ spe_load_int(gen->f, loop_reg, ~0x0);
+
+ gen->loop_nesting++;
+ gen->loop_start = spe_code_size(gen->f); /* in bytes */
+
+ return TRUE;
+}
+
+
+static boolean
+emit_ENDLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ const int loop_reg = get_loop_mask_reg(gen);
+ const int tmp_reg = get_itemp(gen);
+ int offset;
+
+ spe_comment(gen->f, -4, "ENDLOOP:");
+
+ /* tmp_reg = exec[0] | exec[1] | exec[2] | exec[3] */
+ spe_orx(gen->f, tmp_reg, loop_reg);
+
+ offset = gen->loop_start - spe_code_size(gen->f); /* in bytes */
+
+ /* branch back to top of loop if tmp_reg != 0 */
+ spe_brnz(gen->f, tmp_reg, offset / 4);
+
+ /* XXX pop loop_exec mask */
+
+ gen->loop_nesting--;
+
+ emit_update_exec_mask(gen);
+
+ return TRUE;
+}
+
+
+static boolean
+emit_BRK(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
const int exec_reg = get_exec_mask_reg(gen);
+ const int loop_reg = get_loop_mask_reg(gen);
- spe_comment(gen->f, -4, "ENDIF:");
+ spe_comment(gen->f, -4, "BREAK:");
- /* XXX todo: pop execution mask */
+ assert(gen->loop_nesting > 0);
- spe_load_int(gen->f, exec_reg, ~0x0);
+ spe_comment(gen->f, 0, "loop exec mask &= ~master exec mask");
+ spe_andc(gen->f, loop_reg, loop_reg, exec_reg);
- gen->if_nesting--;
- return true;
+ emit_update_exec_mask(gen);
+
+ return TRUE;
+}
+
+
+static boolean
+emit_CONT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ spe_comment(gen->f, -4, "CONT:");
+
+ assert(gen->loop_nesting > 0);
+
+ return TRUE;
}
@@ -1745,28 +1794,26 @@ emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
- for (ch = 0; ch < 4; ch++) {
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+ int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- int t1_reg = get_itemp(gen);
- int t2_reg = get_itemp(gen);
+ int t1_reg = get_itemp(gen);
+ int t2_reg = get_itemp(gen);
- spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
- if (ddx) {
- spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
- }
- else {
- spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
- }
- spe_fs(gen->f, d_reg, t2_reg, t1_reg);
-
- free_itemps(gen);
+ spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
+ if (ddx) {
+ spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
+ }
+ else {
+ spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
}
+ spe_fs(gen->f, d_reg, t2_reg, t1_reg);
+
+ free_itemps(gen);
}
- return true;
+ return TRUE;
}
@@ -1784,7 +1831,7 @@ emit_END(struct codegen *gen)
{
spe_comment(gen->f, -4, "END:");
emit_epilogue(gen);
- return true;
+ return TRUE;
}
@@ -1796,15 +1843,15 @@ emit_instruction(struct codegen *gen,
const struct tgsi_full_instruction *inst)
{
switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_ARL:
+ return emit_ARL(gen, inst);
case TGSI_OPCODE_MOV:
case TGSI_OPCODE_SWZ:
return emit_MOV(gen, inst);
- case TGSI_OPCODE_MUL:
- return emit_MUL(gen, inst);
case TGSI_OPCODE_ADD:
- return emit_ADD(gen, inst);
case TGSI_OPCODE_SUB:
- return emit_SUB(gen, inst);
+ case TGSI_OPCODE_MUL:
+ return emit_binop(gen, inst);
case TGSI_OPCODE_MAD:
return emit_MAD(gen, inst);
case TGSI_OPCODE_LERP:
@@ -1820,29 +1867,22 @@ emit_instruction(struct codegen *gen,
case TGSI_OPCODE_XPD:
return emit_XPD(gen, inst);
case TGSI_OPCODE_RCP:
- return emit_RCP(gen, inst);
case TGSI_OPCODE_RSQ:
- return emit_RSQ(gen, inst);
+ return emit_RCP_RSQ(gen, inst);
case TGSI_OPCODE_ABS:
return emit_ABS(gen, inst);
case TGSI_OPCODE_SGT:
- return emit_SGT(gen, inst);
case TGSI_OPCODE_SLT:
- return emit_SLT(gen, inst);
case TGSI_OPCODE_SGE:
- return emit_SGE(gen, inst);
case TGSI_OPCODE_SLE:
- return emit_SLE(gen, inst);
case TGSI_OPCODE_SEQ:
- return emit_SEQ(gen, inst);
case TGSI_OPCODE_SNE:
- return emit_SNE(gen, inst);
+ return emit_inequality(gen, inst);
case TGSI_OPCODE_CMP:
return emit_CMP(gen, inst);
- case TGSI_OPCODE_MAX:
- return emit_MAX(gen, inst);
case TGSI_OPCODE_MIN:
- return emit_MIN(gen, inst);
+ case TGSI_OPCODE_MAX:
+ return emit_MIN_MAX(gen, inst);
case TGSI_OPCODE_TRUNC:
return emit_TRUNC(gen, inst);
case TGSI_OPCODE_FLR:
@@ -1882,20 +1922,29 @@ emit_instruction(struct codegen *gen,
case TGSI_OPCODE_ENDIF:
return emit_ENDIF(gen, inst);
+ case TGSI_OPCODE_BGNLOOP2:
+ return emit_BGNLOOP(gen, inst);
+ case TGSI_OPCODE_ENDLOOP2:
+ return emit_ENDLOOP(gen, inst);
+ case TGSI_OPCODE_BRK:
+ return emit_BRK(gen, inst);
+ case TGSI_OPCODE_CONT:
+ return emit_CONT(gen, inst);
+
case TGSI_OPCODE_DDX:
- return emit_DDX_DDY(gen, inst, true);
+ return emit_DDX_DDY(gen, inst, TRUE);
case TGSI_OPCODE_DDY:
- return emit_DDX_DDY(gen, inst, false);
+ return emit_DDX_DDY(gen, inst, FALSE);
/* XXX lots more cases to do... */
default:
fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
inst->Instruction.Opcode);
- return false;
+ return FALSE;
}
- return true;
+ return TRUE;
}
@@ -1923,10 +1972,14 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1];
}
else {
+ char str[100];
int reg = spe_allocate_available_register(gen->f);
if (reg < 0)
- return false;
+ return FALSE;
+
+ sprintf(str, "init $%d = %f", reg, val);
+ spe_comment(gen->f, 0, str);
/* update immediate map */
gen->imm_regs[gen->num_imm][ch] = reg;
@@ -1938,7 +1991,7 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
gen->num_imm++;
- return true;
+ return TRUE;
}
@@ -1963,7 +2016,7 @@ emit_declaration(struct cell_context *cell,
for (ch = 0; ch < 4; ch++) {
gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
if (gen->temp_regs[i][ch] < 0)
- return false; /* out of regs */
+ return FALSE; /* out of regs */
}
/* XXX if we run out of SPE registers, we need to spill
@@ -1983,7 +2036,7 @@ emit_declaration(struct cell_context *cell,
; /* ignore */
}
- return true;
+ return TRUE;
}
@@ -2019,7 +2072,7 @@ cell_gen_fragment_program(struct cell_context *cell,
spe_allocate_register(f, gen.constants_reg);
if (cell->debug_flags & CELL_DEBUG_ASM) {
- spe_print_code(f, true);
+ spe_print_code(f, TRUE);
spe_indent(f, 8);
printf("Begin %s\n", __FUNCTION__);
tgsi_dump(tokens, 0);
@@ -2035,17 +2088,17 @@ cell_gen_fragment_program(struct cell_context *cell,
switch (parse.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE:
if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
- gen.error = true;
+ gen.error = TRUE;
break;
case TGSI_TOKEN_TYPE_DECLARATION:
if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
- gen.error = true;
+ gen.error = TRUE;
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
- gen.error = true;
+ gen.error = TRUE;
break;
default:
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index d223557950..6fc2257e2a 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -81,8 +81,12 @@ cell_get_param(struct pipe_screen *screen, int param)
return 8; /* max 128x128x128 */
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
return CELL_MAX_TEXTURE_LEVELS;
+ case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+ return 1; /* XXX not really true */
+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+ return 0; /* XXX to do */
default:
- return 10;
+ return 0;
}
}
@@ -108,7 +112,7 @@ cell_get_paramf(struct pipe_screen *screen, int param)
return 16.0; /* arbitrary */
default:
- return 10;
+ return 0;
}
}
diff --git a/src/gallium/drivers/cell/spu/spu_shuffle.h b/src/gallium/drivers/cell/spu/spu_shuffle.h
new file mode 100644
index 0000000000..7cbdb814d2
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_shuffle.h
@@ -0,0 +1,186 @@
+#ifndef SPU_SHUFFLE_H
+#define SPU_SHUFFLE_H
+
+/*
+ * Generate shuffle patterns with minimal fuss.
+ *
+ * Based on ideas from
+ * http://www.insomniacgames.com/tech/articles/0408/files/shuffles.pdf
+ *
+ * A-P indicates 0-15th position in first vector
+ * a-p indicates 0-15th position in second vector
+ *
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * |00|01|02|03|04|05|06|07|08|09|0a|0b|0c|0d|0e|0f|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * | A| B| C| D|
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | A| B| C| D| E| F| G| H|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * | A| B| C| D| E| F| G| H| I| J| K| L| M| N| O| P|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ *
+ * x or X indicates 0xff
+ * 8 indicates 0x80
+ * 0 indicates 0x00
+ *
+ * The macros SHUFFLE4() SHUFFLE8() and SHUFFLE16() provide a const vector
+ * unsigned char literal suitable for use with spu_shuffle().
+ *
+ * The macros SHUFB4() SHUFB8() and SHUFB16() provide a const qword vector
+ * literal suitable for use with si_shufb().
+ *
+ *
+ * For example :
+ * SHUFB4(A,A,A,A)
+ * expands to :
+ * ((const qword){0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3})
+ *
+ * SHUFFLE8(A,B,a,b,C,c,8,8)
+ * expands to :
+ * ((const vector unsigned char){0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,
+ * 0x04,0x05,0x14,0x15,0xe0,0xe0,0xe0,0xe0})
+ *
+ */
+
+#include <spu_intrinsics.h>
+
+#define SHUFFLE_PATTERN_4_A__ 0x00, 0x01, 0x02, 0x03
+#define SHUFFLE_PATTERN_4_B__ 0x04, 0x05, 0x06, 0x07
+#define SHUFFLE_PATTERN_4_C__ 0x08, 0x09, 0x0a, 0x0b
+#define SHUFFLE_PATTERN_4_D__ 0x0c, 0x0d, 0x0e, 0x0f
+#define SHUFFLE_PATTERN_4_a__ 0x10, 0x11, 0x12, 0x13
+#define SHUFFLE_PATTERN_4_b__ 0x14, 0x15, 0x16, 0x17
+#define SHUFFLE_PATTERN_4_c__ 0x18, 0x19, 0x1a, 0x1b
+#define SHUFFLE_PATTERN_4_d__ 0x1c, 0x1d, 0x1e, 0x1f
+#define SHUFFLE_PATTERN_4_X__ 0xc0, 0xc0, 0xc0, 0xc0
+#define SHUFFLE_PATTERN_4_x__ 0xc0, 0xc0, 0xc0, 0xc0
+#define SHUFFLE_PATTERN_4_0__ 0x80, 0x80, 0x80, 0x80
+#define SHUFFLE_PATTERN_4_8__ 0xe0, 0xe0, 0xe0, 0xe0
+
+#define SHUFFLE_VECTOR_4__(A, B, C, D) \
+ SHUFFLE_PATTERN_4_##A##__, \
+ SHUFFLE_PATTERN_4_##B##__, \
+ SHUFFLE_PATTERN_4_##C##__, \
+ SHUFFLE_PATTERN_4_##D##__
+
+#define SHUFFLE4(A, B, C, D) \
+ ((const vector unsigned char){ \
+ SHUFFLE_VECTOR_4__(A, B, C, D) \
+ })
+
+#define SHUFB4(A, B, C, D) \
+ ((const qword){ \
+ SHUFFLE_VECTOR_4__(A, B, C, D) \
+ })
+
+
+#define SHUFFLE_PATTERN_8_A__ 0x00, 0x01
+#define SHUFFLE_PATTERN_8_B__ 0x02, 0x03
+#define SHUFFLE_PATTERN_8_C__ 0x04, 0x05
+#define SHUFFLE_PATTERN_8_D__ 0x06, 0x07
+#define SHUFFLE_PATTERN_8_E__ 0x08, 0x09
+#define SHUFFLE_PATTERN_8_F__ 0x0a, 0x0b
+#define SHUFFLE_PATTERN_8_G__ 0x0c, 0x0d
+#define SHUFFLE_PATTERN_8_H__ 0x0e, 0x0f
+#define SHUFFLE_PATTERN_8_a__ 0x10, 0x11
+#define SHUFFLE_PATTERN_8_b__ 0x12, 0x13
+#define SHUFFLE_PATTERN_8_c__ 0x14, 0x15
+#define SHUFFLE_PATTERN_8_d__ 0x16, 0x17
+#define SHUFFLE_PATTERN_8_e__ 0x18, 0x19
+#define SHUFFLE_PATTERN_8_f__ 0x1a, 0x1b
+#define SHUFFLE_PATTERN_8_g__ 0x1c, 0x1d
+#define SHUFFLE_PATTERN_8_h__ 0x1e, 0x1f
+#define SHUFFLE_PATTERN_8_X__ 0xc0, 0xc0
+#define SHUFFLE_PATTERN_8_x__ 0xc0, 0xc0
+#define SHUFFLE_PATTERN_8_0__ 0x80, 0x80
+#define SHUFFLE_PATTERN_8_8__ 0xe0, 0xe0
+
+
+#define SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \
+ SHUFFLE_PATTERN_8_##A##__, \
+ SHUFFLE_PATTERN_8_##B##__, \
+ SHUFFLE_PATTERN_8_##C##__, \
+ SHUFFLE_PATTERN_8_##D##__, \
+ SHUFFLE_PATTERN_8_##E##__, \
+ SHUFFLE_PATTERN_8_##F##__, \
+ SHUFFLE_PATTERN_8_##G##__, \
+ SHUFFLE_PATTERN_8_##H##__
+
+#define SHUFFLE8(A, B, C, D, E, F, G, H) \
+ ((const vector unsigned char){ \
+ SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \
+ })
+
+#define SHUFB8(A, B, C, D, E, F, G, H) \
+ ((const qword){ \
+ SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \
+ })
+
+
+#define SHUFFLE_PATTERN_16_A__ 0x00
+#define SHUFFLE_PATTERN_16_B__ 0x01
+#define SHUFFLE_PATTERN_16_C__ 0x02
+#define SHUFFLE_PATTERN_16_D__ 0x03
+#define SHUFFLE_PATTERN_16_E__ 0x04
+#define SHUFFLE_PATTERN_16_F__ 0x05
+#define SHUFFLE_PATTERN_16_G__ 0x06
+#define SHUFFLE_PATTERN_16_H__ 0x07
+#define SHUFFLE_PATTERN_16_I__ 0x08
+#define SHUFFLE_PATTERN_16_J__ 0x09
+#define SHUFFLE_PATTERN_16_K__ 0x0a
+#define SHUFFLE_PATTERN_16_L__ 0x0b
+#define SHUFFLE_PATTERN_16_M__ 0x0c
+#define SHUFFLE_PATTERN_16_N__ 0x0d
+#define SHUFFLE_PATTERN_16_O__ 0x0e
+#define SHUFFLE_PATTERN_16_P__ 0x0f
+#define SHUFFLE_PATTERN_16_a__ 0x10
+#define SHUFFLE_PATTERN_16_b__ 0x11
+#define SHUFFLE_PATTERN_16_c__ 0x12
+#define SHUFFLE_PATTERN_16_d__ 0x13
+#define SHUFFLE_PATTERN_16_e__ 0x14
+#define SHUFFLE_PATTERN_16_f__ 0x15
+#define SHUFFLE_PATTERN_16_g__ 0x16
+#define SHUFFLE_PATTERN_16_h__ 0x17
+#define SHUFFLE_PATTERN_16_i__ 0x18
+#define SHUFFLE_PATTERN_16_j__ 0x19
+#define SHUFFLE_PATTERN_16_k__ 0x1a
+#define SHUFFLE_PATTERN_16_l__ 0x1b
+#define SHUFFLE_PATTERN_16_m__ 0x1c
+#define SHUFFLE_PATTERN_16_n__ 0x1d
+#define SHUFFLE_PATTERN_16_o__ 0x1e
+#define SHUFFLE_PATTERN_16_p__ 0x1f
+#define SHUFFLE_PATTERN_16_X__ 0xc0
+#define SHUFFLE_PATTERN_16_x__ 0xc0
+#define SHUFFLE_PATTERN_16_0__ 0x80
+#define SHUFFLE_PATTERN_16_8__ 0xe0
+
+#define SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+ SHUFFLE_PATTERN_16_##A##__, \
+ SHUFFLE_PATTERN_16_##B##__, \
+ SHUFFLE_PATTERN_16_##C##__, \
+ SHUFFLE_PATTERN_16_##D##__, \
+ SHUFFLE_PATTERN_16_##E##__, \
+ SHUFFLE_PATTERN_16_##F##__, \
+ SHUFFLE_PATTERN_16_##G##__, \
+ SHUFFLE_PATTERN_16_##H##__, \
+ SHUFFLE_PATTERN_16_##I##__, \
+ SHUFFLE_PATTERN_16_##J##__, \
+ SHUFFLE_PATTERN_16_##K##__, \
+ SHUFFLE_PATTERN_16_##L##__, \
+ SHUFFLE_PATTERN_16_##M##__, \
+ SHUFFLE_PATTERN_16_##N##__, \
+ SHUFFLE_PATTERN_16_##O##__, \
+ SHUFFLE_PATTERN_16_##P
+
+#define SHUFFLE16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+ ((const vector unsigned char){ \
+ SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+ })
+
+#define SHUFB16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+ ((const qword){ \
+ SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+ })
+
+#endif
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 22e51a86ae..322be1252e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -35,6 +35,7 @@
#include "util/u_math.h"
#include "spu_colorpack.h"
#include "spu_main.h"
+#include "spu_shuffle.h"
#include "spu_texture.h"
#include "spu_tile.h"
#include "spu_tri.h"
@@ -76,8 +77,13 @@ struct vertex_header {
* Triangle edge info
*/
struct edge {
- float dx; /**< X(v1) - X(v0), used only during setup */
- float dy; /**< Y(v1) - Y(v0), used only during setup */
+ union {
+ struct {
+ float dx; /**< X(v1) - X(v0), used only during setup */
+ float dy; /**< Y(v1) - Y(v0), used only during setup */
+ };
+ vec_float4 ds; /**< vector accessor for dx and dy */
+ };
float dxdy; /**< dx/dy */
float sx, sy; /**< first sample point coord */
int lines; /**< number of lines on this edge */
@@ -102,10 +108,15 @@ struct setup_stage {
* turn. Currently fixed at 4 floats, but should change in time.
* Codegen will help cope with this.
*/
- const struct vertex_header *vmax;
- const struct vertex_header *vmid;
- const struct vertex_header *vmin;
- const struct vertex_header *vprovoke;
+ union {
+ struct {
+ const struct vertex_header *vmin;
+ const struct vertex_header *vmid;
+ const struct vertex_header *vmax;
+ const struct vertex_header *vprovoke;
+ };
+ qword vertex_headers;
+ };
struct edge ebot;
struct edge etop;
@@ -122,8 +133,7 @@ struct setup_stage {
struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
struct {
- int left[2]; /**< [0] = row0, [1] = row1 */
- int right[2];
+ vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */
int y;
unsigned y_flags;
unsigned mask; /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
@@ -306,52 +316,35 @@ block(int x)
/**
- * Compute mask which indicates which pixels in the 2x2 quad are actually inside
- * the triangle's bounds.
- * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
- */
-static INLINE mask_t
-calculate_mask(int x)
-{
- /* This is a little tricky.
- * Use & instead of && to avoid branches.
- * Use negation to convert true/false to ~0/0 values.
- */
- mask_t mask;
- mask = spu_insert(-((x >= setup.span.left[0]) & (x < setup.span.right[0])), mask, 0);
- mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
- mask = spu_insert(-((x >= setup.span.left[1]) & (x < setup.span.right[1])), mask, 2);
- mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
- return mask;
-}
-
-
-/**
* Render a horizontal span of quads
*/
static void
flush_spans(void)
{
int minleft, maxright;
- int x;
+
+ const int l0 = spu_extract(setup.span.quad, 0);
+ const int l1 = spu_extract(setup.span.quad, 1);
+ const int r0 = spu_extract(setup.span.quad, 2);
+ const int r1 = spu_extract(setup.span.quad, 3);
switch (setup.span.y_flags) {
case 0x3:
/* both odd and even lines written (both quad rows) */
- minleft = MIN2(setup.span.left[0], setup.span.left[1]);
- maxright = MAX2(setup.span.right[0], setup.span.right[1]);
+ minleft = MIN2(l0, l1);
+ maxright = MAX2(r0, r1);
break;
case 0x1:
/* only even line written (quad top row) */
- minleft = setup.span.left[0];
- maxright = setup.span.right[0];
+ minleft = l0;
+ maxright = r0;
break;
case 0x2:
/* only odd line written (quad bottom row) */
- minleft = setup.span.left[1];
- maxright = setup.span.right[1];
+ minleft = l1;
+ maxright = r1;
break;
default:
@@ -389,17 +382,42 @@ flush_spans(void)
ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
}
- /* XXX this loop could be moved into the above switch cases and
- * calculate_mask() could be simplified a bit...
- */
- for (x = block(minleft); x <= block(maxright); x += 2) {
- emit_quad( x, setup.span.y, calculate_mask( x ));
+ /* XXX this loop could be moved into the above switch cases... */
+
+ /* Setup for mask calculation */
+ const vec_int4 quad_LlRr = setup.span.quad;
+ const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
+ const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
+ const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));
+
+ const vec_int4 twos = spu_splats(2);
+
+ const int x = block(minleft);
+ vec_int4 xs = {x, x+1, x, x+1};
+
+ for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
+ /**
+ * Computes mask to indicate which pixels in the 2x2 quad are actually
+ * inside the triangle's bounds.
+ */
+
+ /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
+ const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
+ const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs);
+
+ /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
+ const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);
+
+ /* Combine results to create mask */
+ const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);
+
+ emit_quad(spu_extract(xs, 0), setup.span.y, mask);
}
setup.span.y = 0;
setup.span.y_flags = 0;
- setup.span.right[0] = 0;
- setup.span.right[1] = 0;
+ /* Zero right elements */
+ setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
}
@@ -444,55 +462,39 @@ setup_sort_vertices(const struct vertex_header *v0,
/* determine bottom to top order of vertices */
{
- float y0 = spu_extract(v0->data[0], 1);
- float y1 = spu_extract(v1->data[0], 1);
- float y2 = spu_extract(v2->data[0], 1);
- if (y0 <= y1) {
- if (y1 <= y2) {
- /* y0<=y1<=y2 */
- setup.vmin = v0;
- setup.vmid = v1;
- setup.vmax = v2;
- sign = -1.0f;
- }
- else if (y2 <= y0) {
- /* y2<=y0<=y1 */
- setup.vmin = v2;
- setup.vmid = v0;
- setup.vmax = v1;
- sign = -1.0f;
- }
- else {
- /* y0<=y2<=y1 */
- setup.vmin = v0;
- setup.vmid = v2;
- setup.vmax = v1;
- sign = 1.0f;
- }
- }
- else {
- if (y0 <= y2) {
- /* y1<=y0<=y2 */
- setup.vmin = v1;
- setup.vmid = v0;
- setup.vmax = v2;
- sign = 1.0f;
- }
- else if (y2 <= y1) {
- /* y2<=y1<=y0 */
- setup.vmin = v2;
- setup.vmid = v1;
- setup.vmax = v0;
- sign = 1.0f;
- }
- else {
- /* y1<=y2<=y0 */
- setup.vmin = v1;
- setup.vmid = v2;
- setup.vmax = v0;
- sign = -1.0f;
- }
- }
+ /* A table of shuffle patterns for putting vertex_header pointers into
+ correct order. Quite magical. */
+ const vec_uchar16 sort_order_patterns[] = {
+ SHUFFLE4(A,B,C,C),
+ SHUFFLE4(C,A,B,C),
+ SHUFFLE4(A,C,B,C),
+ SHUFFLE4(B,C,A,C),
+ SHUFFLE4(B,A,C,C),
+ SHUFFLE4(C,B,A,C) };
+
+ /* The vertex_header pointers, packed for easy shuffling later */
+ const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2};
+
+ /* Collate y values into two vectors for comparison.
+ Using only one shuffle constant! ;) */
+ const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C));
+ const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C));
+ const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C));
+ const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C));
+
+ /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
+ const vec_uint4 compare = spu_cmpgt(y_012, y_120);
+ /* Compress the result of the comparison into 4 bits */
+ const vec_uint4 gather = spu_gather(compare);
+ /* Subtract one to attain the index into the LUT. Magical. */
+ const unsigned int index = spu_extract(gather, 0) - 1;
+
+ /* Load the appropriate pattern and construct the desired vector. */
+ setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]);
+
+ /* Using the result of the comparison, set sign.
+ Very magical. */
+ sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f);
}
/* Check if triangle is completely outside the tile bounds */
@@ -509,12 +511,9 @@ setup_sort_vertices(const struct vertex_header *v0,
spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
return FALSE;
- setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
- setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
- setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
- setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
- setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
- setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
+ setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]);
+ setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]);
+ setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]);
/*
* Compute triangle's area. Use 1/area to compute partial
@@ -535,8 +534,6 @@ setup_sort_vertices(const struct vertex_header *v0,
setup.facing = (area * sign > 0.0f)
^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
- setup.vprovoke = v2;
-
return TRUE;
}
@@ -746,9 +743,11 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
setup.span.y = block(_y);
}
- setup.span.left[_y&1] = left;
- setup.span.right[_y&1] = right;
- setup.span.y_flags |= 1<<(_y&1);
+ int offset = _y&1;
+ vec_int4 quad_LlRr = {left, left, right, right};
+ /* Store left and right in 0 or 1 row of quad based on offset */
+ setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset));
+ setup.span.y_flags |= 1<<offset;
}
}
@@ -790,8 +789,8 @@ tri_draw(const float *v0, const float *v1, const float *v2,
setup.span.y = 0;
setup.span.y_flags = 0;
- setup.span.right[0] = 0;
- setup.span.right[1] = 0;
+ /* Zero right elements */
+ setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
if (setup.oneOverArea < 0.0) {
/* emaj on left */