diff options
Diffstat (limited to 'src/gallium/drivers')
19 files changed, 2236 insertions, 514 deletions
| diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index c0ca201e1d..cb0631baf5 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -84,7 +84,7 @@  #define CELL_CMD_BATCH                5  #define CELL_CMD_RELEASE_VERTS        6  #define CELL_CMD_STATE_FRAMEBUFFER   10 -#define CELL_CMD_STATE_DEPTH_STENCIL 11 +#define CELL_CMD_STATE_FRAGMENT_OPS  11  #define CELL_CMD_STATE_SAMPLER       12  #define CELL_CMD_STATE_TEXTURE       13  #define CELL_CMD_STATE_VERTEX_INFO   14 @@ -92,9 +92,8 @@  #define CELL_CMD_STATE_UNIFORMS      16  #define CELL_CMD_STATE_VS_ARRAY_INFO 17  #define CELL_CMD_STATE_BIND_VS       18 -#define CELL_CMD_STATE_BLEND         19 +#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19  #define CELL_CMD_STATE_ATTRIB_FETCH  20 -#define CELL_CMD_STATE_LOGICOP       21  #define CELL_CMD_VS_EXECUTE          22  #define CELL_CMD_FLUSH_BUFFER_RANGE  23 @@ -110,29 +109,34 @@  #define CELL_DEBUG_SYNC     (1 << 1) -/** - */ -struct cell_command_depth_stencil_alpha_test { -   uint64_t base;               /**< Effective address of code start. */ -   unsigned size;               /**< Size in bytes of SPE code. */ -   unsigned read_depth;         /**< Flag: should depth be read? */ -   unsigned read_stencil;       /**< Flag: should stencil be read? */ -}; + +/** Max instructions for doing per-fragment operations */ +#define SPU_MAX_FRAGMENT_OPS_INSTS 64  /** - * Upload code to perform framebuffer blend operation + * Command to specify per-fragment operations state and generated code.   */ -struct cell_command_blend { -   uint64_t base;               /**< Effective address of code start. */ -   unsigned size;               /**< Size in bytes of SPE code. */ -   unsigned read_fb;            /**< Flag: should framebuffer be read? 
*/ +struct cell_command_fragment_ops +{ +   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */ +   struct pipe_depth_stencil_alpha_state dsa; +   struct pipe_blend_state blend; +   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];  }; -struct cell_command_logicop { -   uint64_t base;               /**< Effective address of code start. */ -   unsigned size;               /**< Size in bytes of SPE code. */ +/** Max instructions for fragment programs */ +#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128 + +/** + * Command to send a fragment progra to SPUs. + */ +struct cell_command_fragment_program +{ +   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ +   uint num_inst;        /**< Number of instructions */ +   unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];  }; @@ -172,13 +176,15 @@ struct cell_array_info  }; -struct cell_attribute_fetch_code { +struct cell_attribute_fetch_code +{     uint64_t base;     uint size;  }; -struct cell_buffer_range { +struct cell_buffer_range +{     uint64_t base;     unsigned size;  }; diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index 25473e200c..b28f4c5c31 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -25,9 +25,10 @@ SOURCES = \  	cell_context.c \  	cell_draw_arrays.c \  	cell_flush.c \ +	cell_gen_fragment.c \ +	cell_gen_fp.c \  	cell_state_derived.c \  	cell_state_emit.c \ -	cell_state_per_fragment.c \  	cell_state_shader.c \  	cell_pipe_state.c \  	cell_screen.c \ diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 8cec9f45b2..14914b9c6f 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -61,6 +61,7 @@ struct cell_fragment_shader_state  {     struct pipe_shader_state shader;     struct tgsi_shader_info info; +   struct spe_function code;     void *data;  }; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c 
b/src/gallium/drivers/cell/ppu/cell_gen_fp.c new file mode 100644 index 0000000000..6ffe94eb14 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -0,0 +1,523 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + + +/** + * Generate SPU fragment program/shader code. + * + * Note that we generate SOA-style code here.  So each TGSI instruction + * operates on four pixels (and is translated into four SPU instructions, + * generally speaking). 
+ * + * \author Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "util/u_memory.h" +#include "cell_context.h" +#include "cell_gen_fp.h" + + +/** Set to 1 to enable debug/disassembly printfs */ +#define DISASSEM 01 + + +/** + * Context needed during code generation. + */ +struct codegen +{ +   int inputs_reg;      /**< 1st function parameter */ +   int outputs_reg;     /**< 2nd function parameter */ +   int constants_reg;   /**< 3rd function parameter */ +   int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */ + +   int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */ + +   /** Per-instruction temps / intermediate temps */ +   int num_itemps; +   int itemps[3]; + +   struct spe_function *f; +   boolean error; +}; + + +/** + * Allocate an intermediate temporary register. + */ +static int +get_itemp(struct codegen *gen) +{ +   int t = spe_allocate_available_register(gen->f); +   assert(gen->num_itemps < Elements(gen->itemps)); +   gen->itemps[gen->num_itemps++] = t; +   return t; +} + +/** + * Free all intermediate temporary registers.  To be called after each + * instruction has been emitted. + */ +static void +free_itemps(struct codegen *gen) +{ +   int i; +   for (i = 0; i < gen->num_itemps; i++) { +      spe_release_register(gen->f, gen->itemps[i]); +   } +   gen->num_itemps = 0; +} + + +/** + * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}. + * The register is allocated and initialized upon the first call. 
+ */ +static int +get_const_one_reg(struct codegen *gen) +{ +   if (gen->one_reg <= 0) { +      gen->one_reg = spe_allocate_available_register(gen->f); +   } + +   /* one = {1.0, 1.0, 1.0, 1.0} */ +   spe_load_float(gen->f, gen->one_reg, 1.0f); +#if DISASSEM +   printf("il\tr%d, 1.0f\n", gen->one_reg); +#endif + +   return gen->one_reg; +} + + +/** + * Return the index of the SPU temporary containing the named TGSI + * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we + * just return the corresponding SPE register.  If the TGIS register + * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register + * and emit an SPE load instruction. + */ +static int +get_src_reg(struct codegen *gen, +            int channel, +            const struct tgsi_full_src_register *src) +{ +   int reg; + +   /* XXX need to examine src swizzle info here. +    * That will involve changing the channel var... +    */ + + +   switch (src->SrcRegister.File) { +   case TGSI_FILE_TEMPORARY: +      reg = gen->temp_regs[src->SrcRegister.Index][channel]; +      break; +   case TGSI_FILE_INPUT: +      { +         /* offset is measured in quadwords, not bytes */ +         int offset = src->SrcRegister.Index * 4 + channel; +         reg = get_itemp(gen); +         /* Load:  reg = memory[(machine_reg) + offset] */ +         spe_lqd(gen->f, reg, gen->inputs_reg, offset); +#if DISASSEM +         printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset); +#endif +      } +      break; +   case TGSI_FILE_IMMEDIATE: +      /* xxx fall-through for now / fix */ +   case TGSI_FILE_CONSTANT: +      /* xxx fall-through for now / fix */ +   default: +      assert(0); +   } + +   return reg; +} + + +/** + * Return the index of an SPE register to use for the given TGSI register. + * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the + * corresponding SPE register is returned.  If the TGSI register is + * TGSI_FILE_OUTPUT we allocate an intermediate temporary register. 
+ * See store_dest_reg() below... + */ +static int +get_dst_reg(struct codegen *gen, +            int channel, +            const struct tgsi_full_dst_register *dest) +{ +   int reg; + +   switch (dest->DstRegister.File) { +   case TGSI_FILE_TEMPORARY: +      reg = gen->temp_regs[dest->DstRegister.Index][channel]; +      break; +   case TGSI_FILE_OUTPUT: +      reg = get_itemp(gen); +      break; +   default: +      assert(0); +   } + +   return reg; +} + + +/** + * When a TGSI instruction is writing to an output register, this + * function emits the SPE store instruction to store the value_reg. + * \param value_reg  the SPE register containing the value to store. + *                   This would have been returned by get_dst_reg(). + */ +static void +store_dest_reg(struct codegen *gen, +               int value_reg, int channel, +               const struct tgsi_full_dst_register *dest) +{ +   switch (dest->DstRegister.File) { +   case TGSI_FILE_TEMPORARY: +      /* no-op */ +      break; +   case TGSI_FILE_OUTPUT: +      { +         /* offset is measured in quadwords, not bytes */ +         int offset = dest->DstRegister.Index * 4 + channel; +         /* Store: memory[(machine_reg) + offset] = reg */ +         spe_stqd(gen->f, value_reg, gen->outputs_reg, offset); +#if DISASSEM +         printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset); +#endif +      } +      break; +   default: +      assert(0); +   } +} + + +static boolean +emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch; +   for (ch = 0; ch < 4; ch++) { +      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { +         int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +         int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +         /* XXX we don't always need to actually emit a mov instruction here */ +         spe_move(gen->f, dst_reg, src_reg); +#if DISASSEM +         printf("mov\tr%d, r%d\n", 
dst_reg, src_reg); +#endif +         store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]); +         free_itemps(gen); +      } +   } +   return true; +} + + +/** + * Emit addition instructions.  Recall that a single TGSI_OPCODE_ADD + * becomes (up to) four SPU "fa" instructions because we're doing SOA + * processing. + */ +static boolean +emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch; +   /* Loop over Red/Green/Blue/Alpha channels */ +   for (ch = 0; ch < 4; ch++) { +      /* If the dest R, G, B or A writemask is enabled... */ +      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { +         /* get indexes of the two src, one dest SPE registers */ +         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); +         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + +         /* Emit actual SPE instruction: d = s1 + s2 */ +         spe_fa(gen->f, d_reg, s1_reg, s2_reg); +#if DISASSEM +         printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); +#endif + +         /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ +         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); +         /* Free any intermediate temps we allocated */ +         free_itemps(gen); +      } +   } +   return true; +} + + +/** + * Emit multiply.  See emit_ADD for comments. 
+ */ +static boolean +emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch; +   for (ch = 0; ch < 4; ch++) { +      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { +         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); +         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +         /* d = s1 * s2 */ +         spe_fm(gen->f, d_reg, s1_reg, s2_reg); +#if DISASSEM +         printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); +#endif +         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); +         free_itemps(gen); +      } +   } +   return true; +} + + +/** + * Emit set-if-greater-than. + * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as + * the result but OpenGL/TGSI needs 0.0 and 1.0 results. + * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. + */ +static boolean +emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch; + +   for (ch = 0; ch < 4; ch++) { +      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { +         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); +         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + +         /* d = (s1 > s2) */ +         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); +#if DISASSEM +         printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); +#endif + +         /* convert d from 0x0/0xffffffff to 0.0/1.0 */ +         /* d = d & one_reg */ +         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); +#if DISASSEM +         printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen)); +#endif + +         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); +         free_itemps(gen); +      } +   } + +   return true; +} + + +/** + * Emit END 
instruction. + * We just return from the shader function at this point. + * + * Note that there may be more code after this that would be + * called by TGSI_OPCODE_CALL. + */ +static boolean +emit_END(struct codegen *gen) +{ +   /* return from function call */ +   spe_bi(gen->f, SPE_REG_RA, 0, 0); +#if DISASSEM +   printf("bi\trRA\n"); +#endif +   return true; +} + + +/** + * Emit code for the given instruction.  Just a big switch stmt. + */ +static boolean +emit_instruction(struct codegen *gen, +                 const struct tgsi_full_instruction *inst) +{ +   switch (inst->Instruction.Opcode) { +   case TGSI_OPCODE_MOV: +      return emit_MOV(gen, inst); +   case TGSI_OPCODE_MUL: +      return emit_MUL(gen, inst); +   case TGSI_OPCODE_ADD: +      return emit_ADD(gen, inst); +   case TGSI_OPCODE_SGT: +      return emit_SGT(gen, inst); +   case TGSI_OPCODE_END: +      return emit_END(gen); + +   /* XXX lots more cases to do... */ + +   default: +      return false; +   } + +   return true; +} + + + +/** + * Emit "code" for a TGSI declaration. + * We only care about TGSI TEMPORARY register declarations at this time. + * For each TGSI TEMPORARY we allocate four SPE registers. + */ +static void +emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl) +{ +   int i, ch; + +   switch (decl->Declaration.File) { +   case TGSI_FILE_TEMPORARY: +#if DISASSEM +      printf("Declare temp reg %d .. %d\n", +             decl->DeclarationRange.First, +             decl->DeclarationRange.Last); +#endif +      for (i = decl->DeclarationRange.First; +           i <= decl->DeclarationRange.Last; +           i++) { +         for (ch = 0; ch < 4; ch++) { +            gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); +         } + +         /* XXX if we run out of SPE registers, we need to spill +          * to SPU memory.  someday... 
+          */ + +#if DISASSEM +         printf("  SPE regs: %d %d %d %d\n", +                gen->temp_regs[i][0], +                gen->temp_regs[i][1], +                gen->temp_regs[i][2], +                gen->temp_regs[i][3]); +#endif +      } +      break; +   default: +      ; /* ignore */ +   } +} + + +/** + * Translate TGSI shader code to SPE instructions.  This is done when + * the state tracker gives us a new shader (via pipe->create_fs_state()). + * + * \param cell    the rendering context (in) + * \param tokens  the TGSI shader (in) + * \param f       the generated function (out) + */ +boolean +cell_gen_fragment_program(struct cell_context *cell, +                          const struct tgsi_token *tokens, +                          struct spe_function *f) +{ +   struct tgsi_parse_context parse; +   struct codegen gen; + +   memset(&gen, 0, sizeof(gen)); +   gen.f = f; + +   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ +   gen.inputs_reg = 3;     /* pointer to inputs array */ +   gen.outputs_reg = 4;    /* pointer to outputs array */ +   gen.constants_reg = 5;  /* pointer to constants array */ + +   spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); +   spe_allocate_register(f, gen.inputs_reg); +   spe_allocate_register(f, gen.outputs_reg); +   spe_allocate_register(f, gen.constants_reg); + +#if DISASSEM +   printf("Begin %s\n", __FUNCTION__); +   tgsi_dump(tokens, 0); +#endif + +   tgsi_parse_init(&parse, tokens); + +   while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) { +      tgsi_parse_token(&parse); + +      switch (parse.FullToken.Token.Type) { +      case TGSI_TOKEN_TYPE_IMMEDIATE: +#if 0 +         if (!note_immediate(&gen, &parse.FullToken.FullImmediate )) +            goto fail; +#endif +         break; + +      case TGSI_TOKEN_TYPE_DECLARATION: +         emit_declaration(&gen, &parse.FullToken.FullDeclaration); +         break; + +      case TGSI_TOKEN_TYPE_INSTRUCTION: +         if 
(!emit_instruction(&gen, &parse.FullToken.FullInstruction )) { +            gen.error = true; +         } +         break; + +      default: +         assert(0); + +      } +   } + + +   if (gen.error) { +      /* terminate the SPE code */ +      return emit_END(&gen); +   } + +#if DISASSEM +   printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); +   printf("End %s\n", __FUNCTION__); +#endif + +   tgsi_parse_free( &parse ); + +   return !gen.error; +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h new file mode 100644 index 0000000000..99faea7046 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h @@ -0,0 +1,42 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + + +#ifndef CELL_GEN_FP_H +#define CELL_GEN_FP_H + + + +extern boolean +cell_gen_fragment_program(struct cell_context *cell, +                          const struct tgsi_token *tokens, +                          struct spe_function *f); + + +#endif /* CELL_GEN_FP_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c new file mode 100644 index 0000000000..06219d4e98 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -0,0 +1,870 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + + +/** + * Generate SPU per-fragment code (actually per-quad code). + * \author Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "cell_context.h" +#include "cell_gen_fragment.h" + + + +/** Do extra optimizations? */ +#define OPTIMIZATIONS 1 + + +/** + * Generate SPE code to perform Z/depth testing. + * + * \param dsa         Gallium depth/stencil/alpha state to gen code for + * \param f           SPE function to append instruction onto. + * \param mask_reg    register containing quad/pixel "alive" mask (in/out) + * \param ifragZ_reg  register containing integer fragment Z values (in) + * \param ifbZ_reg    register containing integer frame buffer Z values (in/out) + * \param zmask_reg   register containing result of Z test/comparison (out) + */ +static void +gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, +               struct spe_function *f, +               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) +{ +   ASSERT(dsa->depth.enabled); + +   switch (dsa->depth.func) { +   case PIPE_FUNC_EQUAL: +      /* zmask = (ifragZ == ref) */ +      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); +      /* mask = (mask & zmask) */ +      spe_and(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_NOTEQUAL: +      /* zmask = (ifragZ == ref) */ +      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); +      /* mask = (mask & ~zmask) */ +      spe_andc(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_GREATER: +      /* zmask = (ifragZ > ref) */ +      spe_cgt(f, zmask_reg, ifragZ_reg, 
ifbZ_reg); +      /* mask = (mask & zmask) */ +      spe_and(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_LESS: +      /* zmask = (ref > ifragZ) */ +      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); +      /* mask = (mask & zmask) */ +      spe_and(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_LEQUAL: +      /* zmask = (ifragZ > ref) */ +      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); +      /* mask = (mask & ~zmask) */ +      spe_andc(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_GEQUAL: +      /* zmask = (ref > ifragZ) */ +      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); +      /* mask = (mask & ~zmask) */ +      spe_andc(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_NEVER: +      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */ +      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */ +      break; + +   case PIPE_FUNC_ALWAYS: +      /* mask unchanged */ +      spe_il(f, zmask_reg, ~0);  /* zmask = {~0,~0,~0,~0} */ +      break; + +   default: +      ASSERT(0); +      break; +   } + +   if (dsa->depth.writemask) { +      /* +       * If (ztest passed) { +       *    framebufferZ = fragmentZ; +       * } +       * OR, +       * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ; +       */ +      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg); +   } +} + + +/** + * Generate SPE code to perform alpha testing. + * + * \param dsa        Gallium depth/stencil/alpha state to gen code for + * \param f          SPE function to append instruction onto. 
+ * \param mask_reg   register containing quad/pixel "alive" mask (in/out) + * \param fragA_reg  register containing four fragment alpha values (in) + */ +static void +gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, +               struct spe_function *f, int mask_reg, int fragA_reg) +{ +   int ref_reg = spe_allocate_available_register(f); +   int amask_reg = spe_allocate_available_register(f); + +   ASSERT(dsa->alpha.enabled); + +   if ((dsa->alpha.func != PIPE_FUNC_NEVER) && +       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { +      /* load/splat the alpha reference float value */ +      spe_load_float(f, ref_reg, dsa->alpha.ref); +   } + +   /* emit code to do the alpha comparison, updating 'mask' */ +   switch (dsa->alpha.func) { +   case PIPE_FUNC_EQUAL: +      /* amask = (fragA == ref) */ +      spe_fceq(f, amask_reg, fragA_reg, ref_reg); +      /* mask = (mask & amask) */ +      spe_and(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_NOTEQUAL: +      /* amask = (fragA == ref) */ +      spe_fceq(f, amask_reg, fragA_reg, ref_reg); +      /* mask = (mask & ~amask) */ +      spe_andc(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_GREATER: +      /* amask = (fragA > ref) */ +      spe_fcgt(f, amask_reg, fragA_reg, ref_reg); +      /* mask = (mask & amask) */ +      spe_and(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_LESS: +      /* amask = (ref > fragA) */ +      spe_fcgt(f, amask_reg, ref_reg, fragA_reg); +      /* mask = (mask & amask) */ +      spe_and(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_LEQUAL: +      /* amask = (fragA > ref) */ +      spe_fcgt(f, amask_reg, fragA_reg, ref_reg); +      /* mask = (mask & ~amask) */ +      spe_andc(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_GEQUAL: +      /* amask = (ref > fragA) */ +      spe_fcgt(f, amask_reg, ref_reg, fragA_reg); +      /* mask = (mask & ~amask) */ +      
spe_andc(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_NEVER: +      spe_il(f, mask_reg, 0);  /* mask = [0,0,0,0] */ +      break; + +   case PIPE_FUNC_ALWAYS: +      /* no-op, mask unchanged */ +      break; + +   default: +      ASSERT(0); +      break; +   } + +#if OPTIMIZATIONS +   /* if mask == {0,0,0,0} we're all done, return */ +   { +      /* re-use amask reg here */ +      int tmp_reg = amask_reg; +      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */ +      spe_orx(f, tmp_reg, mask_reg); +      /* if tmp[0] == 0 then return from function call */ +      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0); +   } +#endif + +   spe_release_register(f, ref_reg); +   spe_release_register(f, amask_reg); +} + + + +/** + * Generate SPE code to implement the given blend mode for a quad of pixels. + * \param f          SPE function to append instruction onto. + * \param fragR_reg  register with fragment red values (float) (in/out) + * \param fragG_reg  register with fragment green values (float) (in/out) + * \param fragB_reg  register with fragment blue values (float) (in/out) + * \param fragA_reg  register with fragment alpha values (float) (in/out) + * \param fbRGBA_reg register with packed framebuffer colors (integer) (in) + */ +static void +gen_blend(const struct pipe_blend_state *blend, +          struct spe_function *f, +          enum pipe_format color_format, +          int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg, +          int fbRGBA_reg) +{ +   int term1R_reg = spe_allocate_available_register(f); +   int term1G_reg = spe_allocate_available_register(f); +   int term1B_reg = spe_allocate_available_register(f); +   int term1A_reg = spe_allocate_available_register(f); + +   int term2R_reg = spe_allocate_available_register(f); +   int term2G_reg = spe_allocate_available_register(f); +   int term2B_reg = spe_allocate_available_register(f); +   int term2A_reg = spe_allocate_available_register(f); + +   int fbR_reg = 
spe_allocate_available_register(f); +   int fbG_reg = spe_allocate_available_register(f); +   int fbB_reg = spe_allocate_available_register(f); +   int fbA_reg = spe_allocate_available_register(f); + +   int one_reg = spe_allocate_available_register(f); +   int tmp_reg = spe_allocate_available_register(f); + +   boolean one_reg_set = false; /* avoid setting one_reg more than once */ + +   ASSERT(blend->blend_enable); + +   /* Unpack/convert framebuffer colors from four 32-bit packed colors +    * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA). +    * Each 8-bit color component is expanded into a float in [0.0, 1.0]. +    */ +   { +      int mask_reg = spe_allocate_available_register(f); + +      /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */ +      spe_load_int(f, mask_reg, 0xff); + +      /* XXX there may be more clever ways to implement the following code */ +      switch (color_format) { +      case PIPE_FORMAT_A8R8G8B8_UNORM: +         /* fbB = fbB & mask */ +         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg); +         /* mask = mask << 8 */ +         spe_roti(f, mask_reg, mask_reg, 8); + +         /* fbG = fbRGBA & mask */ +         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg); +         /* fbG = fbG >> 8 */ +         spe_roti(f, fbG_reg, fbG_reg, -8); +         /* mask = mask << 8 */ +         spe_roti(f, mask_reg, mask_reg, 8); + +         /* fbR = fbRGBA & mask */ +         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg); +         /* fbR = fbR >> 16 */ +         spe_roti(f, fbR_reg, fbR_reg, -16); +         /* mask = mask << 8 */ +         spe_roti(f, mask_reg, mask_reg, 8); + +         /* fbA = fbRGBA & mask */ +         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg); +         /* fbA = fbA >> 24 */ +         spe_roti(f, fbA_reg, fbA_reg, -24); +         break; + +      case PIPE_FORMAT_B8G8R8A8_UNORM: +         /* fbA = fbA & mask */ +         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg); +         /* mask = mask << 8 */ +         spe_roti(f, 
mask_reg, mask_reg, 8); + +         /* fbR = fbRGBA & mask */ +         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg); +         /* fbR = fbR >> 8 */ +         spe_roti(f, fbR_reg, fbR_reg, -8); +         /* mask = mask << 8 */ +         spe_roti(f, mask_reg, mask_reg, 8); + +         /* fbG = fbRGBA & mask */ +         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg); +         /* fbG = fbG >> 16 */ +         spe_roti(f, fbG_reg, fbG_reg, -16); +         /* mask = mask << 8 */ +         spe_roti(f, mask_reg, mask_reg, 8); + +         /* fbB = fbRGBA & mask */ +         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg); +         /* fbB = fbB >> 24 */ +         spe_roti(f, fbB_reg, fbB_reg, -24); +         break; + +      default: +         ASSERT(0); +      } + +      /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */ +      spe_cuflt(f, fbR_reg, fbR_reg, 8); +      spe_cuflt(f, fbG_reg, fbG_reg, 8); +      spe_cuflt(f, fbB_reg, fbB_reg, 8); +      spe_cuflt(f, fbA_reg, fbA_reg, 8); + +      spe_release_register(f, mask_reg); +   } + + +   /* +    * Compute Src RGB terms +    */ +   switch (blend->rgb_src_factor) { +   case PIPE_BLENDFACTOR_ONE: +      spe_move(f, term1R_reg, fragR_reg); +      spe_move(f, term1G_reg, fragG_reg); +      spe_move(f, term1B_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_ZERO: +      spe_zero(f, term1R_reg); +      spe_zero(f, term1G_reg); +      spe_zero(f, term1B_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_COLOR: +      spe_fm(f, term1R_reg, fragR_reg, fragR_reg); +      spe_fm(f, term1G_reg, fragG_reg, fragG_reg); +      spe_fm(f, term1B_reg, fragB_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      spe_fm(f, term1R_reg, fragR_reg, fragA_reg); +      spe_fm(f, term1G_reg, fragG_reg, fragA_reg); +      spe_fm(f, term1B_reg, fragB_reg, fragA_reg); +      break; +      /* XXX more cases */ +   default: +      ASSERT(0); +   } + +   /* +    * Compute Src Alpha term +    */ +   switch 
(blend->alpha_src_factor) { +   case PIPE_BLENDFACTOR_ONE: +      spe_move(f, term1A_reg, fragA_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_COLOR: +      spe_fm(f, term1A_reg, fragA_reg, fragA_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      spe_fm(f, term1A_reg, fragA_reg, fragA_reg); +      break; +      /* XXX more cases */ +   default: +      ASSERT(0); +   } + +   /* +    * Compute Dest RGB terms +    */ +   switch (blend->rgb_dst_factor) { +   case PIPE_BLENDFACTOR_ONE: +      spe_move(f, term2R_reg, fbR_reg); +      spe_move(f, term2G_reg, fbG_reg); +      spe_move(f, term2B_reg, fbB_reg); +      break; +   case PIPE_BLENDFACTOR_ZERO: +      spe_zero(f, term2R_reg); +      spe_zero(f, term2G_reg); +      spe_zero(f, term2B_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_COLOR: +      spe_fm(f, term2R_reg, fbR_reg, fragR_reg); +      spe_fm(f, term2G_reg, fbG_reg, fragG_reg); +      spe_fm(f, term2B_reg, fbB_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      spe_fm(f, term2R_reg, fbR_reg, fragA_reg); +      spe_fm(f, term2G_reg, fbG_reg, fragA_reg); +      spe_fm(f, term2B_reg, fbB_reg, fragA_reg); +      break; +   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +      /* one = {1.0, 1.0, 1.0, 1.0} */ +      if (!one_reg_set) { +         spe_load_float(f, one_reg, 1.0f); +         one_reg_set = true; +      } +      /* tmp = one - fragA */ +      spe_fs(f, tmp_reg, one_reg, fragA_reg); +      /* term = fb * tmp */ +      spe_fm(f, term2R_reg, fbR_reg, tmp_reg); +      spe_fm(f, term2G_reg, fbG_reg, tmp_reg); +      spe_fm(f, term2B_reg, fbB_reg, tmp_reg); +      break; +      /* XXX more cases */ +   default: +      ASSERT(0); +   } + +   /* +    * Compute Dest Alpha term +    */ +   switch (blend->alpha_dst_factor) { +   case PIPE_BLENDFACTOR_ONE: +      spe_move(f, term2A_reg, fbA_reg); +      break; +   case PIPE_BLENDFACTOR_ZERO: +      spe_zero(f, term2A_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      
spe_fm(f, term2A_reg, fbA_reg, fragA_reg); +      break; +   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +      /* one = {1.0, 1.0, 1.0, 1.0} */ +      if (!one_reg_set) { +         spe_load_float(f, one_reg, 1.0f); +         one_reg_set = true; +      } +      /* tmp = one - fragA */ +      spe_fs(f, tmp_reg, one_reg, fragA_reg); +      /* termA = fbA * tmp */ +      spe_fm(f, term2A_reg, fbA_reg, tmp_reg); +      break; +      /* XXX more cases */ +   default: +      ASSERT(0); +   } + +   /* +    * Combine Src/Dest RGB terms +    */ +   switch (blend->rgb_func) { +   case PIPE_BLEND_ADD: +      spe_fa(f, fragR_reg, term1R_reg, term2R_reg); +      spe_fa(f, fragG_reg, term1G_reg, term2G_reg); +      spe_fa(f, fragB_reg, term1B_reg, term2B_reg); +      break; +   case PIPE_BLEND_SUBTRACT: +      spe_fs(f, fragR_reg, term1R_reg, term2R_reg); +      spe_fs(f, fragG_reg, term1G_reg, term2G_reg); +      spe_fs(f, fragB_reg, term1B_reg, term2B_reg); +      break; +      /* XXX more cases */ +   default: +      ASSERT(0); +   } + +   /* +    * Combine Src/Dest A term +    */ +   switch (blend->alpha_func) { +   case PIPE_BLEND_ADD: +      spe_fa(f, fragA_reg, term1A_reg, term2A_reg); +      break; +   case PIPE_BLEND_SUBTRACT: +      spe_fs(f, fragA_reg, term1A_reg, term2A_reg); +      break; +      /* XXX more cases */ +   default: +      ASSERT(0); +   } + +   spe_release_register(f, term1R_reg); +   spe_release_register(f, term1G_reg); +   spe_release_register(f, term1B_reg); +   spe_release_register(f, term1A_reg); + +   spe_release_register(f, term2R_reg); +   spe_release_register(f, term2G_reg); +   spe_release_register(f, term2B_reg); +   spe_release_register(f, term2A_reg); + +   spe_release_register(f, fbR_reg); +   spe_release_register(f, fbG_reg); +   spe_release_register(f, fbB_reg); +   spe_release_register(f, fbA_reg); + +   spe_release_register(f, one_reg); +   spe_release_register(f, tmp_reg); +} + + +static void +gen_logicop(const struct pipe_blend_state 
*blend, +            struct spe_function *f, +            int fragRGBA_reg, int fbRGBA_reg) +{ +   /* XXX to-do */ +   /* operate on 32-bit packed pixels, not float colors */ +} + + +static void +gen_colormask(uint colormask, +              struct spe_function *f, +              int fragRGBA_reg, int fbRGBA_reg) +{ +   /* XXX to-do */ +   /* operate on 32-bit packed pixels, not float colors */ +} + + + +/** + * Generate code to pack a quad of float colors into four 32-bit integers. + * + * \param f             SPE function to append instructions onto. + * \param color_format  the dest color packing format + * \param r_reg         register containing four red values (in/clobbered) + * \param g_reg         register containing four green values (in/clobbered) + * \param b_reg         register containing four blue values (in/clobbered) + * \param a_reg         register containing four alpha values (in/clobbered) + * \param rgba_reg      register to store the packed RGBA colors (out) + */ +static void +gen_pack_colors(struct spe_function *f, +                enum pipe_format color_format, +                int r_reg, int g_reg, int b_reg, int a_reg, +                int rgba_reg) +{ +   /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ +   spe_cfltu(f, r_reg, r_reg, 32); +   spe_cfltu(f, g_reg, g_reg, 32); +   spe_cfltu(f, b_reg, b_reg, 32); +   spe_cfltu(f, a_reg, a_reg, 32); + +   /* Shift the most significant bytes to the least significant positions. 
+    * I.e.: reg = reg >> 24 +    */ +   spe_rotmi(f, r_reg, r_reg, -24); +   spe_rotmi(f, g_reg, g_reg, -24); +   spe_rotmi(f, b_reg, b_reg, -24); +   spe_rotmi(f, a_reg, a_reg, -24); + +   /* Shift the color bytes according to the surface format */ +   if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) { +      spe_roti(f, g_reg, g_reg, 8);   /* green <<= 8 */ +      spe_roti(f, r_reg, r_reg, 16);  /* red <<= 16 */ +      spe_roti(f, a_reg, a_reg, 24);  /* alpha <<= 24 */ +   } +   else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) { +      spe_roti(f, r_reg, r_reg, 8);   /* red <<= 8 */ +      spe_roti(f, g_reg, g_reg, 16);  /* green <<= 16 */ +      spe_roti(f, b_reg, b_reg, 24);  /* blue <<= 24 */ +   } +   else { +      ASSERT(0); +   } + +   /* Merge red, green, blue, alpha registers to make packed RGBA colors. +    * Eg: after shifting according to color_format we might have: +    *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000} +    *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600} +    *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099} +    *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000} +    * OR-ing all those together gives us four packed colors: +    *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} +    */ +   spe_or(f, rgba_reg, r_reg, g_reg); +   spe_or(f, rgba_reg, rgba_reg, b_reg); +   spe_or(f, rgba_reg, rgba_reg, a_reg); +} + + + + +/** + * Generate SPE code to implement the fragment operations (alpha test, + * depth test, stencil test, blending, colormask, and final + * framebuffer write) as specified by the current context state. + * + * Logically, this code will be called after running the fragment + * shader.  But under some circumstances we could run some of this + * code before the fragment shader to cull fragments/quads that are + * totally occluded/discarded. + * + * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now. 
+ * + * See the spu_default_fragment_ops() function to see how the per-fragment + * operations would be done with ordinary C code. + * The code we generate here though has no branches, is SIMD, etc and + * should be much faster. + * + * \param cell  the rendering context (in) + * \param f     the generated function (out) + */ +void +cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) +{ +   const struct pipe_depth_stencil_alpha_state *dsa = +      &cell->depth_stencil->base; +   const struct pipe_blend_state *blend = &cell->blend->base; +   const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; + +   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ +   const int x_reg = 3;  /* uint */ +   const int y_reg = 4;  /* uint */ +   const int color_tile_reg = 5;  /* tile_t * */ +   const int depth_tile_reg = 6;  /* tile_t * */ +   const int fragZ_reg = 7;   /* vector float */ +   const int fragR_reg = 8;   /* vector float */ +   const int fragG_reg = 9;   /* vector float */ +   const int fragB_reg = 10;  /* vector float */ +   const int fragA_reg = 11;  /* vector float */ +   const int mask_reg = 12;   /* vector uint */ + +   /* offset of quad from start of tile +    * XXX assuming 4-byte pixels for color AND Z/stencil!!!! 
+    */ +   int quad_offset_reg; + +   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */ +   int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */ + +   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); +   spe_allocate_register(f, x_reg); +   spe_allocate_register(f, y_reg); +   spe_allocate_register(f, color_tile_reg); +   spe_allocate_register(f, depth_tile_reg); +   spe_allocate_register(f, fragZ_reg); +   spe_allocate_register(f, fragR_reg); +   spe_allocate_register(f, fragG_reg); +   spe_allocate_register(f, fragB_reg); +   spe_allocate_register(f, fragA_reg); +   spe_allocate_register(f, mask_reg); + +   quad_offset_reg = spe_allocate_available_register(f); +   fbRGBA_reg = spe_allocate_available_register(f); +   fbZS_reg = spe_allocate_available_register(f); + +   /* compute offset of quad from start of tile, in bytes */ +   { +      int x2_reg = spe_allocate_available_register(f); +      int y2_reg = spe_allocate_available_register(f); + +      ASSERT(TILE_SIZE == 32); + +      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */ +      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */ +      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */ +      spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */ +      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */ + +      spe_release_register(f, x2_reg); +      spe_release_register(f, y2_reg); +   } + + +   if (dsa->alpha.enabled) { +      gen_alpha_test(dsa, f, mask_reg, fragA_reg); +   } + +   if (dsa->depth.enabled || dsa->stencil[0].enabled) { +      const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; +      boolean write_depth_stencil; + +      int fbZ_reg = spe_allocate_available_register(f); /* Z values */ +      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */ + +      /* fetch quad of depth/stencil values from tile at (x,y) */ +      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ 
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + +      if (dsa->depth.enabled) { +         /* Extract Z bits from fbZS_reg into fbZ_reg */ +         if (zs_format == PIPE_FORMAT_S8Z24_UNORM || +             zs_format == PIPE_FORMAT_X8Z24_UNORM) { +            int mask_reg = spe_allocate_available_register(f); +            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */ +            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */ +            spe_release_register(f, mask_reg); +            /* OK, fbZ_reg has four 24-bit Z values now */ +         } +         else { +            /* XXX handle other z/stencil formats */ +            ASSERT(0); +         } + +         /* Convert fragZ values from float[4] to uint[4] */ +         if (zs_format == PIPE_FORMAT_S8Z24_UNORM || +             zs_format == PIPE_FORMAT_X8Z24_UNORM || +             zs_format == PIPE_FORMAT_Z24S8_UNORM || +             zs_format == PIPE_FORMAT_Z24X8_UNORM) { +            /* 24-bit Z values */ +            int scale_reg = spe_allocate_available_register(f); + +            /* scale_reg[0,1,2,3] = float(2^24-1) */ +            spe_load_float(f, scale_reg, (float) 0xffffff); + +            /* XXX these two instructions might be combined */ +            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */ +            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);  /* fragZ = (int) fragZ */ + +            spe_release_register(f, scale_reg); +         } +         else { +            /* XXX handle 16-bit Z format */ +            ASSERT(0); +         } +      } + +      if (dsa->stencil[0].enabled) { +         /* Extract Stencil bits from fbZS_reg into fbS_reg */ +         if (zs_format == PIPE_FORMAT_S8Z24_UNORM || +             zs_format == PIPE_FORMAT_X8Z24_UNORM) { +            /* XXX extract with a shift */ +            ASSERT(0); +         } +         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || +                  zs_format == 
PIPE_FORMAT_Z24X8_UNORM) { +            /* XXX extract with a mask */ +            ASSERT(0); +         } +      } + + +      if (dsa->stencil[0].enabled) { +         /* XXX this may involve depth testing too */ +         // gen_stencil_test(dsa, f, ... ); +         ASSERT(0); +      } +      else if (dsa->depth.enabled) { +         int zmask_reg = spe_allocate_available_register(f); +         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg); +         spe_release_register(f, zmask_reg); +      } + +      /* do we need to write Z and/or Stencil back into framebuffer? */ +      write_depth_stencil = (dsa->depth.writemask | +                             dsa->stencil[0].write_mask | +                             dsa->stencil[1].write_mask); + +      if (write_depth_stencil) { +         /* Merge latest Z and Stencil values into fbZS_reg. +          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0]. +          * fbS_reg has four 8-bit stencil values in bits [7..0]. +          */ +         if (zs_format == PIPE_FORMAT_S8Z24_UNORM || +             zs_format == PIPE_FORMAT_X8Z24_UNORM) { +            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ +            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ +         } +         else if (zs_format == PIPE_FORMAT_S8Z24_UNORM || +                  zs_format == PIPE_FORMAT_X8Z24_UNORM) { +            /* XXX to do.  NOTE(review): this test repeats the S8Z24/X8Z24 test of the branch above, so this branch is unreachable; presumably Z24S8/Z24X8 was intended - confirm */ +            ASSERT(0); +         } +         else if (zs_format == PIPE_FORMAT_Z16_UNORM) { +            /* XXX to do */ +            ASSERT(0); +         } +         else if (zs_format == PIPE_FORMAT_S8_UNORM) { +            /* XXX to do */ +            ASSERT(0); +         } +         else { +            /* bad zs_format */ +            ASSERT(0); +         } + +         /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ +         spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); +      } + +      spe_release_register(f, fbZ_reg); +      
spe_release_register(f, fbS_reg); +   } + + +   /* Get framebuffer quad/colors.  We'll need these for blending, +    * color masking, and to obey the quad/pixel mask. +    * Load: fbRGBA_reg = memory[color_tile + quad_offset] +    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking +    * we could skip this load. +    */ +   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg); + + +   if (blend->blend_enable) { +      gen_blend(blend, f, color_format, +                fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); +   } + +   /* +    * Write fragment colors to framebuffer/tile. +    * This involves converting the fragment colors from float[4] to the +    * tile's specific format and obeying the quad/pixel mask. +    */ +   { +      int rgba_reg = spe_allocate_available_register(f); + +      /* Pack four float colors as four 32-bit int colors */ +      gen_pack_colors(f, color_format, +                      fragR_reg, fragG_reg, fragB_reg, fragA_reg, +                      rgba_reg); + +      if (blend->logicop_enable) { +         gen_logicop(blend, f, rgba_reg, fbRGBA_reg); +      } + +      if (blend->colormask != 0xf) { +         gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg); +      } + + +      /* Mix fragment colors with framebuffer colors using the quad/pixel mask: +       * if (mask[i]) +       *    rgba[i] = rgba[i]; +       * else +       *    rgba[i] = framebuffer[i]; +       */ +      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg); + +      /* Store updated quad in tile: +       * memory[color_tile + quad_offset] = rgba_reg; +       */ +      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg); + +      spe_release_register(f, rgba_reg); +   } + +   //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst); + +   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */ + + +   spe_release_register(f, fbRGBA_reg); +   spe_release_register(f, fbZS_reg); +   spe_release_register(f, 
quad_offset_reg); +} + diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h new file mode 100644 index 0000000000..b59de198dc --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h @@ -0,0 +1,38 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#ifndef CELL_GEN_FRAGMENT_H +#define CELL_GEN_FRAGMENT_H + + +extern void +cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f); + + +#endif /* CELL_GEN_FRAGMENT_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index f2feaa329a..2da3097983 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -27,6 +27,7 @@  #include "util/u_memory.h"  #include "cell_context.h" +#include "cell_gen_fragment.h"  #include "cell_state.h"  #include "cell_state_emit.h"  #include "cell_state_per_fragment.h" @@ -54,23 +55,6 @@ emit_state_cmd(struct cell_context *cell, uint cmd,  void  cell_emit_state(struct cell_context *cell)  { -   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) { -      struct cell_command_logicop logicop; - -      if (cell->logic_op.store != NULL) { -	 spe_release_func(& cell->logic_op); -      } - -      cell_generate_logic_op(& cell->logic_op, -			     & cell->blend->base, -			     cell->framebuffer.cbufs[0]); - -      logicop.base = (intptr_t) cell->logic_op.store; -      logicop.size = 64 * 4; -      emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop, -		     sizeof(logicop)); -   } -     if (cell->dirty & CELL_NEW_FRAMEBUFFER) {        struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];        struct pipe_surface *zbuf = cell->framebuffer.zsbuf; @@ -83,44 +67,49 @@ cell_emit_state(struct cell_context *cell)        fb->depth_format = zbuf ? 
zbuf->format : PIPE_FORMAT_NONE;        fb->width = cell->framebuffer.width;        fb->height = cell->framebuffer.height; +#if 0 +      printf("EMIT color format %s\n", pf_name(fb->color_format)); +      printf("EMIT depth format %s\n", pf_name(fb->depth_format)); +#endif     } -   if (cell->dirty & CELL_NEW_BLEND) { -      struct cell_command_blend blend; - -      if (cell->blend != NULL) { -         blend.base = (intptr_t) cell->blend->code.store; -         blend.size = (char *) cell->blend->code.csr -             - (char *) cell->blend->code.store; -         blend.read_fb = TRUE; -      } -      else { -         blend.base = 0; -         blend.size = 0; -         blend.read_fb = FALSE; +   if (cell->dirty & (CELL_NEW_FS)) { +      /* Send new fragment program to SPUs */ +      struct cell_command_fragment_program *fp +            = cell_batch_alloc(cell, sizeof(*fp)); +      fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM; +      fp->num_inst = cell->fs->code.num_inst; +      memcpy(&fp->code, cell->fs->code.store, +             SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); +      if (0) { +         int i; +         printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n"); +         for (i = 0; i < fp->num_inst; i++) { +            printf(" %3d: 0x%08x\n", i, fp->code[i]); +         }        } - -      emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend));     } -   if (cell->dirty & CELL_NEW_DEPTH_STENCIL) { -      struct cell_command_depth_stencil_alpha_test dsat; +   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | +                      CELL_NEW_DEPTH_STENCIL | +                      CELL_NEW_BLEND)) { +      /* XXX we don't want to always do codegen here.  We should have +       * a hash/lookup table to cache previous results... 
+       */ +      struct cell_command_fragment_ops *fops +            = cell_batch_alloc(cell, sizeof(*fops)); +      struct spe_function spe_code; -      if (cell->depth_stencil != NULL) { -	 dsat.base = (intptr_t) cell->depth_stencil->code.store; -	 dsat.size = (char *) cell->depth_stencil->code.csr -	     - (char *) cell->depth_stencil->code.store; -	 dsat.read_depth = TRUE; -	 dsat.read_stencil = FALSE; -      } -      else { -	 dsat.base = 0; -	 dsat.size = 0; -	 dsat.read_depth = FALSE; -	 dsat.read_stencil = FALSE; -      } - -      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat, sizeof(dsat)); +      /* generate new code */ +      cell_gen_fragment_function(cell, &spe_code); +      /* put the new code into the batch buffer */ +      fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; +      memcpy(&fops->code, spe_code.store, +             SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); +      fops->dsa = cell->depth_stencil->base; +      fops->blend = cell->blend->base; +      /* free codegen buffer */ +      spe_release_func(&spe_code);     }     if (cell->dirty & CELL_NEW_SAMPLER) { @@ -160,7 +149,8 @@ cell_emit_state(struct cell_context *cell)        emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,                       &cell->vertex_info, sizeof(struct vertex_info));     } -    + +#if 0     if (cell->dirty & CELL_NEW_VS) {        const struct draw_context *const draw = cell->draw;        struct cell_shader_info info; @@ -175,4 +165,5 @@ cell_emit_state(struct cell_context *cell)        emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info));     } +#endif  } diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c index 53ae3aa50e..78cb446c14 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c @@ -132,9 +132,9 @@ emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,  /** + * Generate code to 
perform Z testing.  Four Z values are tested at once.   * \param dsa        Current depth-test state   * \param f          Function to which code should be appended - * \param m          Mask of allocated / free SPE registers   * \param mask       Index of register to contain depth-pass mask   * \param stored     Index of register containing values from depth buffer   * \param calculated Index of register containing per-fragment depth values @@ -198,6 +198,7 @@ emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,  /** + * Generate code to apply the stencil operation (after testing).   * \note Emits a maximum of 5 instructions.   *   * \warning @@ -222,9 +223,13 @@ emit_stencil_op(struct spe_function *f,        spe_il(f, result, ref);        break;     case PIPE_STENCIL_OP_INCR: +      /* clamp = [0xff, 0xff, 0xff, 0xff] */        spe_il(f, clamp, 0x0ff); +      /* result[i] = in[i] + 1 */        spe_ai(f, result, in, 1); +      /* clamp_mask[i] = (result[i] > 0xff) */        spe_clgti(f, clamp_mask, result, 0x0ff); +      /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */        spe_selb(f, result, result, clamp, clamp_mask);        break;     case PIPE_STENCIL_OP_DECR: @@ -259,10 +264,10 @@ emit_stencil_op(struct spe_function *f,  /** + * Generate code to do stencil test.  Four pixels are tested at once.   
* \param dsa        Depth / stencil test state   * \param face       0 for front face, 1 for back face   * \param f          Function to append instructions to - * \param reg_mask   Mask of allocated registers   * \param mask       Register containing mask of fragments passing the   *                   alpha test   * \param depth_mask Register containing mask of fragments passing the @@ -310,13 +315,14 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,     switch (dsa->stencil[face].func) {     case PIPE_FUNC_NEVER: -      spe_il(f, stencil_mask, 0); +      spe_il(f, stencil_mask, 0);   /* stencil_mask[0..3] = [0,0,0,0] */        break;     case PIPE_FUNC_NOTEQUAL:        complement = TRUE;        /* FALLTHROUGH */     case PIPE_FUNC_EQUAL: +      /* stencil_mask[i] = (stored[i] == ref) */        spe_ceqi(f, stencil_mask, stored, ref);        break; @@ -324,6 +330,8 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,        complement = TRUE;        /* FALLTHROUGH */     case PIPE_FUNC_GREATER: +      complement = TRUE; +      /* stencil_mask[i] = (stored[i] > ref) */        spe_clgti(f, stencil_mask, stored, ref);        break; @@ -331,8 +339,11 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,        complement = TRUE;        /* FALLTHROUGH */     case PIPE_FUNC_GEQUAL: +      /* stencil_mask[i] = (stored[i] > ref) */        spe_clgti(f, stencil_mask, stored, ref); +      /* tmp[i] = (stored[i] == ref) */        spe_ceqi(f, tmp, stored, ref); +      /* stencil_mask[i] = stencil_mask[i] | tmp[i] */        spe_or(f, stencil_mask, stencil_mask, tmp);        break; @@ -461,7 +472,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)      * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round      * up to 64 to make it a happy power-of-two.      */ -   spe_init_func(f, 4 * 64); +   spe_init_func(f, SPE_INST_SIZE * 64);     /* Allocate registers for the function's input parameters. 
 Cleverly (and @@ -540,7 +551,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)           spe_selb(f, depth, depth, zvals, mask);     } -   spe_bi(f, 0, 0, 0); +   spe_bi(f, 0, 0, 0);  /* return from function call */  #if 0 @@ -956,7 +967,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)      * + 4 (fragment mask) + 1 (return) = 55 instlructions.  Round up to 64 to      * make it a happy power-of-two.      */ -   spe_init_func(f, 4 * 64); +   spe_init_func(f, SPE_INST_SIZE * 64);     const int frag[4] = { @@ -1144,9 +1155,10 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)  } -int PC_OFFSET(const struct spe_function *f, const void *d) +static int +PC_OFFSET(const struct spe_function *f, const void *d)  { -   const intptr_t pc = (intptr_t) f->csr; +   const intptr_t pc = (intptr_t) &f->store[f->num_inst];     const intptr_t ea = ~0x0f & (intptr_t) d;     return (ea - pc) >> 2; @@ -1178,7 +1190,7 @@ cell_generate_logic_op(struct spe_function *f,      * bytes (equiv. to 8 instructions) are needed for data storage.  Round up      * to 64 to make it a happy power-of-two.      */ -   spe_init_func(f, 4 * 64); +   spe_init_func(f, SPE_INST_SIZE * 64);     /* Pixel colors in framebuffer format in AoS layout. 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c index 97e44eeb1a..3a0d066da2 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_shader.c +++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c @@ -34,7 +34,7 @@  #include "cell_context.h"  #include "cell_state.h" - +#include "cell_gen_fp.h"  /** cast wrapper */ @@ -61,7 +61,7 @@ static void *  cell_create_fs_state(struct pipe_context *pipe,                       const struct pipe_shader_state *templ)  { -   /*struct cell_context *cell = cell_context(pipe);*/ +   struct cell_context *cell = cell_context(pipe);     struct cell_fragment_shader_state *cfs;     cfs = CALLOC_STRUCT(cell_fragment_shader_state); @@ -76,6 +76,8 @@ cell_create_fs_state(struct pipe_context *pipe,     tgsi_scan_shader(templ->tokens, &cfs->info); +   cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code); +     return cfs;  } @@ -102,6 +104,8 @@ cell_delete_fs_state(struct pipe_context *pipe, void *fs)  {     struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs); +   spe_release_func(&cfs->code); +     FREE((void *) cfs->shader.tokens);     FREE(cfs);  } diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c index 2ece0250f6..566df7f59e 100644 --- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c +++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c @@ -297,10 +297,9 @@ void cell_update_vertex_fetch(struct draw_context *draw)     /* Each fetch function can be a maximum of 34 instructions (note: this is -    * actually a slight over-estimate).  That means (34 * 4) = 136 bytes -    * each maximum. +    * actually a slight over-estimate).      */ -   spe_init_func(p, 136 * unique_attr_formats); +   spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats);     /* Allocate registers for the function's input parameters. 
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index d49abb2e82..1ae0dfb8c1 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -22,12 +22,15 @@ SOURCES = \  	spu_render.c \  	spu_texture.c \  	spu_tile.c \ -	spu_tri.c \ +	spu_tri.c + +OLD_SOURCES = \  	spu_exec.c \  	spu_util.c \  	spu_vertex_fetch.c \  	spu_vertex_shader.c +  SPU_OBJECTS = $(SOURCES:.c=.o) \  SPU_ASM_OUT = $(SOURCES:.c=.s) \ @@ -43,7 +46,7 @@ INCLUDE_DIRS = \  	$(SPU_CC) $(SPU_CFLAGS) -c $<  .c.s: -	$(SPU_CC) $(SPU_CFLAGS) -S $< +	$(SPU_CC) $(SPU_CFLAGS) -O3 -S $<  # The .a file will be linked into the main/PPU executable diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h index e9fee8a3a6..fd8dc6ded3 100644 --- a/src/gallium/drivers/cell/spu/spu_colorpack.h +++ b/src/gallium/drivers/cell/spu/spu_colorpack.h @@ -79,14 +79,14 @@ spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)  static INLINE vector float -spu_unpack_color(uint color) +spu_unpack_B8G8R8A8(uint color)  {     vector unsigned int color_u4 = spu_splats(color);     color_u4 = spu_shuffle(color_u4, color_u4,                            ((vector unsigned char) { -                             0, 0, 0, 0, -                             5, 5, 5, 5,                               10, 10, 10, 10, +                             5, 5, 5, 5, +                             0, 0, 0, 0,                               15, 15, 15, 15}) );     return spu_convtf(color_u4, 32);  } diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index d223f32d94..78260c4259 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -34,6 +34,7 @@  #include "spu_main.h"  #include "spu_render.h" +#include "spu_per_fragment_op.h"  #include "spu_texture.h"  #include "spu_tile.h"  //#include "spu_test.h" @@ -46,7 +47,7 @@  /*  
helpful headers:  /usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h -/opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h +/opt/cell/sdk/usr/include/libmisc.h  */  boolean Debug = FALSE; @@ -62,14 +63,6 @@ struct spu_vs_context draw;  static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]      ALIGN16_ATTRIB; -static unsigned char depth_stencil_code_buffer[4 * 64] -    ALIGN16_ATTRIB; - -static unsigned char fb_blend_code_buffer[4 * 64] -    ALIGN16_ATTRIB; - -static unsigned char logicop_code_buffer[4 * 64] -    ALIGN16_ATTRIB;  /** @@ -226,6 +219,46 @@ cmd_release_verts(const struct cell_command_release_verts *release)  } +/** + * Process a CELL_CMD_STATE_FRAGMENT_OPS command. + * This involves installing new fragment ops SPU code. + * If this function is never called, we'll use a regular C fallback function + * for fragment processing. + */ +static void +cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) +{ +   if (Debug) +      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); +   /* Copy SPU code from batch buffer to spu buffer */ +   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); +   /* Copy state info (for fallback case only) */ +   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); +   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); + +   /* Point function pointer at new code */ +   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; + +   spu.read_depth = spu.depth_stencil_alpha.depth.enabled; +   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; +} + + +static void +cmd_state_fragment_program(const struct cell_command_fragment_program *fp) +{ +   if (Debug) +      printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id); +   /* Copy SPU code from batch buffer to spu buffer */ +   memcpy(spu.fragment_program_code, fp->code, +          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); +#if 01 +   /* Point function pointer at new code */ +   
spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; +#endif +} + +  static void  cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)  { @@ -252,102 +285,24 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)     switch (spu.fb.depth_format) {     case PIPE_FORMAT_Z32_UNORM: +      spu.fb.zsize = 4; +      spu.fb.zscale = (float) 0xffffffffu; +      break;     case PIPE_FORMAT_Z24S8_UNORM:     case PIPE_FORMAT_S8Z24_UNORM: +   case PIPE_FORMAT_Z24X8_UNORM: +   case PIPE_FORMAT_X8Z24_UNORM:        spu.fb.zsize = 4; +      spu.fb.zscale = (float) 0x00ffffffu;        break;     case PIPE_FORMAT_Z16_UNORM:        spu.fb.zsize = 2; +      spu.fb.zscale = (float) 0xffffu;        break;     default:        spu.fb.zsize = 0;        break;     } - -   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM) -      spu.color_shuffle = ((vector unsigned char) { -                              12, 0, 4, 8, 0, 0, 0, 0,  -                              0, 0, 0, 0, 0, 0, 0, 0}); -   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM) -      spu.color_shuffle = ((vector unsigned char) { -                              8, 4, 0, 12, 0, 0, 0, 0,  -                              0, 0, 0, 0, 0, 0, 0, 0}); -   else -      ASSERT(0); -} - - -static void -cmd_state_blend(const struct cell_command_blend *state) -{ -   if (Debug) -      printf("SPU %u: BLEND: enabled %d\n", -             spu.init.id, -             (state->size != 0)); - -   ASSERT_ALIGN16(state->base); - -   if (state->size != 0) { -      mfc_get(fb_blend_code_buffer, -              (unsigned int) state->base,  /* src */ -              ROUNDUP16(state->size), -              TAG_BATCH_BUFFER, -              0, /* tid */ -              0  /* rid */); -      wait_on_mask(1 << TAG_BATCH_BUFFER); -      spu.blend = (blend_func) fb_blend_code_buffer; -      spu.read_fb = state->read_fb; -   } else { -      spu.read_fb = FALSE; -   } -} - - -static void 
-cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state) -{ -   if (Debug) -      printf("SPU %u: DEPTH_STENCIL: ztest %d\n", -             spu.init.id, -             state->read_depth); - -   ASSERT_ALIGN16(state->base); - -   if (state->size != 0) { -      mfc_get(depth_stencil_code_buffer, -	      (unsigned int) state->base,  /* src */ -	      ROUNDUP16(state->size), -	      TAG_BATCH_BUFFER, -	      0, /* tid */ -	      0  /* rid */); -      wait_on_mask(1 << TAG_BATCH_BUFFER); -   } else { -      /* If there is no code, emit a return instruction. -       */ -      depth_stencil_code_buffer[0] = 0x35; -      depth_stencil_code_buffer[1] = 0x00; -      depth_stencil_code_buffer[2] = 0x00; -      depth_stencil_code_buffer[3] = 0x00; -   } - -   spu.frag_test = (frag_test_func) depth_stencil_code_buffer; -   spu.read_depth = state->read_depth; -   spu.read_stencil = state->read_stencil; -} - - -static void -cmd_state_logicop(const struct cell_command_logicop * code) -{ -   mfc_get(logicop_code_buffer, -           (unsigned int) code->base,  /* src */ -           code->size, -           TAG_BATCH_BUFFER, -           0, /* tid */ -           0  /* rid */); -   wait_on_mask(1 << TAG_BATCH_BUFFER); - -   spu.logicop = (logicop_func) logicop_code_buffer;  } @@ -450,7 +405,9 @@ cmd_finish(void)  /** - * Execute a batch of commands + * Execute a batch of commands which was sent to us by the PPU. + * See the cell_emit_state.c code to see where the commands come from. + *   * The opcode param encodes the location of the buffer and its size.   
*/  static void @@ -487,16 +444,14 @@ cmd_batch(uint opcode)        printf("SPU %u: release batch buf %u\n", spu.init.id, buf);     release_buffer(buf); +   /* +    * Loop over commands in the batch buffer +    */     for (pos = 0; pos < usize; /* no incr */) {        switch (buffer[pos]) { -      case CELL_CMD_STATE_FRAMEBUFFER: -         { -            struct cell_command_framebuffer *fb -               = (struct cell_command_framebuffer *) &buffer[pos]; -            cmd_state_framebuffer(fb); -            pos += sizeof(*fb) / 8; -         } -         break; +      /* +       * rendering commands +       */        case CELL_CMD_CLEAR_SURFACE:           {              struct cell_command_clear_surface *clr @@ -514,26 +469,32 @@ cmd_batch(uint opcode)              pos += pos_incr;           }           break; -      case CELL_CMD_RELEASE_VERTS: +      /* +       * state-update commands +       */ +      case CELL_CMD_STATE_FRAMEBUFFER:           { -            struct cell_command_release_verts *release -               = (struct cell_command_release_verts *) &buffer[pos]; -            cmd_release_verts(release); -            pos += sizeof(*release) / 8; +            struct cell_command_framebuffer *fb +               = (struct cell_command_framebuffer *) &buffer[pos]; +            cmd_state_framebuffer(fb); +            pos += sizeof(*fb) / 8;           }           break; -      case CELL_CMD_FINISH: -         cmd_finish(); -         pos += 1; -         break; -      case CELL_CMD_STATE_BLEND: -         cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]); -         pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8); +      case CELL_CMD_STATE_FRAGMENT_OPS: +         { +            struct cell_command_fragment_ops *fops +               = (struct cell_command_fragment_ops *) &buffer[pos]; +            cmd_state_fragment_ops(fops); +            pos += sizeof(*fops) / 8; +         }           break; -      case CELL_CMD_STATE_DEPTH_STENCIL: -         
cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *) -                                 &buffer[pos+1]); -         pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8); +      case CELL_CMD_STATE_FRAGMENT_PROGRAM: +         { +            struct cell_command_fragment_program *fp +               = (struct cell_command_fragment_program *) &buffer[pos]; +            cmd_state_fragment_program(fp); +            pos += sizeof(*fp) / 8; +         }           break;        case CELL_CMD_STATE_SAMPLER:           { @@ -569,8 +530,10 @@ cmd_batch(uint opcode)           pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);           break;        case CELL_CMD_STATE_BIND_VS: +#if 0           spu_bind_vertex_shader(&draw,                                  (struct cell_shader_info *) &buffer[pos+1]); +#endif           pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);           break;        case CELL_CMD_STATE_ATTRIB_FETCH: @@ -578,9 +541,20 @@ cmd_batch(uint opcode)                                  &buffer[pos+1]);           pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);           break; -      case CELL_CMD_STATE_LOGICOP: -         cmd_state_logicop((struct cell_command_logicop *) &buffer[pos+1]); -         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8); +      /* +       * misc commands +       */ +      case CELL_CMD_FINISH: +         cmd_finish(); +         pos += 1; +         break; +      case CELL_CMD_RELEASE_VERTS: +         { +            struct cell_command_release_verts *release +               = (struct cell_command_release_verts *) &buffer[pos]; +            cmd_release_verts(release); +            pos += sizeof(*release) / 8; +         }           break;        case CELL_CMD_FLUSH_BUFFER_RANGE: {  	 struct cell_buffer_range *br = (struct cell_buffer_range *) @@ -650,7 +624,9 @@ main_loop(void)           exitFlag = 1;           break;        case CELL_CMD_VS_EXECUTE: 
+#if 0           spu_execute_vertex_shader(&draw, &cmd.vs); +#endif           break;        case CELL_CMD_BATCH:           cmd_batch(opcode); @@ -675,6 +651,11 @@ one_time_init(void)     memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));     memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));     invalidate_tex_cache(); + +   /* Install default/fallback fragment processing function. +    * This will normally be overriden by a code-gen'd function. +    */ +   spu.fragment_ops = spu_fallback_fragment_ops;  } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 4879f8c9c8..2c7b625840 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -41,6 +41,10 @@  #define MAX_HEIGHT 1024 +/** + * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels. + * The data may be addressed through several different types. + */  typedef union {     ushort us[TILE_SIZE][TILE_SIZE];     uint   ui[TILE_SIZE][TILE_SIZE]; @@ -56,38 +60,29 @@ typedef union {  #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */ -struct spu_frag_test_results { -   qword mask; -   qword depth; -   qword stencil; -}; - -typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask, -    qword pixel_depth, qword pixel_stencil, qword frag_depth, -    qword frag_alpha, qword facing); - - -struct spu_blend_results { -   qword r; -   qword g; -   qword b; -   qword a; -}; +/** Function for sampling textures */ +typedef vector float (*spu_sample_texture_func)(uint unit, +                                                vector float texcoord); -typedef struct spu_blend_results (*blend_func)( -    qword frag_r, qword frag_g, qword frag_b, qword frag_a, -    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, -    qword const_r, qword const_g, qword const_b, qword const_a); +/** Function for performing per-fragment ops */ +typedef void 
(*spu_fragment_ops_func)(uint x, uint y, +                                      tile_t *colorTile, +                                      tile_t *depthStencilTile, +                                      vector float fragZ, +                                      vector float fragRed, +                                      vector float fragGreen, +                                      vector float fragBlue, +                                      vector float fragAlpha, +                                      vector unsigned int mask); -typedef struct spu_blend_results (*logicop_func)( -    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a, -    qword frag_r, qword frag_g, qword frag_b, qword frag_a, -    qword frag_mask); +/** Function for running fragment program */ +typedef void (*spu_fragment_program_func)(vector float *inputs, +                                          vector float *outputs, +                                          vector float *constants); -typedef vector float (*sample_texture_func)(uint unit, vector float texcoord); - -struct spu_framebuffer { +struct spu_framebuffer +{     void *color_start;              /**< addr of color surface in main memory */     void *depth_start;              /**< addr of depth surface in main memory */     enum pipe_format color_format; @@ -99,6 +94,7 @@ struct spu_framebuffer {     uint depth_clear_value;     uint zsize;                     /**< 0, 2 or 4 bytes per Z */ +   float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */  } ALIGN16_ATTRIB; @@ -115,35 +111,31 @@ struct spu_texture  /** - * All SPU global/context state will be in singleton object of this type: + * All SPU global/context state will be in a singleton object of this type:   */  struct spu_global  { +   /** One-time init/constant info */     struct cell_init_info init; +   /* +    * Current state +    */     struct spu_framebuffer fb; -   boolean read_depth; -   boolean read_stencil; -   frag_test_func frag_test;  /**< Current 
depth/stencil test code */ -    -   boolean read_fb;   /**< Does current blend mode require framebuffer read? */ -   blend_func blend;  /**< Current blend code */ -   qword const_blend_color[4] ALIGN16_ATTRIB; - -   logicop_func logicop;  /**< Current logicop code **/ - +   struct pipe_depth_stencil_alpha_state depth_stencil_alpha; +   struct pipe_blend_state blend;     struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];     struct spu_texture texture[PIPE_MAX_SAMPLERS]; -     struct vertex_info vertex_info; -   /* XXX more state to come */ - - -   /** current color and Z tiles */ +   /** Current color and Z tiles */     tile_t ctile ALIGN16_ATTRIB;     tile_t ztile ALIGN16_ATTRIB; +   /** Read depth/stencil tiles? */ +   boolean read_depth; +   boolean read_stencil; +     /** Current tiles' status */     ubyte cur_ctile_status, cur_ztile_status; @@ -151,11 +143,22 @@ struct spu_global     ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;     ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; +   /** Current fragment ops machine code */ +   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS]; +   /** Current fragment ops function */ +   spu_fragment_ops_func fragment_ops; + +   /** Current fragment program machine code */ +   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; +   /** Current fragment ops function */ +   spu_fragment_program_func fragment_program; -   /** for converting RGBA to PIPE_FORMAT_x colors */ -   vector unsigned char color_shuffle; +   /** Current texture sampler function */ +   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS]; -   sample_texture_func sample_texture[CELL_MAX_SAMPLERS]; +   /** Fragment program constants (XXX preliminary/used) */ +#define MAX_CONSTANTS 32 +   vector float constants[MAX_CONSTANTS];  } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index 
c0a729b3d2..03dd547845 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -1,211 +1,475 @@ -/* - * (C) Copyright IBM Corporation 2008 +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.   * All Rights Reserved.   *   * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL - * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/  /** - * \file spu_per_fragment_op.c - * SPU implementation various per-fragment operations. 
- * - * \author Ian Romanick <idr@us.ibm.com> + * \author Brian Paul   */ + +#include <transpose_matrix4x4.h>  #include "pipe/p_format.h"  #include "spu_main.h" +#include "spu_colorpack.h"  #include "spu_per_fragment_op.h" -#define ZERO 0x80 -static void -read_ds_quad(tile_t *buffer, unsigned x, unsigned y, -             enum pipe_format depth_format, qword *depth, -             qword *stencil) -{ -   const int ix = x / 2; -   const int iy = y / 2; +#define LINEAR_QUAD_LAYOUT 1 -   switch (depth_format) { -   case PIPE_FORMAT_Z16_UNORM: { -      qword *ptr = (qword *) &buffer->us8[iy][ix / 2]; -      const qword shuf_vec = (qword) { -         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3, -         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7 -      }; +/** + * Called by rasterizer for each quad after the shader has run.  Do + * all the per-fragment operations including alpha test, z test, + * stencil test, blend, colormask and logicops.  This is a + * fallback/debug function.  In reality we'll use a generated function + * produced by the PPU.  But this function is useful for + * debug/validation. + */ +void +spu_fallback_fragment_ops(uint x, uint y, +                          tile_t *colorTile, +                          tile_t *depthStencilTile, +                          vector float fragZ, +                          vector float fragR, +                          vector float fragG, +                          vector float fragB, +                          vector float fragA, +                          vector unsigned int mask) +{ +   vector float frag_aos[4]; +   unsigned int c0, c1, c2, c3; +   /* do alpha test */ +   if (spu.depth_stencil_alpha.alpha.enabled) { +      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref); +      vector unsigned int amask; -      /* At even X values we want the first 4 shorts, and at odd X values we -       * want the second 4 shorts. 
-       */ -      qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3)); -      qword bias_mask = si_fsmbi(0x3333); -      qword sv = si_a(shuf_vec, si_and(bias_mask, bias)); +      switch (spu.depth_stencil_alpha.alpha.func) { +      case PIPE_FUNC_LESS: +         amask = spu_cmpgt(ref, fragA);  /* mask = (fragA < ref) */ +         break; +      case PIPE_FUNC_GREATER: +         amask = spu_cmpgt(fragA, ref);  /* mask = (fragA > ref) */ +         break; +      case PIPE_FUNC_GEQUAL: +         amask = spu_cmpgt(ref, fragA); +         amask = spu_nor(amask, amask); +         break; +      case PIPE_FUNC_LEQUAL: +         amask = spu_cmpgt(fragA, ref); +         amask = spu_nor(amask, amask); +         break; +      case PIPE_FUNC_EQUAL: +         amask = spu_cmpeq(ref, fragA); +         break; +      case PIPE_FUNC_NOTEQUAL: +         amask = spu_cmpeq(ref, fragA); +         amask = spu_nor(amask, amask); +         break; +      case PIPE_FUNC_ALWAYS: +         amask = spu_splats(0xffffffffU); +         break; +      case PIPE_FUNC_NEVER: +         amask = spu_splats( 0x0U); +         break; +      default: +         ; +      } -      *depth = si_shufb(*ptr, *ptr, sv); -      *stencil = si_il(0); -      break; +      mask = spu_and(mask, amask);     } +   /* Z and/or stencil testing... 
*/ +   if (spu.depth_stencil_alpha.depth.enabled || +       spu.depth_stencil_alpha.stencil[0].enabled) { -   case PIPE_FORMAT_Z32_UNORM: { -      qword *ptr = (qword *) &buffer->ui4[iy][ix]; +      /* get four Z/Stencil values from tile */ +      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU); +      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2]; +      vector unsigned int ifbZ = spu_and(ifbZS, mask24); +      vector unsigned int ifbS = spu_andc(ifbZS, mask24); -      *depth = *ptr; -      *stencil = si_il(0); -      break; -   } -       +      if (spu.depth_stencil_alpha.stencil[0].enabled) { +         /* do stencil test */ +         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM); -   case PIPE_FORMAT_Z24S8_UNORM: { -      qword *ptr = (qword *) &buffer->ui4[iy][ix]; -      qword mask = si_fsmbi(0xEEEE); +      } +      else if (spu.depth_stencil_alpha.depth.enabled) { +         /* do depth test */ -      *depth = si_rotmai(si_and(*ptr, mask), -8); -      *stencil = si_andc(*ptr, mask); -      break; -   } +         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM || +                spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM); +         vector unsigned int ifragZ; +         vector unsigned int zmask; -   case PIPE_FORMAT_S8Z24_UNORM: { -      qword *ptr = (qword *) &buffer->ui4[iy][ix]; +         /* convert four fragZ from float to uint */ +         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff)); +         ifragZ = spu_convtu(fragZ, 0); -      *depth = si_and(*ptr, si_fsmbi(0x7777)); -      *stencil = si_andi(si_roti(*ptr, 8), 0x0ff); -      break; -   } +         /* do depth comparison, setting zmask with results */ +         switch (spu.depth_stencil_alpha.depth.func) { +         case PIPE_FUNC_LESS: +            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */ +            break; +         case PIPE_FUNC_GREATER: +            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifbZ > ifragZ) */ + 
           break; +         case PIPE_FUNC_GEQUAL: +            zmask = spu_cmpgt(ifbZ, ifragZ); +            zmask = spu_nor(zmask, zmask); +            break; +         case PIPE_FUNC_LEQUAL: +            zmask = spu_cmpgt(ifragZ, ifbZ); +            zmask = spu_nor(zmask, zmask); +            break; +         case PIPE_FUNC_EQUAL: +            zmask = spu_cmpeq(ifbZ, ifragZ); +            break; +         case PIPE_FUNC_NOTEQUAL: +            zmask = spu_cmpeq(ifbZ, ifragZ); +            zmask = spu_nor(zmask, zmask); +            break; +         case PIPE_FUNC_ALWAYS: +            zmask = spu_splats(0xffffffffU); +            break; +         case PIPE_FUNC_NEVER: +            zmask = spu_splats( 0x0U); +            break; +         default: +            ; +         } +         mask = spu_and(mask, zmask); -   default: -      ASSERT(0); -      break; +         /* merge framebuffer Z and fragment Z according to the mask */ +         ifbZ = spu_or(spu_and(ifragZ, mask), +                       spu_andc(ifbZ, mask)); +      } + +      if (spu_extract(spu_orx(mask), 0)) { +         /* put new fragment Z/Stencil values back into Z/Stencil tile */ +         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS); + +         spu.cur_ztile_status = TILE_STATUS_DIRTY; +      }     } -} +   if (spu.blend.blend_enable) { +      /* blending terms, misc regs */ +      vector float term1r, term1g, term1b, term1a; +      vector float term2r, term2g, term2b, term2a; +      vector float one, tmp; -static void -write_ds_quad(tile_t *buffer, unsigned x, unsigned y, -              enum pipe_format depth_format, -              qword depth, qword stencil) -{ -   const int ix = x / 2; -   const int iy = y / 2; +      vector float fbRGBA[4];  /* current framebuffer colors */ -   (void) stencil; +      /* get colors from framebuffer/tile */ +      { +         vector float fc[4]; +         uint c0, c1, c2, c3; -   switch (depth_format) { -   case PIPE_FORMAT_Z16_UNORM: { -      qword 
*ptr = (qword *) &buffer->us8[iy][ix / 2]; +#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ +         c0 = colorTile->ui[y][x*2+0]; +         c1 = colorTile->ui[y][x*2+1]; +         c2 = colorTile->ui[y][x*2+2]; +         c3 = colorTile->ui[y][x*2+3]; +#else +         c0 = colorTile->ui[y+0][x+0]; +         c1 = colorTile->ui[y+0][x+1]; +         c2 = colorTile->ui[y+1][x+0]; +         c3 = colorTile->ui[y+1][x+1]; +#endif +         switch (spu.fb.color_format) { +         case PIPE_FORMAT_B8G8R8A8_UNORM: +            fc[0] = spu_unpack_B8G8R8A8(c0); +            fc[1] = spu_unpack_B8G8R8A8(c1); +            fc[2] = spu_unpack_B8G8R8A8(c2); +            fc[3] = spu_unpack_B8G8R8A8(c3); +            break; +         case PIPE_FORMAT_A8R8G8B8_UNORM: +            fc[0] = spu_unpack_A8R8G8B8(c0); +            fc[1] = spu_unpack_A8R8G8B8(c1); +            fc[2] = spu_unpack_A8R8G8B8(c2); +            fc[3] = spu_unpack_A8R8G8B8(c3); +            break; +         default: +            ASSERT(0); +         } +         _transpose_matrix4x4(fbRGBA, fc); +      } -      qword sv = ((ix & 0x01) == 0)  -          ? 
(qword) { 2, 3, 6, 7, 10, 11, 14, 15, -                      24, 25, 26, 27, 28, 29, 30, 31 } -          : (qword) { 16, 17, 18, 19, 20 , 21, 22, 23, -                      2, 3, 6, 7, 10, 11, 14, 15 }; -      *ptr = si_shufb(depth, *ptr, sv); -      break; -   } +      /* +       * Compute Src RGB terms +       */ +      switch (spu.blend.rgb_src_factor) { +      case PIPE_BLENDFACTOR_ONE: +         term1r = fragR; +         term1g = fragG; +         term1b = fragB; +         break; +      case PIPE_BLENDFACTOR_ZERO: +         term1r = +         term1g = +         term1b = spu_splats(0.0f); +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         term1r = spu_mul(fragR, fragR); +         term1g = spu_mul(fragG, fragG); +         term1b = spu_mul(fragB, fragB); +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         term1r = spu_mul(fragR, fragA); +         term1g = spu_mul(fragG, fragA); +         term1b = spu_mul(fragB, fragA); +         break; +      /* XXX more cases */ +      default: +         ASSERT(0); +      } +      /* +       * Compute Src Alpha term +       */ +      switch (spu.blend.alpha_src_factor) { +      case PIPE_BLENDFACTOR_ONE: +         term1a = fragA; +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         term1a = spu_splats(0.0f); +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         term1a = spu_mul(fragA, fragA); +         break; +      /* XXX more cases */ +      default: +         ASSERT(0); +      } -   case PIPE_FORMAT_Z32_UNORM: { -      qword *ptr = (qword *) &buffer->ui4[iy][ix]; -      *ptr = depth; -      break; -   } +      /* +       * Compute Dest RGB terms +       */ +      switch (spu.blend.rgb_dst_factor) { +      case PIPE_BLENDFACTOR_ONE: +         term2r = fragR; +         term2g = fragG; +         term2b = fragB; +         break; +      case PIPE_BLENDFACTOR_ZERO: +         term2r = +         term2g = +         term2b = spu_splats(0.0f); +         break; +      case 
PIPE_BLENDFACTOR_SRC_COLOR: +         term2r = spu_mul(fbRGBA[0], fragR); +         term2g = spu_mul(fbRGBA[1], fragG); +         term2b = spu_mul(fbRGBA[2], fragB); +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         term2r = spu_mul(fbRGBA[0], fragA); +         term2g = spu_mul(fbRGBA[1], fragA); +         term2b = spu_mul(fbRGBA[2], fragA); +         break; +      case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +         one = spu_splats(1.0f); +         tmp = spu_sub(one, fragA); +         term2r = spu_mul(fbRGBA[0], tmp); +         term2g = spu_mul(fbRGBA[1], tmp); +         term2b = spu_mul(fbRGBA[2], tmp); +         break; +      /* XXX more cases */ +      default: +         ASSERT(0); +      } +      /* +       * Compute Dest Alpha term +       */ +      switch (spu.blend.alpha_dst_factor) { +      case PIPE_BLENDFACTOR_ONE: +         term2a = fragA; +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         term2a = spu_splats(0.0f); +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         term2a = spu_mul(fbRGBA[3], fragA); +         break; +      case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +         one = spu_splats(1.0f); +         tmp = spu_sub(one, fragA); +         term2a = spu_mul(fbRGBA[3], tmp); +         break; +      /* XXX more cases */ +      default: +         ASSERT(0); +      } -   case PIPE_FORMAT_Z24S8_UNORM: { -      qword *ptr = (qword *) &buffer->ui4[iy][ix]; -      qword mask = si_fsmbi(0xEEEE); +      /* +       * Combine Src/Dest RGB terms +       */ +      switch (spu.blend.rgb_func) { +      case PIPE_BLEND_ADD: +         fragR = spu_add(term1r, term2r); +         fragG = spu_add(term1g, term2g); +         fragB = spu_add(term1b, term2b); +         break; +      case PIPE_BLEND_SUBTRACT: +         fragR = spu_sub(term1r, term2r); +         fragG = spu_sub(term1g, term2g); +         fragB = spu_sub(term1b, term2b); +         break; +      /* XXX more cases */ +      default: +         ASSERT(0); +      } -      
depth = si_shli(depth, 8); -      *ptr = si_selb(stencil, depth, mask); -      break; +      /* +       * Combine Src/Dest A term +       */ +      switch (spu.blend.alpha_func) { +      case PIPE_BLEND_ADD: +         fragA = spu_add(term1a, term2a); +         break; +      case PIPE_BLEND_SUBTRACT: +         fragA = spu_sub(term1a, term2a); +         break; +      /* XXX more cases */ +      default: +         ASSERT(0); +      }     } -   case PIPE_FORMAT_S8Z24_UNORM: { -      qword *ptr = (qword *) &buffer->ui4[iy][ix]; -      qword mask = si_fsmbi(0x7777); - -      stencil = si_shli(stencil, 24); -      *ptr = si_selb(stencil, depth, mask); -      break; +   /* +    * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA. +    */ +#if 0 +   /* original code */ +   { +      vector float frag_soa[4]; +      frag_soa[0] = fragR; +      frag_soa[1] = fragG; +      frag_soa[2] = fragB; +      frag_soa[3] = fragA; +      _transpose_matrix4x4(frag_aos, frag_soa);     } +#else +   /* short-cut relying on function parameter layout: */ +   _transpose_matrix4x4(frag_aos, &fragR); +   (void) fragG; +   (void) fragB; +#endif +   /* +    * Pack float colors into 32-bit RGBA words. 
+    */ +   switch (spu.fb.color_format) { +   case PIPE_FORMAT_A8R8G8B8_UNORM: +      c0 = spu_pack_A8R8G8B8(frag_aos[0]); +      c1 = spu_pack_A8R8G8B8(frag_aos[1]); +      c2 = spu_pack_A8R8G8B8(frag_aos[2]); +      c3 = spu_pack_A8R8G8B8(frag_aos[3]); +      break; +   case PIPE_FORMAT_B8G8R8A8_UNORM: +      c0 = spu_pack_B8G8R8A8(frag_aos[0]); +      c1 = spu_pack_B8G8R8A8(frag_aos[1]); +      c2 = spu_pack_B8G8R8A8(frag_aos[2]); +      c3 = spu_pack_B8G8R8A8(frag_aos[3]); +      break;     default: +      fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");        ASSERT(0); -      break;     } -} - -qword -spu_do_depth_stencil(int x, int y, -                     qword frag_mask, qword frag_depth, qword frag_alpha, -                     qword facing) -{ -   struct spu_frag_test_results  result; -   qword pixel_depth; -   qword pixel_stencil; -   /* All of this preable code (everthing before the call to frag_test) should -    * be generated on the PPU and upload to the SPU. 
+   /* +    * Color masking      */ -   if (spu.read_depth || spu.read_stencil) { -      read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format, -                   &pixel_depth, &pixel_stencil); -   } -    -   switch (spu.fb.depth_format) { -   case PIPE_FORMAT_Z16_UNORM: -      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu))); -      frag_depth = si_cfltu(frag_depth, 0); -      break; -   case PIPE_FORMAT_Z32_UNORM: -      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu))); -      frag_depth = si_cfltu(frag_depth, 0); -      break; -   case PIPE_FORMAT_Z24S8_UNORM: -   case PIPE_FORMAT_S8Z24_UNORM: -      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu))); -      frag_depth = si_cfltu(frag_depth, 0); -      break; -   default: -      ASSERT(0); -      break; +   if (spu.blend.colormask != 0xf) { +      /* XXX to do */ +      /* apply color mask to 32-bit packed colors */     } -   result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil, -                             frag_depth, frag_alpha, facing); + +   /* +    * Logic Ops +    */ +   if (spu.blend.logicop_enable) { +      /* XXX to do */ +      /* apply logicop to 32-bit packed colors */ +   } -   /* This code (everthing after the call to frag_test) should -    * be generated on the PPU and upload to the SPU. +   /* +    * If mask is non-zero, mark tile as dirty.      */ -   if (spu.read_depth || spu.read_stencil) { -      write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format, -                    result.depth, result.stencil); +   if (spu_extract(spu_orx(mask), 0)) { +      spu.cur_ctile_status = TILE_STATUS_DIRTY;     } +   else { +      return; +   } + -   return result.mask; +   /* +    * Write new quad colors to the framebuffer/tile. +    * Only write pixels where the corresponding mask word is set. 
+    */ +#if LINEAR_QUAD_LAYOUT +   /* +    * Quad layout: +    *  +--+--+--+--+ +    *  |p0|p1|p2|p3| +    *  +--+--+--+--+ +    */ +   if (spu_extract(mask, 0)) +      colorTile->ui[y][x*2] = c0; +   if (spu_extract(mask, 1)) +      colorTile->ui[y][x*2+1] = c1; +   if (spu_extract(mask, 2)) +      colorTile->ui[y][x*2+2] = c2; +   if (spu_extract(mask, 3)) +      colorTile->ui[y][x*2+3] = c3; +#else +   /* +    * Quad layout: +    *  +--+--+ +    *  |p0|p1| +    *  +--+--+ +    *  |p2|p3| +    *  +--+--+ +    */ +   if (spu_extract(mask, 0)) +      colorTile->ui[y+0][x+0] = c0; +   if (spu_extract(mask, 1)) +      colorTile->ui[y+0][x+1] = c1; +   if (spu_extract(mask, 2)) +      colorTile->ui[y+1][x+0] = c2; +   if (spu_extract(mask, 3)) +      colorTile->ui[y+1][x+1] = c3; +#endif  } diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h index 6571258699..f817abf046 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -1,32 +1,44 @@ -/* - * (C) Copyright IBM Corporation 2008 +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.   * All Rights Reserved.   *   * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL - * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - */ + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/  #ifndef SPU_PER_FRAGMENT_OP  #define SPU_PER_FRAGMENT_OP -extern qword -spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth, -		     qword frag_alpha, qword facing); + +extern void +spu_fallback_fragment_ops(uint x, uint y, +                          tile_t *colorTile, +                          tile_t *depthStencilTile, +                          vector float fragZ, +                          vector float fragRed, +                          vector float fragGreen, +                          vector float fragBlue, +                          vector float fragAlpha, +                          vector unsigned int mask); +  #endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index 5051774f00..117b8a36f8 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -97,7 +97,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)     const qword offset_y = si_andi((qword) y, 0x1f);     const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row); -   const qword tile_size = (qword) spu_splats(sizeof(tile_t)); +   const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));     qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);     tile_offset = si_mpy((qword) tile_offset, tile_size); diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 2a4e0b423c..8b93878192 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -38,7 +38,6 @@  #include "spu_texture.h"  #include "spu_tile.h"  #include "spu_tri.h" -#include "spu_per_fragment_op.h"  /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ @@ -209,7 +208,7 @@ clip_emit_quad(struct setup_stage *setup)  /**   * Evaluate 
attribute coefficients (plane equations) to compute   * attribute values for the four fragments in a quad. - * Eg: four colors will be compute. + * Eg: four colors will be computed (in AoS format).   */  static INLINE void  eval_coeff(uint slot, float x, float y, vector float result[4]) @@ -255,31 +254,6 @@ eval_z(float x, float y)  } -static INLINE mask_t -do_depth_test(int x, int y, mask_t quadmask) -{ -   float4 zvals; -   mask_t mask; - -   if (spu.fb.depth_format == PIPE_FORMAT_NONE) -      return quadmask; - -   zvals.v = eval_z((float) x, (float) y); - -   mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx, -					y - setup.cliprect_miny, -					(qword) quadmask,  -					(qword) zvals.v, -					(qword) spu_splats((unsigned char) 0x0ffu), -					(qword) spu_splats((unsigned int) 0x01u)); - -   if (spu_extract(spu_orx(mask), 0)) -      spu.cur_ztile_status = TILE_STATUS_DIRTY; - -   return mask; -} - -  /**   * Emit a quad (pass to next stage).  No clipping is done.   * Note: about 1/5 to 1/7 of the time, mask is zero and this function @@ -289,18 +263,6 @@ do_depth_test(int x, int y, mask_t quadmask)  static INLINE void  emit_quad( int x, int y, mask_t mask )  { -#if 0 -   struct softpipe_context *sp = setup.softpipe; -   setup.quad.x0 = x; -   setup.quad.y0 = y; -   setup.quad.mask = mask; -   sp->quad.first->run(sp->quad.first, &setup.quad); -#else - -   if (spu.read_depth) { -      mask = do_depth_test(x, y, mask); -   } -     /* If any bits in mask are set... 
*/     if (spu_extract(spu_orx(mask), 0)) {        const int ix = x - setup.cliprect_minx; @@ -308,6 +270,7 @@ emit_quad( int x, int y, mask_t mask )        vector float colors[4];        spu.cur_ctile_status = TILE_STATUS_DIRTY; +      spu.cur_ztile_status = TILE_STATUS_DIRTY;        if (spu.texture[0].start) {           /* texture mapping */ @@ -351,59 +314,68 @@ emit_quad( int x, int y, mask_t mask )        }        else {           /* simple shading */ +#if 0           eval_coeff(1, (float) x, (float) y, colors); -      } +#else +         /* XXX new fragment program code */ -      /* Convert fragment data from AoS to SoA format. -       */ -      qword soa_frag[4]; -      _transpose_matrix4x4((vec_float4 *) soa_frag, colors); +         if (spu.fragment_program) { +            vector float inputs[4*4], outputs[2*4]; -      /* Read the current framebuffer values. -       */ -      const qword pix[4] = { -         (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]), -         (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]), -         (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]), -         (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]), -      }; +            /* setup inputs */ +            eval_coeff(1, (float) x, (float) y, inputs); -      qword soa_pix[4]; +            /* Execute the current fragment program */ +            spu.fragment_program(inputs, outputs, spu.constants); -      if (spu.read_fb) { -         /* Convert pixel data from AoS to SoA format. 
-          */ -         vec_float4 aos_pix[4] = { -            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]), -            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]), -            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]), -            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]), -         }; +            /* Copy outputs */ +            colors[0] = outputs[0*4+0]; +            colors[1] = outputs[0*4+1]; +            colors[2] = outputs[0*4+2]; +            colors[3] = outputs[0*4+3]; -         _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix); +            if (0 && spu.init.id==0 && y == 48) { +               printf("colors[0] = %f %f %f %f\n", +                      spu_extract(colors[0], 0), +                      spu_extract(colors[0], 1), +                      spu_extract(colors[0], 2), +                      spu_extract(colors[0], 3)); +               printf("colors[1] = %f %f %f %f\n", +                      spu_extract(colors[1], 0), +                      spu_extract(colors[1], 1), +                      spu_extract(colors[1], 2), +                      spu_extract(colors[1], 3)); +            } + +         } +#endif        } -      struct spu_blend_results result = -          (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3], -                       soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3], -                       spu.const_blend_color[0], spu.const_blend_color[1], -                       spu.const_blend_color[2], spu.const_blend_color[3]); +      { +         /* Convert fragment data from AoS to SoA format. +          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) +          * This is temporary! +          */ +         vector float soa_frag[4]; +         _transpose_matrix4x4(soa_frag, colors); +         float4 fragZ; -      /* Convert final pixel data from SoA to AoS format. 
-       */ -      result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3], -                              result.r, result.g, result.b, result.a, -                              (qword) mask); +         fragZ.v = eval_z((float) x, (float) y); + +         /* Do all per-fragment/quad operations here, including: +          *  alpha test, z test, stencil test, blend and framebuffer writing. +          */ +         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile, +                          fragZ.v, +                          soa_frag[0], soa_frag[1], +                          soa_frag[2], soa_frag[3], +                          mask); +      } -      spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0); -      spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0); -      spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0); -      spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);     } -#endif  } | 
