Diffstat (limited to 'src')
-rw-r--r--   src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c          | 246
-rw-r--r--   src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h          |  41
-rw-r--r--   src/gallium/drivers/cell/common.h                    |   1
-rw-r--r--   src/gallium/drivers/cell/ppu/cell_gen_fragment.c     | 877
-rw-r--r--   src/gallium/drivers/cell/ppu/cell_render.c           |   1
-rw-r--r--   src/gallium/drivers/cell/ppu/cell_vbuf.c             |   1
-rw-r--r--   src/gallium/drivers/cell/spu/spu_main.h              |   3
-rw-r--r--   src/gallium/drivers/cell/spu/spu_per_fragment_op.c   |  19
-rw-r--r--   src/gallium/drivers/cell/spu/spu_per_fragment_op.h   |   3
-rw-r--r--   src/gallium/drivers/cell/spu/spu_render.c            |   4
-rw-r--r--   src/gallium/drivers/cell/spu/spu_tri.c               |  35
-rw-r--r--   src/gallium/drivers/cell/spu/spu_tri.h               |   2
12 files changed, 1089 insertions(+), 144 deletions(-)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 491141f190..8a87e9abb1 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -359,14 +359,21 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
+    register unsigned int i;
+
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
 
+    p->set_count = 0;
+    memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
-    p->regs[0] = ~7;
-    p->regs[1] = (1U << (80 - 64)) - 1;
+    p->regs[0] = p->regs[1] = p->regs[2] = 1;
+    for (i = 80; i <= 127; i++) {
+      p->regs[i] = 1;
+    }
 
     p->print = false;
     p->indent = 0;
@@ -398,12 +405,8 @@ int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      const uint64_t mask = (1ULL << (i % 64));
-      const unsigned idx = i / 64;
-
-      assert(idx < 2);
-      if ((p->regs[idx] & mask) != 0) {
-         p->regs[idx] &= ~mask;
+      if (p->regs[i] == 0) {
+         p->regs[i] = 1;
          return i;
       }
    }
@@ -417,31 +420,68 @@ int spe_allocate_available_register(struct spe_function *p)
  */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
-
    assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) != 0);
-
-   p->regs[idx] &= ~(1ULL << bit);
+   assert(p->regs[reg] == 0);
+   p->regs[reg] = 1;
    return reg;
 }
 
 
 /**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated".  Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
+   assert(reg < SPE_NUM_REGS);
+   assert(p->regs[reg] == 1);
 
-   assert(idx < 2);
+   p->regs[reg] = 0;
+}
 
-   assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers.  This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+   register unsigned int i;
+
+   /* Keep track of the set count.  If it ever wraps around to 0,
+    * we're in trouble.
+    */
+   p->set_count++;
+   assert(p->set_count > 0);
 
-   p->regs[idx] |= (1ULL << bit);
+   /* Increment the allocation count of all registers currently
+    * allocated.  Then any registers that are allocated in this set
+    * will be the only ones with a count of 1; they'll all be released
+    * when the register set is released.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]++;
+   }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int i;
+
+   /* If the set count drops below zero, we're in trouble. */
+   assert(p->set_count > 0);
+   p->set_count--;
+
+   /* Drop the allocation level of all registers.  Any registers
+    * allocated during this register set will drop to 0 and then
+    * become available.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]--;
+   }
 }
@@ -603,8 +643,10 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 {
    /* If the whole value is in the lower 18 bits, use ila, which
     * doesn't sign-extend.  Otherwise, if the two halfwords of
-    * the constant are identical, use ilh.  Otherwise, we have
-    * to use ilhu followed by iohl.
+    * the constant are identical, use ilh.  Otherwise, if every byte of
+    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+    * Bytes Immediate (fsmbi) to load the value in a single instruction.
+    * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
    if ((ui & 0xfffc0000) == ui) {
       spe_ila(p, rT, ui);
@@ -612,13 +654,171 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
    else if ((ui >> 16) == (ui & 0xffff)) {
       spe_ilh(p, rT, ui & 0xffff);
    }
+   else if (
+      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+   ) {
+      unsigned int mask = 0;
+      /* fsmbi duplicates each bit in the given mask eight times,
+       * using a 16-bit value to initialize a 16-byte quadword.
+       * Each 4-bit nybble of the mask corresponds to a full word
+       * of the result; look at the value and figure out the mask
+       * (replicated for each word in the quadword), and then
+       * form the "select mask" to get the value.
+       */
+      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+      spe_fsmbi(p, rT, mask);
+   }
    else {
+      /* The general case: this usually uses two instructions, but
+       * may use only one if the low-order 16 bits of each word are 0.
+       */
       spe_ilhu(p, rT, ui >> 16);
       if (ui & 0xffff)
         spe_iohl(p, rT, ui & 0xffff);
    }
 }
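The nybble-to-word mapping above is easy to sanity-check on the host. A minimal sketch of the same mask construction (editorial illustration only; fsmbi_mask_for() is a hypothetical helper, not part of this change):

   #include <stdint.h>

   /* Mirror of the mask construction in spe_load_uint(): each nybble of the
    * 16-bit fsmbi immediate covers one 32-bit word of the quadword, and each
    * set bit of a nybble expands to an all-ones byte in that word.
    */
   static unsigned fsmbi_mask_for(uint32_t ui)
   {
      unsigned mask = 0;
      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
      return mask;   /* e.g. 0x00ff00ff -> 0x5555, 0xff000000 -> 0x8888 */
   }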
+/* This function is constructed identically to spe_xor_uint() below.
+ * Changes to one should be made in the other.
+ */
+void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use And Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp == 0) {
+      spe_andi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+/* This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ */
+void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte
+    * Immediate (which uses the same constant across each byte), Exclusive
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to
+    * 16 bits and uses that across each halfword), or Exclusive Or Word
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use Exclusive Or Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp == 0) {
+      spe_xori(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use Exclusive Or Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the Exclusive Or Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
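For reference, the decision ladder shared by spe_and_uint() and spe_xor_uint() can be expressed as a small host-side classifier. This is an editorial sketch using the same bit tests as the code above; the enum and function name are hypothetical:

   #include <stdint.h>

   enum imm_form { FORM_WORD_IMM, FORM_HALFWORD_IMM, FORM_BYTE_IMM, FORM_TEMP_REG };

   static enum imm_form classify_logic_immediate(uint32_t ui)
   {
      uint32_t tmp = ui & 0xfffffe00;        /* upper 23 bits all 0s or all 1s? */
      if (tmp == 0 || tmp == 0xfffffe00)
         return FORM_WORD_IMM;               /* andi/xori: I10 sign-extends to 32 bits */

      tmp = ui & 0xfe00fe00;                 /* upper 7 bits of each halfword */
      if ((tmp == 0 || tmp == 0xfe00fe00) && (ui >> 16) == (ui & 0xffff))
         return FORM_HALFWORD_IMM;           /* andhi/xorhi: I10 sign-extends to 16 bits */

      tmp = ui & 0xff;                       /* same byte replicated four times? */
      if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp)
         return FORM_BYTE_IMM;               /* andbi/xorbi */

      return FORM_TEMP_REG;                  /* spe_load_uint() plus spe_and()/spe_xor() */
   }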
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 9 bits or less, it fits inside a
+    * Compare Equal Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_ceqi(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_ceq(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
+
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 9 bits or less, it fits inside the
+    * sign-extended 10-bit immediate of a Compare Logical Greater Than
+    * Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 61c7edeb60..cd2e245409 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -53,17 +53,26 @@ struct spe_function
     uint num_inst;
     uint max_inst;
 
-    /**
-     * Mask of used / unused registers
-     *
-     * Each set bit corresponds to an available register.  Each cleared bit
-     * corresponds to an allocated register.
+   /**
+    * The "set count" reflects the number of nested register sets
+    * currently in use.  In the unlikely case that this count overflows,
+    * register allocation will start to be confused, which is critical
+    * enough that we check for it.
+    */
+   unsigned char set_count;
+
+   /**
+    * Flags for used and unused registers.  Each byte corresponds to a
+    * register; a 0 in that byte means that the register is available.
+    * A value of 1 means that the register was allocated in the current
+    * register set.  Any other value N means that the register was allocated
+    * N register sets ago.
      *
      * \sa
     * spe_allocate_register, spe_allocate_available_register,
-     * spe_release_register
+     * spe_allocate_register_set, spe_release_register_set, spe_release_register
      */
-    uint64_t regs[SPE_NUM_REGS / 64];
+    unsigned char regs[SPE_NUM_REGS];
 
     boolean print; /**< print/dump instructions as they're emitted? */
     int indent;    /**< number of spaces to indent */
@@ -77,6 +86,8 @@ extern unsigned spe_code_size(const struct spe_function *p);
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
 
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
@@ -307,6 +318,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i);
 extern void
 spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
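The byte-per-register counting scheme described in the struct comment above can be simulated in a few lines of plain C. An editorial sketch (SPE_NUM_REGS is 128 in this file; the function name is hypothetical):

   #include <assert.h>

   /* A count of N means "allocated N register sets ago"; releasing a set
    * decrements every live count, freeing exactly the registers that were
    * allocated inside that set (count 1).
    */
   static void register_set_demo(void)
   {
      unsigned char regs[128] = { 0 };
      int i;

      regs[10] = 1;                             /* allocated in the outer set */

      for (i = 0; i < 128; i++)                 /* spe_allocate_register_set() */
         if (regs[i] > 0) regs[i]++;

      regs[11] = 1;                             /* allocated in the nested set */

      for (i = 0; i < 128; i++)                 /* spe_release_register_set() */
         if (regs[i] > 0) regs[i]--;

      assert(regs[10] == 1 && regs[11] == 0);   /* outer survives, nested freed */
   }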
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 99329fd8e2..c223bc1744 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -227,6 +227,7 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 653afc235d..f920ae13b4 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,10 +54,12 @@
  * \param ifragZ_reg  register containing integer fragment Z values (in)
  * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
  * \param zmask_reg   register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
  */
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
-               struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
    /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
@@ -132,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
        * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
        */
       spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+      return true;
    }
+
+   return false;
 }
@@ -238,22 +243,34 @@
  * it and have to allocate and load it again unnecessarily.
  */
 static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
 {
    if (*is_already_set) return;
    *r = spe_allocate_available_register(f);
-   spe_load_float(f, *r, value);
    *is_already_set = true;
 }
 
 static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
 {
     if (!*is_already_set) return;
     spe_release_register(f, r);
     *is_already_set = false;
 }
 
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   setup_optional_register(f, is_already_set, r);
+   spe_load_float(f, *r, value);
+}
+
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    release_optional_register(f, is_already_set, r);
+}
+
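The setup_optional_register()/release_optional_register() pair added above supports a lazy-allocation idiom that the Z/stencil handling later in this diff relies on. Roughly, as an editorial usage sketch (the demo function itself is hypothetical):

   /* Allocate on first need; release unconditionally at the end,
    * which is a safe no-op if the register was never allocated.
    */
   static void optional_register_demo(struct spe_function *f,
                                      const struct pipe_depth_stencil_alpha_state *dsa)
   {
      boolean fbZ_reg_set = false;
      unsigned int fbZ_reg = 0;

      if (dsa->depth.enabled) {
         setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
         /* ... emit code that uses fbZ_reg ... */
      }

      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
   }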
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
  * \param f          SPE function to append instruction onto.
@@ -1117,6 +1134,633 @@ gen_colormask(struct spe_function *f,
     spe_release_register(f, colormask_reg);
 }
 
+/* This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value.  As such, we have
+ * access to the Compare Immediate instructions where we don't in
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test.  The bitmask of valid
+ * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
+                 unsigned int mask_reg, unsigned int fbS_reg,
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the
+    * stencil_pass_reg register, taking into account whether each fragment
+    * was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      /* stencil_pass = mask & (s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      /* stencil_fail = mask & ~stencil_pass */
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* stencil_pass = mask & ~(s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* stencil_pass = mask & (s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LESS: {
+      /* stencil_pass = mask & (reference > s) */
+      /* There's no convenient Compare Less Than Immediate instruction, so
+       * we'll have to do this one the harder way, by loading a register and
+       * comparing directly.  Compare Logical Greater Than Word (clgt)
+       * treats its operands as unsigned - no sign extension.
+       */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_LEQUAL:
+      /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL: {
+      /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
+      /* As above, we have to do this by loading a register */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = mask & 0 = 0 */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = mask & 1 = mask */
+      spe_move(f, stencil_pass_reg, mask_reg);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply.  It does not
+ * apply any tests.  It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes that the stencil_max_value is of the form
+    * 2^n-1 and can therefore be used as a mask for the valid bits in
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high-order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
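The increment trick in that assertion is easy to verify on the host: x is of the form 2^n-1 exactly when x and x+1 share no set bits. A quick editorial check in C:

   #include <assert.h>

   static void mask_form_demo(void)
   {
      assert(((0xffu + 1) & 0xffu) == 0);   /* 0xff = 2^8-1: a valid mask     */
      assert(((0x07u + 1) & 0x07u) == 0);   /* 0x07 = 2^3-1: a valid mask     */
      assert(((0xfeu + 1) & 0xfeu) != 0);   /* 0xfe is not of the form 2^n-1  */
   }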
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+}
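For readers who want the operation table without the SPE instruction details, here is an editorial scalar model of the eight cases handled by gen_stencil_values(), one value at a time (the generated code does four fragments in parallel; the helper name is hypothetical):

   static unsigned apply_stencil_op(unsigned op, unsigned s,
                                    unsigned ref, unsigned max)
   {
      switch (op) {
      case PIPE_STENCIL_OP_KEEP:      return s;
      case PIPE_STENCIL_OP_ZERO:      return 0;
      case PIPE_STENCIL_OP_REPLACE:   return ref;
      case PIPE_STENCIL_OP_INCR:      return (s == max) ? max : s + 1;
      case PIPE_STENCIL_OP_DECR:      return (s == 0) ? 0 : s - 1;
      case PIPE_STENCIL_OP_INCR_WRAP: return (s + 1) & max;   /* max is 2^n-1 */
      case PIPE_STENCIL_OP_DECR_WRAP: return (s - 1) & max;   /* 0-1 wraps to max */
      case PIPE_STENCIL_OP_INVERT:    return s ^ max;         /* invert valid bits only */
      default:                        return s;
      }
   }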
+
+/* This function generates code to get all the necessary possible
+ * stencil values.  For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+                       unsigned int fbS_reg,
+                       unsigned int *fail_reg, unsigned int *zfail_reg,
+                       unsigned int *zpass_reg, unsigned int *back_fail_reg,
+                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+   unsigned zfail_op, back_zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(dsa->stencil[0].enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes.  In particular, that means that the "zfail_op" (and the backfacing
+    * counterpart, if active) are not considered - a failing stencil test will
+    * trigger the "fail_op", and a passing stencil test will trigger the
+    * "zpass_op".
+    *
+    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+    * we keep them from being calculated.
+    */
+   if (dsa->depth.enabled) {
+      zfail_op = dsa->stencil[0].zfail_op;
+      back_zfail_op = dsa->stencil[1].zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+      back_zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value,
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == dsa->stencil[0].fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value,
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value,
+         0xff, fbS_reg, *zpass_reg);
+   }
+
+   /* If two-sided stencil is enabled, we have more work to do.
+    */
+   if (!dsa->stencil[1].enabled) {
+      /* This just flags that the registers need not be deallocated later */
+      *back_fail_reg = fbS_reg;
+      *back_zfail_reg = fbS_reg;
+      *back_zpass_reg = fbS_reg;
+   }
+   else {
+      /* Same calculations as above, but for the back stencil */
+      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_fail_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+         *back_fail_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == zfail_op) {
+         *back_fail_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+         *back_fail_reg = *zpass_reg;
+      }
+      else {
+         *back_fail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value,
+            0xff, fbS_reg, *back_fail_reg);
+      }
+
+      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zfail_reg = fbS_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].fail_op) {
+         *back_zfail_reg = *fail_reg;
+      }
+      else if (back_zfail_op == zfail_op) {
+         *back_zfail_reg = *zfail_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
+         *back_zfail_reg = *zpass_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[1].fail_op) {
+         *back_zfail_reg = *back_fail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value,
+            0xff, fbS_reg, *back_zfail_reg);
+      }
+
+      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zpass_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+         *back_zpass_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == zfail_op) {
+         *back_zpass_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+         *back_zpass_reg = *zpass_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+         *back_zpass_reg = *back_fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+         *back_zpass_reg = *back_zfail_reg;
+      }
+      else {
+         *back_zpass_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value,
+            0xff, fbS_reg, *back_zpass_reg);
+      }
+   } /* End of calculations for back-facing stencil */
+}
+
+static boolean
+gen_stencil_depth_test(struct spe_function *f,
+                       const struct pipe_depth_stencil_alpha_state *dsa,
+                       const int facing_reg,
+                       const int mask_reg, const int fragZ_reg,
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = false;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   unsigned int stencil_pass_reg, stencil_fail_reg;
+   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   unsigned int stencil_writemask_reg;
+   unsigned int zmask_reg;
+   unsigned int newS_reg;
+
+   /* Stenciling is quite complex: up to six different configurable stencil
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely
+    * be identical, so there's good reason to try to avoid calculating
+    * the same values more than once (which unfortunately makes the code less
+    * straightforward).
+    *
+    * To make register management easier, we start a new
+    * register set; we can release all the registers in the set at
+    * once, and avoid having to keep track of exactly which registers
+    * we allocate.  We can still allocate and free registers as
+    * desired (if we know we no longer need a register), but we don't
+    * have to take on the complexity of tracking the more difficult
+    * register usage scenarios.
+    */
+   spe_allocate_register_set(f);
+
+   /* Calculate the writemask.  If the writemask is trivial (either
+    * all 0s, meaning that we don't need to calculate any stencil values
+    * because they're not going to change the stencil anyway, or all 1s,
+    * meaning that we have to calculate the stencil values but do not
+    * need to mask them), we can avoid generating code.  Don't forget
+    * that we need to consider backfacing stencil, if enabled.
+    */
+   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Trivial: don't need to calculate stencil values, and don't need to
+       * write them back to the framebuffer.
+       */
+      need_to_calculate_stencil_values = false;
+      need_to_writemask_stencil_values = false;
+   }
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.
+       */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = false;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = true;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+         spe_release_register(f, back_write_mask_reg);
+      }
+   }
+
+   /* At least one-sided stenciling must be on.
+    * Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* If two-sided stenciling is on, generate code to run the stencil
+    * test on the backfacing stencil as well, and combine the two results
+    * into the one correct result based on facing.
+    */
+   if (dsa->stencil[1].enabled) {
+      unsigned int temp_reg = spe_allocate_available_register(f);
+      gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+      spe_release_register(f, temp_reg);
+   }
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  We
+    * have to do this before we run a depth test (because the
+    * depth test should not be performed on fragments that failed the
+    * stencil test, and because the depth test will update the
+    * mask of valid fragments based on the results of the depth test).
+    */
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This code will work for one-sided
+       * and two-sided stenciling (so that we take into account that operations
+       * may match between front and back stencils), and will also take into
+       * account whether the depth test is enabled (if the depth test is off,
+       * we don't need any of the zfail results, because the depth test always
+       * is considered to pass if it is disabled).  Any register value that
+       * does not need to be calculated will come back with the same value
+       * that's in fbS_reg.
+       *
+       * This function will allocate a variable number of registers that
+       * will be released as part of the register set.
+       */
+      gen_get_stencil_values(f, dsa, fbS_reg,
+         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values,
+         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values,
+         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+      /* Tricky, tricky, tricky - the things we do to create optimal
+       * code...
+       *
+       * The various stencil values registers may overlap with each other
+       * and with fbS_reg arbitrarily (as any particular operation is
+       * only calculated once and stored in one register, no matter
+       * how many times it is used).  So we can't change the values
+       * within those registers directly - if we change a value in a
+       * register that's being referenced by two different calculations,
+       * we've just unwittingly changed the second value as well...
+       *
+       * Avoid this by allocating new registers to hold the results
+       * (there may be 2, if the depth test is off, or 3, if it is on).
+       * These will be released as part of the register set.
+       */
+      if (!dsa->stencil[1].enabled) {
+         /* The easy case: if two-sided stenciling is *not* enabled, we
+          * just use the front-sided values.
+          */
+         stencil_fail_values = front_stencil_fail_values;
+         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+      }
+      else { /* two-sided stencil enabled */
+         /* Allocate new registers for the needed merged values */
+         stencil_fail_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+         if (dsa->depth.enabled) {
+            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+         }
+         else {
+            stencil_pass_depth_fail_values = fbS_reg;
+         }
+         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+      }
+   }
+
+   /* We now have all the stencil values we need.  We also need
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+      /* If we need to writemask the stencil values before going into
+       * the stencil buffer, we'll have to use a new register to
+       * hold the new values.  If not, we can just keep using the
+       * current register.
+       */
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_move(f, newS_reg, fbS_reg);
+         modified_buffers = true;
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values */
+      if (stencil_fail_values != fbS_reg) {
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+      }
+
+      /* Same for the stencil pass/depth fail values.
+       * If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+      }
+
+      /* Same for the stencil pass/depth pass mask.  If the depth test is
+       * off, every stencil-passing fragment counts as depth-passing.
+       */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+         if (dsa->depth.enabled) {
+            spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+         }
+         else {
+            spe_move(f, stencil_pass_depth_pass_mask, stencil_pass_reg);
+         }
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+         spe_release_register(f, stencil_pass_depth_pass_mask);
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in the fbS_reg register passed in.  If we don't need
+       * to writemask, then the results are *already* in the fbS_reg,
+       * so there's nothing more to do.
+       */
+
+      if (need_to_writemask_stencil_values) {
+         /* The Select Bits command makes a fine writemask.  Where
+          * the mask is 0, the first (original) values are retained,
+          * effectively masking out changes.  Where the mask is 1, the
+          * second (new) values are retained, incorporating changes.
+          */
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * We're all done, except that we've allocated a fair number
+    * of registers that we didn't bother tracking.  Release all
+    * those registers as part of the register set, and go home.
+    */
+   spe_release_register_set(f);
+
+   /* Return true if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
+
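The mask-and-select sequence above is easier to follow in scalar form. The following editorial sketch (all names hypothetical) models the per-fragment decision that gen_stencil_depth_test() implements with selb instead of branches:

   struct frag_result {
      boolean kill;          /* fragment removed from mask_reg */
      unsigned new_stencil;  /* value merged into fbS_reg (before writemask) */
   };

   static struct frag_result
   stencil_depth_model(boolean depth_enabled,
                       boolean stencil_passes, boolean depth_passes,
                       unsigned s_fail, unsigned s_zfail, unsigned s_zpass)
   {
      struct frag_result r;
      if (!stencil_passes) {
         r.new_stencil = s_fail;    /* stencil fail op */
         r.kill = true;
      }
      else if (depth_enabled && !depth_passes) {
         r.new_stencil = s_zfail;   /* stencil pass, depth fail op */
         r.kill = true;
      }
      else {
         r.new_stencil = s_zpass;   /* stencil pass, depth pass (or depth off) */
         r.kill = false;            /* fragment survives; Z is updated */
      }
      return r;
   }

The writemask step then blends new_stencil into the stored stencil value bit by bit, exactly as the final spe_selb() does.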
 
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
@@ -1156,6 +1800,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
+   const int facing_reg = 13; /* uint */
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1183,6 +1828,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
+   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1195,6 +1841,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
+      spe_comment(f, 0, "Computing tile location in memory");
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
       spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
@@ -1205,124 +1852,164 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
-
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
 
+   /* If we need the stencil buffers (because one- or two-sided stencil is
+    * enabled) or the depth buffer (because the depth test is enabled),
+    * go grab them.  Note that if either one- or two-sided stencil is
+    * enabled, dsa->stencil[0].enabled will be true.
+    */
    if (dsa->depth.enabled || dsa->stencil[0].enabled) {
       const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
       boolean write_depth_stencil;
 
-      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
-      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+      /* We may or may not need to allocate a register for Z or stencil values */
+      boolean fbS_reg_set = false, fbZ_reg_set = false;
+      unsigned int fbS_reg, fbZ_reg = 0;
+
+      spe_comment(f, 0, "Loading Z/stencil tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
       spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
 
-      if (dsa->depth.enabled) {
-         /* Extract Z bits from fbZS_reg into fbZ_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            int mask_reg = spe_allocate_available_register(f);
-            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
-            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
-            spe_release_register(f, mask_reg);
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_rotmi(f, fbZ_reg, fbZS_reg, -8);  /* fbZ = fbZS >> 8 */
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 32-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 16-bit Z values now */
-         }
-         else {
-            ASSERT(0);  /* invalid format */
-         }
+      /* From the Z/stencil buffer format, pull out the bits we need for
+       * Z and/or stencil.
+       * We'll also convert the incoming fragment Z
+       * value in fragZ_reg from a floating point value in [0.0..1.0] to
+       * an unsigned integer value with the appropriate resolution.
+       */
+      switch(zs_format) {
 
-         /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 8 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 16 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
-         }
-      }
-      else {
-         /* no Z test, but set Z to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
-      }
 
+         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+         case PIPE_FORMAT_X8Z24_UNORM:
+            if (dsa->depth.enabled) {
+               /* We need the Z part at least */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* four 24-bit Z values in the low-order bits */
+               spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* four 8-bit stencil values in the high-order bits */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+            }
+            break;
 
-      if (dsa->stencil[0].enabled) {
-         /* Extract stencil bits from fbZS_reg into fbS_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX extract with a shift */
-            ASSERT(0);
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* XXX extract with a mask */
-            ASSERT(0);
-         }
-      }
-      else {
-         /* no stencil test, but set to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
-      }
 
+         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+         case PIPE_FORMAT_Z24X8_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* shift by 8 to get the upper 24-bit values */
+               spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* 8-bit stencil in the low-order bits - mask them out */
+               spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+            }
+            break;
+
+         case PIPE_FORMAT_Z32_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 32-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            }
+            /* No stencil, so can't do anything there */
+            break;
+
+         case PIPE_FORMAT_Z16_UNORM:
+            if (dsa->depth.enabled) {
+               /* XXX Not sure this is correct, but it was here before, so we're
+                * going with it for now
+                */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 16-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+            }
+            /* No stencil */
+            break;
+
+         default:
+            ASSERT(0); /* invalid format */
+      }
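The bit manipulation in the switch above reduces to simple shifts and masks per fragment. An editorial scalar sketch for the S8Z24 layout (8 stencil bits above 24 Z bits), plus an approximate model of the cfltu-based Z conversion; the helper names are hypothetical:

   #include <stdint.h>

   static void unpack_s8z24(uint32_t zs, uint32_t *z24, uint32_t *s8)
   {
      *z24 = zs & 0x00ffffff;   /* spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff) */
      *s8  = zs >> 24;          /* spe_rotmi(f, fbS_reg, fbZS_reg, -24)           */
   }

   /* cfltu scales the float by 2^32 with saturation; shifting right by 8
    * then leaves a 24-bit value.  Approximately:
    */
   static uint32_t float_z_to_z24(float zf)
   {
      return (uint32_t)(zf * 4294967295.0) >> 8;
   }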
+
+      /* If stencil is enabled, use the stencil-specific code
+       * generator to generate both the stencil and depth (if needed)
+       * tests.  Otherwise, if only depth is enabled, generate
+       * a quick depth test.  The test generators themselves will
+       * report back whether the depth/stencil buffer has to be
+       * written back.
+       */
       if (dsa->stencil[0].enabled) {
-         /* XXX this may involve depth testing too */
-         // gen_stencil_test(dsa, f, ... );
-         ASSERT(0);
+         /* This will perform the stencil and depth tests, and update
+          * the mask_reg, fbZ_reg, and fbS_reg as required by the
+          * tests.
+          */
+         ASSERT(fbS_reg_set);
+         ASSERT(fbZ_reg_set);
+         spe_comment(f, 0, "Perform stencil test");
+
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
-         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_comment(f, 0, "Perform depth test");
+         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
       }
-
-      /* do we need to write Z and/or Stencil back into framebuffer? */
-      write_depth_stencil = (dsa->depth.writemask |
-                             dsa->stencil[0].write_mask |
-                             dsa->stencil[1].write_mask);
+      else {
+         write_depth_stencil = false;
+      }
 
       if (write_depth_stencil) {
          /* Merge latest Z and Stencil values into fbZS_reg.
          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
          * fbS_reg has four 8-bit Z values in bits [7..0].
          */
+         spe_comment(f, 0, "Storing depth/stencil values");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+            else {
+               spe_move(f, fbZS_reg, fbZ_reg);
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
             spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
             spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
@@ -1341,11 +2028,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
       }
 
-      spe_release_register(f, fbZ_reg);
-      spe_release_register(f, fbS_reg);
+      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+      release_optional_register(f, &fbS_reg_set, fbS_reg);
    }
 
-
    /* Get framebuffer quad/colors.  We'll need these for blending,
     * color masking, and to obey the quad/pixel mask.
     * Load: fbRGBA_reg = memory[color_tile + quad_offset]
@@ -1354,8 +2040,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
     */
    spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
 
-
    if (blend->blend_enable) {
+      spe_comment(f, 0, "Perform blending");
       gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
@@ -1369,19 +2055,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       int rgba_reg = spe_allocate_available_register(f);
 
       /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert fragment colors to framebuffer colors");
       gen_pack_colors(f, color_format,
                       fragR_reg, fragG_reg, fragB_reg, fragA_reg,
                       rgba_reg);
 
       if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
         gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
       if (blend->colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
         gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
       }
 
-
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
        *    rgba[i] = rgba[i];
@@ -1393,6 +2081,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       /* Store updated quad in tile:
        * memory[color_tile + quad_offset] = rgba_reg;
        */
+      spe_comment(f, 0, "Store framebuffer colors");
       spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
 
       spe_release_register(f, rgba_reg);
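The colormask logic referenced above (and rewritten with PIPE_MASK_* names in spu_per_fragment_op.c below) maps each enabled channel to an all-ones byte at that channel's position in the framebuffer word. As an editorial host-side sketch for the A8R8G8B8 layout (hypothetical helper name):

   static unsigned int colormask_bits_a8r8g8b8(unsigned int colormask)
   {
      unsigned int cmask = 0;
      if (colormask & PIPE_MASK_R) cmask |= 0x00ff0000;  /* red   */
      if (colormask & PIPE_MASK_G) cmask |= 0x0000ff00;  /* green */
      if (colormask & PIPE_MASK_B) cmask |= 0x000000ff;  /* blue  */
      if (colormask & PIPE_MASK_A) cmask |= 0xff000000;  /* alpha */
      return cmask;   /* bytes where writes are allowed */
   }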
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e..79cb8df82f 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
       struct cell_command_render *render = &cell_global.command[i].render;
       render->prim_type = PIPE_PRIM_TRIANGLES;
       render->num_verts = cell->prim_buffer.num_verts;
+      render->front_winding = cell->rasterizer->front_winding;
       render->vertex_size = cell->vertex_info->size * 4;
       render->xmin = cell->prim_buffer.xmin;
       render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..578ddf62dc 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
+      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 29a305232e..1cd577c23c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -73,7 +73,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask);
+                                      vector unsigned int mask,
+                                      uint facing);
 
 /** Function for running fragment program */
 typedef void (*spu_fragment_program_func)(vector float *inputs,
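Both PPU-side senders now stamp the rasterizer's front-face winding into the render command, and the SPU fragment-ops entry point grows a facing argument to match. A trimmed sketch of that plumbing with stand-in types (the real struct cell_command_render lives in cell/common.h and carries many more fields):

   /* Stand-in for struct cell_command_render, reduced to the new field. */
   struct render_cmd_sketch {
      unsigned prim_type;
      unsigned front_winding;  /* PIPE_WINDING_* value from the rasterizer */
   };

   /* Hypothetical emit helper: what cell_flush_prim_buffer() and
    * cell_vbuf_draw() each now do when building a render command.
    */
   static void
   emit_render_cmd(struct render_cmd_sketch *render,
                   unsigned prim, unsigned rasterizer_front_winding)
   {
      render->prim_type = prim;
      render->front_winding = rasterizer_front_winding;
   }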
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f107764fb2..d252fa6dc1 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask)
+                          vector unsigned int mask,
+                          uint facing)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
@@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       /* Form bitmask depending on color buffer format and colormask bits */
       switch (spu.fb.color_format) {
       case PIPE_FORMAT_A8R8G8B8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x00ff0000; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
            cmask |= 0x0000ff00; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
            cmask |= 0x000000ff; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
            cmask |= 0xff000000; /* alpha */
          break;
       case PIPE_FORMAT_B8G8R8A8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
            cmask |= 0x0000ff00; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
            cmask |= 0x00ff0000; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
            cmask |= 0xff000000; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
            cmask |= 0x000000ff; /* alpha */
          break;
       default:
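The switch above only renames the mask bits: PIPE_MASK_R/G/B/A have the same values as the old (1<<0)..(1<<3) literals, so behavior is unchanged while the intent becomes explicit. A standalone restatement of the A8R8G8B8 case, with the constants inlined for reference (not part of the patch):

   #include <stdint.h>

   static uint32_t
   colormask_a8r8g8b8(unsigned colormask)
   {
      uint32_t cmask = 0;
      if (colormask & 0x1)      /* PIPE_MASK_R */
         cmask |= 0x00ff0000;   /* red in bits [23..16] */
      if (colormask & 0x2)      /* PIPE_MASK_G */
         cmask |= 0x0000ff00;   /* green in bits [15..8] */
      if (colormask & 0x4)      /* PIPE_MASK_B */
         cmask |= 0x000000ff;   /* blue in bits [7..0] */
      if (colormask & 0x8)      /* PIPE_MASK_A */
         cmask |= 0xff000000;   /* alpha in bits [31..24] */
      return cmask;
   }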
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index f817abf046..a61689c83a 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask);
+                          vector unsigned int mask,
+                          uint facing);
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc98881..82dbeb26b7 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("SPU %u: RENDER done\n",
              spu.init.id);
 }
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 0a8fb56a62..6039cd80b2 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -118,6 +118,8 @@ struct setup_stage {
    float oneoverarea;
 
+   uint facing;
+
    uint tx, ty;
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
@@ -274,7 +276,7 @@ eval_z(float x, float y)
  * overall.
  */
 static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
 {
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -344,7 +346,8 @@ emit_quad( int x, int y, mask_t mask )
                              fragZ,
                              soa_frag[0], soa_frag[1],
                              soa_frag[2], soa_frag[3],
-                             mask);
+                             mask,
+                             setup.facing);
          }
       }
 
@@ -379,7 +382,8 @@ emit_quad( int x, int y, mask_t mask )
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask);
+                          mask,
+                          setup.facing);
       }
    }
 }
@@ -483,7 +487,7 @@ static void flush_spans( void )
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
 #if 1
-      emit_quad( x, setup.span.y, calculate_mask( x ) );
+      emit_quad( x, setup.span.y, calculate_mask( x ));
 #endif
    }
 
@@ -902,13 +906,28 @@ static void subtriangle( struct edge *eleft,
    eright->sy += lines;
 }
 
+static float
+determinant( const float *v0,
+             const float *v1,
+             const float *v2 )
+{
+   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   const float ex = v0[0] - v2[0];
+   const float ey = v0[1] - v2[1];
+   const float fx = v1[0] - v2[0];
+   const float fy = v1[1] - v2[1];
+
+   /* det = cross(e,f).z */
+   return ex * fy - ey * fx;
+}
+
 /**
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -919,6 +938,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
+   /* Before we sort vertices, determine the facing of the triangle,
+    * which will be needed for front/back-face stencil application
+    */
+   float det = determinant(v0, v1, v2);
+   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c9..abc3d35160 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
 
 #endif /* SPU_TRI_H */
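The new determinant()/tri_draw() logic decides facing before vertex sorting reorders the inputs: the determinant's sign encodes the triangle's screen-space winding, and XOR-ing with the front-winding convention yields the facing flag handed down to the fragment ops. A standalone check of that rule (the winding values here are stand-ins for PIPE_WINDING_*, and the facing() helper is illustrative, not the driver code):

   #include <assert.h>

   enum { WINDING_CW = 1, WINDING_CCW = 2 };  /* stand-in values */

   static unsigned
   facing(const float *v0, const float *v1, const float *v2,
          unsigned front_winding)
   {
      /* same cross-product z term as the determinant() added above */
      const float det = (v0[0] - v2[0]) * (v1[1] - v2[1]) -
                        (v0[1] - v2[1]) * (v1[0] - v2[0]);
      return (det > 0.0f) ^ (front_winding == WINDING_CW);
   }

   int main(void)
   {
      const float a[2] = { 0.0f, 0.0f };
      const float b[2] = { 1.0f, 0.0f };
      const float c[2] = { 0.0f, 1.0f };

      /* Reversing the vertex order flips the winding, and therefore
       * the facing flag, under either front-winding convention.
       */
      assert(facing(a, b, c, WINDING_CCW) != facing(a, c, b, WINDING_CCW));
      assert(facing(a, b, c, WINDING_CW)  != facing(a, c, b, WINDING_CW));
      return 0;
   }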
