diff options
| author | Brian Paul <brianp@vmware.com> | 2009-01-11 15:11:00 -0700 | 
|---|---|---|
| committer | Brian Paul <brianp@vmware.com> | 2009-01-11 15:11:00 -0700 | 
| commit | b27eb7cb4f5b49b9e7c24deb6c1fb52908f63703 (patch) | |
| tree | b2b4a09a9c20bb435604d66728c243674a5e437c | |
| parent | c4a782041b19cb4a08712384b19be25b79acba3c (diff) | |
cell: re-order the z/stencil fetch/extract/convert instructions for better perf
The new instruction order is 10 cycles faster.
| -rw-r--r-- | src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 104 | 
1 file changed, 50 insertions, 54 deletions
| diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 4d28d4801f..b3cce68157 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1813,93 +1813,88 @@ gen_depth_stencil(struct cell_context *cell,     const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;     boolean write_depth_stencil; -   /* We may or may not need to allocate a register for Z or stencil values */ -   int fbS_reg = -1, fbZ_reg = -1; - -   /* framebuffer's combined z/stencil values for quad */ +   /* framebuffer's combined z/stencil values register */     int fbZS_reg = spe_allocate_available_register(f); +   /* Framebufer Z values register */ +   int fbZ_reg = spe_allocate_available_register(f); -   spe_comment(f, 0, "Fetch Z/stencil quad from tile"); +   /* Framebuffer stencil values register (may not be used) */ +   int fbS_reg = spe_allocate_available_register(f); -   /* fetch quad of depth/stencil values from tile at (x,y) */ -   /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ -   /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */ -   spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); +   /* 24-bit mask register (may not be used) */ +   int zmask_reg = spe_allocate_available_register(f); -   /* From the Z/stencil buffer format, pull out the bits we need for -    * Z and/or stencil.  We'll also convert the incoming fragment Z -    * value in fragZ_reg from a floating point value in [0.0..1.0] to -    * an unsigned integer value with the appropriate resolution. 
-    * Note that even if depth or stencil is *not* enabled, if it's -    * present in the buffer, we pull it out and put it back later; -    * otherwise, we can inadvertently destroy the contents of -    * buffers we're not supposed to touch (e.g., if the user is -    * clearing the depth buffer but not the stencil buffer, a -    * quad of constant depth is drawn over the surface; the stencil -    * buffer must be maintained). +   /** +    * The following code: +    * 1. fetch quad of packed Z/S values from the framebuffer tile. +    * 2. extract the separate the Z and S values from packed values +    * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints +    * +    * The instructions for doing this are interleaved for better performance.      */ +   spe_comment(f, 0, "Fetch Z/stencil quad from tile"); +     switch(zs_format) {     case PIPE_FORMAT_S8Z24_UNORM: /* fall through */     case PIPE_FORMAT_X8Z24_UNORM: -      /* Pull out both Z and stencil */ -      setup_optional_register(f, &fbZ_reg); -      setup_optional_register(f, &fbS_reg); - -      /* four 24-bit Z values in the low-order bits */ -      spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); +      /* prepare mask to extract Z vals from ZS vals */ +      spe_load_uint(f, zmask_reg, 0x00ffffff); -      /* Incoming fragZ_reg value is a float in 0.0...1.0; convert -       * to a 24-bit unsigned integer -       */ +      /* convert fragment Z from [0,1] to 32-bit ints */        spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + +      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ +      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + +      /* right shift 32-bit fragment Z to 24 bits */        spe_rotmi(f, fragZ_reg, fragZ_reg, -8); -      /* four 8-bit stencil values in the high-order bits */ +      /* extract 24-bit Z values from ZS values by masking */ +      spe_and(f, fbZ_reg, fbZS_reg, zmask_reg); + +      /* extract 8-bit stencil values by shifting */        spe_rotmi(f, 
fbS_reg, fbZS_reg, -24);        break;     case PIPE_FORMAT_Z24S8_UNORM: /* fall through */     case PIPE_FORMAT_Z24X8_UNORM: -      setup_optional_register(f, &fbZ_reg); -      setup_optional_register(f, &fbS_reg); +      /* convert fragment Z from [0,1] to 32-bit ints */ +      spe_cfltu(f, fragZ_reg, fragZ_reg, 32); -      /* shift by 8 to get the upper 24-bit values */ -      spe_rotmi(f, fbS_reg, fbZS_reg, -8); +      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ +      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); -      /* Incoming fragZ_reg value is a float in 0.0...1.0; convert -       * to a 24-bit unsigned integer -       */ -      spe_cfltu(f, fragZ_reg, fragZ_reg, 32); +      /* right shift 32-bit fragment Z to 24 bits */        spe_rotmi(f, fragZ_reg, fragZ_reg, -8); -      /* 8-bit stencil in the low-order bits - mask them out */ +      /* extract 24-bit Z values from ZS values by shifting */ +      spe_rotmi(f, fbZ_reg, fbZS_reg, -8); + +      /* extract 8-bit stencil values by masking */        spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);        break;     case PIPE_FORMAT_Z32_UNORM: -      setup_optional_register(f, &fbZ_reg); -      /* Copy over 4 32-bit values */ -      spe_move(f, fbZ_reg, fbZS_reg); +      /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */ +      spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg); -      /* Incoming fragZ_reg value is a float in 0.0...1.0; convert -       * to a 32-bit unsigned integer -       */ +      /* convert fragment Z from [0,1] to 32-bit ints */        spe_cfltu(f, fragZ_reg, fragZ_reg, 32); +        /* No stencil, so can't do anything there */        break;     case PIPE_FORMAT_Z16_UNORM: -      /* XXX Not sure this is correct, but it was here before, so we're -       * going with it for now -       */ -      setup_optional_register(f, &fbZ_reg); +      /* XXX This code for 16bpp Z is broken! 
*/ + +      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ +      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); +        /* Copy over 4 32-bit values */        spe_move(f, fbZ_reg, fbZS_reg); -      /* Incoming fragZ_reg value is a float in 0.0...1.0; convert -       * to a 16-bit unsigned integer -       */ +      /* convert Z from [0,1] to 16-bit ints */        spe_cfltu(f, fragZ_reg, fragZ_reg, 32);        spe_rotmi(f, fragZ_reg, fragZ_reg, -16);        /* No stencil */ @@ -1979,9 +1974,10 @@ gen_depth_stencil(struct cell_context *cell,     }     /* Don't need these any more */ -   release_optional_register(f, fbZ_reg); -   release_optional_register(f, fbS_reg);     spe_release_register(f, fbZS_reg); +   spe_release_register(f, fbZ_reg); +   spe_release_register(f, fbS_reg); +   spe_release_register(f, zmask_reg);  } | 
