From 582ca6e4180e45655ea5f85ac1c823a665efad47 Mon Sep 17 00:00:00 2001 From: Jonathan White Date: Mon, 27 Oct 2008 16:29:20 -0600 Subject: cell: Added support for untwiddling textures during glReadPixels. This allows glReadPixels to work correctly on cell now and makes conformance tests that use pixel compares useable. --- src/gallium/drivers/cell/ppu/cell_texture.c | 158 ++++++++++++++++++++++++++-- src/gallium/drivers/cell/ppu/cell_texture.h | 1 + 2 files changed, 152 insertions(+), 7 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 9ac2f3bbb9..8ae4439f6c 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -41,7 +41,6 @@ #include "cell_state.h" #include "cell_texture.h" - /* Simple, maximally packed layout. */ @@ -210,6 +209,87 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, } } +/** + * For Cell. Basically, rearrange the pixels/quads from this layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|.... + * +--+--+--+--+ + * + * to this layout: + * +--+--+ + * |p0|p1|.... + * +--+--+ + * |p2|p3| + * +--+--+ + */ +static void +twiddle_tile(const uint *tileIn, uint *tileOut) +{ + int y, x; + + for (y = 0; y < TILE_SIZE; y+=2) { + for (x = 0; x < TILE_SIZE; x+=2) { + int k = 4 * (y/2 * TILE_SIZE/2 + x/2); + tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; + tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; + tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; + tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; + } + } +} + +/** + * Convert image from tiled layout to linear layout. 4-byte pixels. 
+ */ +static void +untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, + uint src_stride, const uint *src) +{ + const uint tile_size2 = tile_size * tile_size; + const uint h_t = (h + tile_size - 1) / tile_size; + const uint w_t = (w + tile_size - 1) / tile_size; + uint *tile_buf; + + uint it, jt; /* tile counters */ + uint i, j; /* intra-tile counters */ + + src_stride /= 4; /* convert from bytes to pixels */ + + tile_buf = align_malloc(tile_size * tile_size * 4, 16); + + /* loop over src tiles */ + for (it = 0; it < h_t; it++) { + for (jt = 0; jt < w_t; jt++) { + /* start of src tile: */ + const uint *tsrc = src + (it * w_t + jt) * tile_size2; + + twiddle_tile(tsrc, tile_buf); + tsrc = tile_buf; + + /* compute size of this tile (may be smaller than tile_size) */ + /* XXX note: a compiler bug was found here. That's why the code + * looks as it does. + */ + uint tile_width = w - jt * tile_size; + tile_width = MIN2(tile_width, tile_size); + uint tile_height = h - it * tile_size; + tile_height = MIN2(tile_height, tile_size); + + /* loop over texels in the tile */ + for (i = 0; i < tile_height; i++) { + for (j = 0; j < tile_width; j++) { + uint dsti = it * tile_size + i; + uint dstj = jt * tile_size + j; + ASSERT(dsti < h); + ASSERT(dstj < w); + dst[dsti * src_stride + dstj] = tsrc[i * tile_size + j]; + } + } + } + } + + align_free(tile_buf); +} /** * Convert linear texture image data to tiled format for SPU usage. @@ -260,6 +340,47 @@ cell_twiddle_texture(struct pipe_screen *screen, pipe_buffer_unmap(screen, surface->buffer); } +/** + * Convert SPU tiled texture image data to linear format for app usage. 
+ */ +static void +cell_untwiddle_texture(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; + const void *map = pipe_buffer_map(screen, surface->buffer, + PIPE_BUFFER_USAGE_CPU_READ); + const uint *src = (const uint *) ((const ubyte *) map + surface->offset); + + switch (ct->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; + int offset = surface->stride * texHeight * 4 * surface->face; + uint *dst; + + if (!ct->untiled_data[level]) { + ct->untiled_data[level] = + align_malloc(surface->stride * texHeight * 4 * numFaces, 16); + } + + dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset); + + untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); + } + break; + default: + printf("Cell: untwiddle unsupported texture format\n"); + ; + } + + pipe_buffer_unmap(screen, surface->buffer); +} + static struct pipe_surface * cell_get_tex_surface(struct pipe_screen *screen, @@ -294,13 +415,18 @@ cell_get_tex_surface(struct pipe_screen *screen, ps->zslice = zslice; if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { - ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * - ps->nblocksy * - ps->stride; + ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? 
face : zslice) * + ps->nblocksy * + ps->stride; } else { - assert(face == 0); - assert(zslice == 0); + assert(face == 0); + assert(zslice == 0); + } + + if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) { + /* convert from tiled to linear layout */ + cell_untwiddle_texture(screen, ps); } } return ps; @@ -311,6 +437,13 @@ static void cell_tex_surface_release(struct pipe_screen *screen, struct pipe_surface **s) { + struct cell_texture *ct = cell_texture((*s)->texture); + const uint level = (*s)->level; + + if ((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) { + align_free(ct->untiled_data[level]); + } + /* XXX if done rendering to teximage, re-tile */ pipe_texture_reference(&(*s)->texture, NULL); @@ -325,6 +458,10 @@ cell_surface_map(struct pipe_screen *screen, unsigned flags) { ubyte *map; + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + + assert(ct); if (flags & ~surface->usage) { assert(0); @@ -335,7 +472,14 @@ cell_surface_map(struct pipe_screen *screen, if (map == NULL) return NULL; else - return (void *) (map + surface->offset); + { + if (surface->usage & PIPE_BUFFER_USAGE_CPU_READ) { + return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset); + } + else { + return (void *) (map + surface->offset); + } + } } diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h index 2f5fe0dd1b..7018b0c9bf 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.h +++ b/src/gallium/drivers/cell/ppu/cell_texture.h @@ -52,6 +52,7 @@ struct cell_texture struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS]; /** Mapped, tiled texture data */ void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS]; + void *untiled_data[CELL_MAX_TEXTURE_LEVELS]; }; -- cgit v1.2.3 From d01324eb78da2d501ce33e2792713225090c84cd Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 27 Oct 2008 18:25:33 -0600 Subject: cell: fix some problems when displaying to a PIPE_FORMAT_B8G8R8A8_UNORM screen --- 
src/gallium/drivers/cell/ppu/cell_texture.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 8ae4439f6c..7734381c7e 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -310,6 +310,7 @@ cell_twiddle_texture(struct pipe_screen *screen, switch (ct->base.format) { case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: { int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; int offset = bufWidth * bufHeight * 4 * surface->face; @@ -357,6 +358,7 @@ cell_untwiddle_texture(struct pipe_screen *screen, switch (ct->base.format) { case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: { int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; int offset = surface->stride * texHeight * 4 * surface->face; @@ -442,6 +444,7 @@ cell_tex_surface_release(struct pipe_screen *screen, if ((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) { align_free(ct->untiled_data[level]); + ct->untiled_data[level] = NULL; } /* XXX if done rendering to teximage, re-tile */ -- cgit v1.2.3 From 57487590871d523dd6044ad214dafde04dd799f0 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Tue, 28 Oct 2008 12:41:47 -0600 Subject: cell: don't include libmisc.h Doesn't seem to be needed and fixes compilation with SDK 3.1 beta. 
--- src/gallium/drivers/cell/ppu/cell_spu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h index b633880c25..c93958a9ed 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.h +++ b/src/gallium/drivers/cell/ppu/cell_spu.h @@ -30,7 +30,6 @@ #include -#include <libmisc.h> #include #include "cell/common.h" -- cgit v1.2.3 From db680ac0e3697ecc2c2dbd5f22c4c2fdb136b62c Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Tue, 28 Oct 2008 14:03:51 -0600 Subject: cell: fix a number of fence issues Plus add assertions to check status, alignment, etc. --- src/gallium/drivers/cell/ppu/cell_batch.c | 19 ++++++++++++++++--- src/gallium/drivers/cell/ppu/cell_context.h | 2 +- src/gallium/drivers/cell/ppu/cell_fence.c | 14 ++++++++++++-- src/gallium/drivers/cell/spu/spu_command.c | 2 +- 4 files changed, 30 insertions(+), 7 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c index 448b723d85..962775cd33 100644 --- a/src/gallium/drivers/cell/ppu/cell_batch.c +++ b/src/gallium/drivers/cell/ppu/cell_batch.c @@ -100,12 +100,23 @@ emit_fence(struct cell_context *cell) const uint batch = cell->cur_batch; const uint size = cell->buffer_size[batch]; struct cell_command_fence *fence_cmd; + struct cell_fence *fence = &cell->fenced_buffers[batch].fence; + uint i; + + /* set fence status to emitted, not yet signalled */ + for (i = 0; i < cell->num_spus; i++) { + fence->status[i][0] = CELL_FENCE_EMITTED; + } ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE); fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size); fence_cmd->opcode = CELL_CMD_FENCE; - fence_cmd->fence = &cell->fenced_buffers[batch].fence; + fence_cmd->fence = fence; + + /* update batch buffer size */ + cell->buffer_size[batch] = size + sizeof(struct cell_command_fence); + assert(sizeof(struct
cell_command_fence) % 8 == 0); } @@ -119,7 +130,7 @@ cell_batch_flush(struct cell_context *cell) { static boolean flushing = FALSE; uint batch = cell->cur_batch; - const uint size = cell->buffer_size[batch]; + uint size = cell->buffer_size[batch]; uint spu, cmd_word; assert(!flushing); @@ -130,8 +141,10 @@ cell_batch_flush(struct cell_context *cell) /* Before we use this batch buffer, make sure any fenced texture buffers * are released. */ - if (cell->fenced_buffers[batch].head) + if (cell->fenced_buffers[batch].head) { emit_fence(cell); + size = cell->buffer_size[batch]; + } flushing = TRUE; diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 4491ae8cdf..eb1397bb3f 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -89,7 +89,7 @@ struct cell_buffer_node; */ struct cell_buffer_list { - struct cell_fence fence; + struct cell_fence fence ALIGN16_ATTRIB; struct cell_buffer_node *head; }; diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c index ffb3bea12b..867b5dcaa0 100644 --- a/src/gallium/drivers/cell/ppu/cell_fence.c +++ b/src/gallium/drivers/cell/ppu/cell_fence.c @@ -38,6 +38,7 @@ void cell_fence_init(struct cell_fence *fence) { uint i; + ASSERT_ALIGN16(fence->status); for (i = 0; i < CELL_MAX_SPUS; i++) { fence->status[i][0] = CELL_FENCE_IDLE; } @@ -50,9 +51,9 @@ cell_fence_signalled(const struct cell_context *cell, { uint i; for (i = 0; i < cell->num_spus; i++) { - //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE); - if (fence->status[i][0] == CELL_FENCE_EMITTED) + if (fence->status[i][0] != CELL_FENCE_SIGNALLED) return FALSE; + /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/ } return TRUE; } @@ -65,6 +66,15 @@ cell_fence_finish(const struct cell_context *cell, while (!cell_fence_signalled(cell, fence)) { usleep(10); } + +#ifdef DEBUG + { + uint i; + for (i = 0; i < cell->num_spus; i++) { + 
assert(fence->status[i][0] == CELL_FENCE_SIGNALLED); + } + } +#endif } diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index a6ed29ea63..63818d4c46 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -107,7 +107,7 @@ cmd_fence(struct cell_command_fence *fence_cmd) CELL_FENCE_SIGNALLED}; uint *dst = (uint *) fence_cmd->fence; dst += 4 * spu.init.id; /* main store/memory address, not local store */ - + ASSERT_ALIGN16(dst); mfc_put((void *) &status, /* src in local memory */ (unsigned int) dst, /* dst in main memory */ sizeof(status), /* size */ -- cgit v1.2.3 From 8b3af5c5d6fe100707da0d9dcc42500921792638 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 29 Oct 2008 12:12:30 -0600 Subject: cell: use simd utilities for pow, exp2, log2 --- src/gallium/drivers/cell/spu/spu_funcs.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c index 3534b35000..ff3d609d25 100644 --- a/src/gallium/drivers/cell/spu/spu_funcs.c +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -38,7 +38,9 @@ #include #include #include -#include +#include +#include +#include #include "cell/common.h" #include "spu_main.h" @@ -68,37 +70,19 @@ spu_sin(vector float x) static vector float spu_pow(vector float x, vector float y) { - float z0 = powf(spu_extract(x,0), spu_extract(y,0)); - float z1 = powf(spu_extract(x,1), spu_extract(y,1)); - float z2 = powf(spu_extract(x,2), spu_extract(y,2)); - float z3 = powf(spu_extract(x,3), spu_extract(y,3)); - return (vector float) {z0, z1, z2, z3}; + return _powf4(x, y); } static vector float spu_exp2(vector float x) { - float z0 = powf(2.0f, spu_extract(x,0)); - float z1 = powf(2.0f, spu_extract(x,1)); - float z2 = powf(2.0f, spu_extract(x,2)); - float z3 = powf(2.0f, spu_extract(x,3)); - 
return (vector float) {z0, z1, z2, z3}; + return _exp2f4(x); } static vector float spu_log2(vector float x) { - /* - * log_base_2(x) = log(x) / log(2) - * 1.442695 = 1/log(2). - */ - static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F}; - float z0 = logf(spu_extract(x,0)); - float z1 = logf(spu_extract(x,1)); - float z2 = logf(spu_extract(x,2)); - float z3 = logf(spu_extract(x,3)); - vector float v = (vector float) {z0, z1, z2, z3}; - return spu_mul(v, k); + return _log2f4(x); } -- cgit v1.2.3 From 1f7a323a138e6cc43b1192022b071c606a5ee6f4 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 29 Oct 2008 12:14:11 -0600 Subject: cell: add scalar param to emit_function_call() to indicate scalar function calls Scalar calls only use the X component of the src regs and smear the result across the dest register's X/Y/Z/W. --- src/gallium/drivers/cell/ppu/cell_gen_fp.c | 103 +++++++++++++++++++---------- 1 file changed, 69 insertions(+), 34 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index d4d644d6e8..5c41b264ac 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -1303,60 +1303,91 @@ lookup_function(struct cell_context *cell, const char *funcname) /** * Emit code to call a SPU function. * Used to implement instructions like SIN/COS/POW/TEX/etc. + * If scalar, only the X components of the src regs are used, and the + * result is replicated across the dest register's XYZW components. 
*/ static boolean emit_function_call(struct codegen *gen, const struct tgsi_full_instruction *inst, - char *funcname, uint num_args) + char *funcname, uint num_args, boolean scalar) { const uint addr = lookup_function(gen->cell, funcname); char comment[100]; - int ch; + int s_regs[3]; + int func_called = FALSE; + uint a, ch; + int retval_reg = -1; assert(num_args <= 3); snprintf(comment, sizeof(comment), "CALL %s:", funcname); spe_comment(gen->f, -4, comment); + if (scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]); + } + /* we'll call the function, put the return value in this register, + * then replicate it across all write-enabled components in d_reg. + */ + retval_reg = spe_allocate_available_register(gen->f); + } + for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s_regs[3], d_reg; + int d_reg; ubyte usedRegs[SPE_NUM_REGS]; - uint a, i, numUsed; + uint i, numUsed; - for (a = 0; a < num_args; a++) { - s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + if (!scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + } } - d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - numUsed = spe_get_registers_used(gen->f, usedRegs); - assert(numUsed < gen->frame_size / 16 - 2); + d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* save registers to stack */ - for (i = 0; i < numUsed; i++) { - uint reg = usedRegs[i]; - int offset = 2 + i; - spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); - } + if (!scalar || !func_called) { + /* for a scalar function, we'll really only call the function once */ - /* setup function arguments */ - for (a = 0; a < num_args; a++) { - spe_move(gen->f, 3 + a, s_regs[a]); - } + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); - /* branch to function, save return addr */ - spe_brasl(gen->f, SPE_REG_RA, 
addr); + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } - /* save function's return value */ - spe_move(gen->f, d_reg, 3); + /* setup function arguments */ + for (a = 0; a < num_args; a++) { + spe_move(gen->f, 3 + a, s_regs[a]); + } - /* restore registers from stack */ - for (i = 0; i < numUsed; i++) { - uint reg = usedRegs[i]; - if (reg != d_reg) { - int offset = 2 + i; - spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return value */ + if (scalar) + spe_move(gen->f, retval_reg, 3); + else + spe_move(gen->f, d_reg, 3); + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_reg && reg != retval_reg) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } } + + func_called = TRUE; + } + + if (scalar) { + spe_move(gen->f, d_reg, retval_reg); } store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); @@ -1364,6 +1395,10 @@ emit_function_call(struct codegen *gen, } } + if (scalar) { + spe_release_register(gen->f, retval_reg); + } + return true; } @@ -1770,15 +1805,15 @@ emit_instruction(struct codegen *gen, return emit_END(gen); case TGSI_OPCODE_COS: - return emit_function_call(gen, inst, "spu_cos", 1); + return emit_function_call(gen, inst, "spu_cos", 1, TRUE); case TGSI_OPCODE_SIN: - return emit_function_call(gen, inst, "spu_sin", 1); + return emit_function_call(gen, inst, "spu_sin", 1, TRUE); case TGSI_OPCODE_POW: - return emit_function_call(gen, inst, "spu_pow", 2); + return emit_function_call(gen, inst, "spu_pow", 2, TRUE); case TGSI_OPCODE_EXPBASE2: - return emit_function_call(gen, inst, "spu_exp2", 1); + return emit_function_call(gen, inst, "spu_exp2", 1, TRUE); case TGSI_OPCODE_LOGBASE2: - return emit_function_call(gen, inst, "spu_log2", 1); + return 
emit_function_call(gen, inst, "spu_log2", 1, TRUE); case TGSI_OPCODE_TEX: /* fall-through for now */ case TGSI_OPCODE_TXD: -- cgit v1.2.3 From 157ddc14183807834068687f02c67b66acf9effa Mon Sep 17 00:00:00 2001 From: Jonathan White Date: Thu, 30 Oct 2008 11:22:20 -0600 Subject: cell: Added check for PIPE_FLUSH_RENDER_CACHE to cell_flush to fix black blocks during st_readpixels due to a flush wait not happening in order to allow any previous rendering to complete. --- src/gallium/drivers/cell/ppu/cell_flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c index 6596b72010..a64967b4b9 100644 --- a/src/gallium/drivers/cell/ppu/cell_flush.c +++ b/src/gallium/drivers/cell/ppu/cell_flush.c @@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags, flags |= CELL_FLUSH_WAIT; } - if (flags & PIPE_FLUSH_SWAPBUFFERS) + if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE)) flags |= CELL_FLUSH_WAIT; draw_flush( cell->draw ); -- cgit v1.2.3 From 711f8a1dd94e2e1e715615d947e03015ef972326 Mon Sep 17 00:00:00 2001 From: Robert Ellison Date: Thu, 30 Oct 2008 15:24:23 -0600 Subject: CELL: stencil bug fixes Two definitive bugs in stenciling were fixed. The first, reversed registers in the generated Select Bytes (selb) instruction, caused the stenciling INCR and DECR operations to fail dramatically, putting new values in where old values were supposed to be and vice versa. The second caused stencil tiles to not be read and written from main memory by the SPUs. A per-spu flag, spu.read_depth, was used to indicate whether the SPU should be reading depth tiles, and was set only when depth was enabled. A second flag, spu.read_stencil, was set when stenciling was enabled, but never referenced. 
As stenciling and depth are in the same tiles on the Cell, and there is no corresponding TAG_WRITE_TILE_STENCIL to complement TAG_WRITE_TILE_COLOR and TAG_WRITE_TILE_Z, I fixed this by eliminating the unused "spu.read_stencil", renaming "spu.read_depth" to "spu.read_depth_stencil", and setting it if either stenciling or depth is enabled. I also added an optimization to the fragment ops generation code, that avoids calculating stencil values and/or stencil writemask when the stencil operations are all KEEP. --- progs/trivial/tri-stencil.c | 13 ++++++++++-- src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 25 ++++++++++++++++++------ src/gallium/drivers/cell/spu/spu_command.c | 3 +-- src/gallium/drivers/cell/spu/spu_main.h | 3 +-- src/gallium/drivers/cell/spu/spu_render.c | 4 ++-- src/gallium/drivers/cell/spu/spu_tri.c | 2 +- 6 files changed, 35 insertions(+), 15 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/progs/trivial/tri-stencil.c b/progs/trivial/tri-stencil.c index 5edbef26ce..7686e16aef 100644 --- a/progs/trivial/tri-stencil.c +++ b/progs/trivial/tri-stencil.c @@ -49,7 +49,15 @@ static void Key(unsigned char key, int x, int y) switch (key) { case 27: + printf("Exiting...\n"); exit(1); + case 'r': + printf("Redisplaying...\n"); + glutPostRedisplay(); + break; + default: + printf("No such key '%c'...\n", key); + break; } } @@ -89,7 +97,7 @@ static void Draw(void) glEnd(); #endif -#if 0 +#if 1 glStencilFunc(GL_EQUAL, 1, 1); glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP); @@ -130,7 +138,8 @@ int main(int argc, char **argv) exit(1); } - glutInitWindowPosition(0, 0); glutInitWindowSize( 300, 300); + glutInitWindowPosition(0, 0); + glutInitWindowSize( 300, 300); type = GLUT_RGB | GLUT_SINGLE | GLUT_DEPTH | GLUT_STENCIL; glutInitDisplayMode(type); diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 4e1e53ecdc..8e4dd82404 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ 
b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1282,7 +1282,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op, /* Add Word Immediate computes rT = rA + 10-bit signed immediate */ spe_ai(f, newS_reg, fbS_reg, 1); /* Select from the current value or the new value based on the equality test */ - spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg); + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); spe_release_register(f, equals_reg); break; @@ -1295,7 +1295,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op, /* Add Word Immediate with a (-1) value works */ spe_ai(f, newS_reg, fbS_reg, -1); /* Select from the current value or the new value based on the equality test */ - spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg); + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); spe_release_register(f, equals_reg); break; @@ -1534,15 +1534,28 @@ gen_stencil_depth_test(struct spe_function *f, * meaning that we have to calculate the stencil values but do not * need to mask them), we can avoid generating code. Don't forget * that we need to consider backfacing stencil, if enabled. + * + * Note that if the backface stencil is *not* enabled, the backface + * stencil will have the same values as the frontface stencil. */ - if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) { - /* Trivial: don't need to calculate stencil values, and don't need to - * write them back to the framebuffer. 
+ if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) { + /* No changes to any stencil values */ + need_to_calculate_stencil_values = false; + need_to_writemask_stencil_values = false; + } + else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) { + /* All changes are writemasked out, so no need to calculate + * what those changes might be, and no need to write anything back. */ need_to_calculate_stencil_values = false; need_to_writemask_stencil_values = false; } - else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) { + else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) { /* Still trivial, but a little less so. We need to write the stencil * values, but we don't need to mask them. */ diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index 63818d4c46..d726622d94 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -244,8 +244,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) } } - spu.read_depth = spu.depth_stencil_alpha.depth.enabled; - spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; + spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled); } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 668af10be2..692790c9f3 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -160,8 +160,7 @@ struct spu_global tile_t ztile ALIGN16_ATTRIB; /** Read depth/stencil tiles? 
*/ - boolean read_depth; - boolean read_stencil; + boolean read_depth_stencil; /** Current tiles' status */ ubyte cur_ctile_status, cur_ztile_status; diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c index 5515bb55c9..7c225e2f27 100644 --- a/src/gallium/drivers/cell/spu/spu_render.c +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -98,7 +98,7 @@ my_tile(uint tx, uint ty) static INLINE void get_cz_tiles(uint tx, uint ty) { - if (spu.read_depth) { + if (spu.read_depth_stencil) { if (spu.cur_ztile_status != TILE_STATUS_CLEAR) { //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty); get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1); @@ -153,7 +153,7 @@ static INLINE void wait_put_cz_tiles(void) { wait_on_mask(1 << TAG_WRITE_TILE_COLOR); - if (spu.read_depth) { + if (spu.read_depth_stencil) { wait_on_mask(1 << TAG_WRITE_TILE_Z); } } diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 4caf7d6b61..5f908159bb 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -369,7 +369,7 @@ flush_spans(void) } ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); - if (spu.read_depth) { + if (spu.read_depth_stencil) { if (spu.cur_ztile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ztile\n", spu.init.id); -- cgit v1.2.3 From 443e102fdc8084dd2c73549c83de10524eb94b31 Mon Sep 17 00:00:00 2001 From: Jonathan White Date: Thu, 30 Oct 2008 15:53:12 -0600 Subject: cell: Protected use of non-initialized untile buffers --- src/gallium/drivers/cell/ppu/cell_texture.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 7734381c7e..28161d166e 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ 
b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -376,8 +376,10 @@ cell_untwiddle_texture(struct pipe_screen *screen, } break; default: - printf("Cell: untwiddle unsupported texture format\n"); - ; + { + ct->untiled_data[level] = NULL; + printf("Cell: untwiddle unsupported texture format\n"); + } } pipe_buffer_unmap(screen, surface->buffer); @@ -442,7 +444,8 @@ cell_tex_surface_release(struct pipe_screen *screen, struct cell_texture *ct = cell_texture((*s)->texture); const uint level = (*s)->level; - if ((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) { + if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) + { align_free(ct->untiled_data[level]); ct->untiled_data[level] = NULL; } @@ -476,7 +479,7 @@ cell_surface_map(struct pipe_screen *screen, return NULL; else { - if (surface->usage & PIPE_BUFFER_USAGE_CPU_READ) { + if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) { return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset); } else { -- cgit v1.2.3 From 14e1505cce24ee294cb98683504cc4537c20f34a Mon Sep 17 00:00:00 2001 From: Robert Ellison Date: Thu, 30 Oct 2008 21:31:07 -0600 Subject: CELL: fix use of stencil value mask The Cell stencil tests were completely ignoring the stencil value mask. Now the original code paths are still used if the stencil value mask is all 1s; but code to use the mask for the stencil value and reference value comparisons is now emitted if the mask is not all 1s. 
--- src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 154 ++++++++++++++++------- 1 file changed, 112 insertions(+), 42 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 8e4dd82404..6e2a5d2980 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1141,13 +1141,17 @@ gen_colormask(struct spe_function *f, * access to the Compare Immediate instructions where we don't in * gen_depth_test(), which is what makes us very different. * + * There's some added complexity if there's a non-trivial state->mask + * value; then stencil and reference both must be masked + * * The return value in the stencil_pass_reg is a bitmask of valid * fragments that also passed the stencil test. The bitmask of valid - * fragments that failed would be found in (mask_reg & ~stencil_pass_reg). + * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg). 
*/ static void gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, - unsigned int mask_reg, unsigned int fbS_reg, + unsigned int stencil_max_value, + unsigned int fragment_mask_reg, unsigned int fbS_reg, unsigned int stencil_pass_reg) { /* Generate code that puts the set of passing fragments into the stencil_pass_reg @@ -1155,68 +1159,134 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, */ switch (state->func) { case PIPE_FUNC_EQUAL: - /* stencil_pass = mask & (s == reference) */ - spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; case PIPE_FUNC_NOTEQUAL: - /* stencil_pass = mask & ~(s == reference) */ - spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & ~(s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + 
spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; case PIPE_FUNC_GREATER: - /* stencil_pass = mask & (s > reference) */ - spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((s&mask) > (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - case PIPE_FUNC_LESS: { - /* stencil_pass = mask & (reference > s) */ - /* There's no convenient Compare Less Than Immediate instruction, so - * we'll have to do this one the harder way, by loading a register and - * comparing directly. Compare Logical Greater Than Word (clgt) - * treats its operands as unsigned - no sign extension. 
- */ - unsigned int tmp_reg = spe_allocate_available_register(f); - spe_load_uint(f, tmp_reg, state->ref_value); - spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); - spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); - spe_release_register(f, tmp_reg); + case PIPE_FUNC_LESS: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference > s) */ + /* There's no convenient Compare Less Than Immediate instruction, so + * we'll have to do this one the harder way, by loading a register and + * comparing directly. Compare Logical Greater Than Word (clgt) + * treats its operands as unsigned - no sign extension. + */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */ + unsigned int tmp_reg = spe_allocate_available_register(f); + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - } case PIPE_FUNC_LEQUAL: - /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */ - spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s <= reference) + * = fragment_mask & ~(s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, 
fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - case PIPE_FUNC_GEQUAL: { - /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */ - /* As above, we have to do this by loading a register */ - unsigned int tmp_reg = spe_allocate_available_register(f); - spe_load_uint(f, tmp_reg, state->ref_value); - spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); - spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); - spe_release_register(f, tmp_reg); + case PIPE_FUNC_GEQUAL: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s >= reference) + * = fragment_mask & ~(reference > s) */ + /* As above, we have to do this by loading a register */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */ + unsigned int tmp_reg = spe_allocate_available_register(f); + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - } case
PIPE_FUNC_NEVER: - /* stencil_pass = mask & 0 = 0 */ + /* stencil_pass = fragment_mask & 0 = 0 */ spe_load_uint(f, stencil_pass_reg, 0); break; case PIPE_FUNC_ALWAYS: - /* stencil_pass = mask & 1 = mask */ - spe_move(f, stencil_pass_reg, mask_reg); + /* stencil_pass = fragment_mask & 1 = fragment_mask */ + spe_move(f, stencil_pass_reg, fragment_mask_reg); break; } /* The fragments that passed the stencil test are now in stencil_pass_reg. - * The fragments that failed would be (mask_reg & ~stencil_pass_reg). + * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg). */ } @@ -1596,7 +1666,7 @@ gen_stencil_depth_test(struct spe_function *f, */ spe_comment(f, 0, "Running basic stencil test"); stencil_pass_reg = spe_allocate_available_register(f); - gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg); + gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg); /* If two-sided stenciling is on, generate code to run the stencil * test on the backfacing stencil as well, and combine the two results @@ -1605,7 +1675,7 @@ gen_stencil_depth_test(struct spe_function *f, if (dsa->stencil[1].enabled) { unsigned int temp_reg = spe_allocate_available_register(f); spe_comment(f, 0, "Running backface stencil test"); - gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg); + gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg); spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg); spe_release_register(f, temp_reg); } -- cgit v1.2.3 From 88360913a730795d031b2ff20fe50d438ef1c151 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 5 Nov 2008 17:20:35 -0700 Subject: cell: minor reformatting, var renaming --- src/gallium/drivers/cell/ppu/cell_texture.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 
28161d166e..ae88d06912 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -28,6 +28,7 @@ * Authors: * Keith Whitwell * Michel Dänzer + * Brian Paul */ #include "pipe/p_context.h" @@ -41,10 +42,10 @@ #include "cell_state.h" #include "cell_texture.h" -/* Simple, maximally packed layout. - */ -static unsigned minify( unsigned d ) + +static unsigned +minify(unsigned d) { return MAX2(1, d>>1); } @@ -209,6 +210,7 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, } } + /** * For Cell. Basically, rearrange the pixels/quads from this layout: * +--+--+--+--+ @@ -238,22 +240,22 @@ twiddle_tile(const uint *tileIn, uint *tileOut) } } + /** * Convert image from tiled layout to linear layout. 4-byte pixels. */ static void untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, - uint src_stride, const uint *src) + uint dst_stride, const uint *src) { const uint tile_size2 = tile_size * tile_size; const uint h_t = (h + tile_size - 1) / tile_size; const uint w_t = (w + tile_size - 1) / tile_size; uint *tile_buf; - uint it, jt; /* tile counters */ uint i, j; /* intra-tile counters */ - src_stride /= 4; /* convert from bytes to pixels */ + dst_stride /= 4; /* convert from bytes to pixels */ tile_buf = align_malloc(tile_size * tile_size * 4, 16); @@ -282,7 +284,7 @@ untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, uint dstj = jt * tile_size + j; ASSERT(dsti < h); ASSERT(dstj < w); - dst[dsti * src_stride + dstj] = tsrc[i * tile_size + j]; + dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j]; } } } @@ -291,6 +293,7 @@ untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, align_free(tile_buf); } + /** * Convert linear texture image data to tiled format for SPU usage. */ @@ -341,6 +344,7 @@ cell_twiddle_texture(struct pipe_screen *screen, pipe_buffer_unmap(screen, surface->buffer); } + /** * Convert SPU tiled texture image data to linear format for app usage. 
*/ -- cgit v1.2.3 From b493fdd7e333b9a94176a603009643326a538690 Mon Sep 17 00:00:00 2001 From: Robert Ellison Date: Fri, 7 Nov 2008 11:29:07 -0700 Subject: CELL: fix several stencil problems This small set of changes repairs several different stenciling problems; now redbook/stencil also runs correctly (and maybe others - I haven't checked everything yet). - The number of instructions that had been allocated for fragment ops used to be 64 (in cell/common.h). With complicated stencil use, we managed to get up to 93, which caused a segfault before we noticed we'd overrun our memory buffer. It's now been bumped to 128, which should be enough for even complicated stencil and fragment op usage. - The status of cell surfaces never changed beyond the initial PIPE_SURFACE_STATUS_UNDEFINED. When a user called glClear() to clear just the Z buffer (but not the stencil buffer), this caused the check_clear_depth_with_quad() function to return false (because the surface status was believed to be undefined), and so the device was instructed to clear the whole buffer (including the stencil buffer), instead of correctly using a quad to clear just the depth, leaving the stencil alone. This has been fixed similarly to the way the i915 driver handles the surface status: during cell_clear_surface(), the status is set to PIPE_SURFACE_STATUS_DEFINED. Then a partial buffer clear is handled with a quad, as expected. Note that we are *not* using PIPE_SURFACE_STATUS_CLEAR (also similar to the i915); technically, we should be setting the surface status to CLEAR on a clear, and to DEFINED when we actually draw something (say on cell_vbuf_draw()), but it's difficult to figure out exactly which surfaces are affected by a cell_vbuf_draw(), so for now we're doing the easy thing.
- The fragment ops handling was very clever about only pulling out the parts of the Z/stencil buffer that it needed for calculations; but this failed when only part of the buffer was written, because the part that was never pulled out was inadvertently cleared. Now all the data from the combined Z/stencil buffer is pulled out, just so the proper values can be recombined later and written back to the buffer correctly. As a bonus, the fragment op code generation is simplified. --- src/gallium/drivers/cell/common.h | 2 +- src/gallium/drivers/cell/ppu/cell_clear.c | 13 ++ src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 153 ++++++++++------------- 3 files changed, 80 insertions(+), 88 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index 23fb0b0831..87488ea2d7 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -122,7 +122,7 @@ #define CELL_DEBUG_CACHE (1 << 6) /** Max instructions for doing per-fragment operations */ -#define SPU_MAX_FRAGMENT_OPS_INSTS 64 +#define SPU_MAX_FRAGMENT_OPS_INSTS 128 diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c index c9c0c721bb..037635e466 100644 --- a/src/gallium/drivers/cell/ppu/cell_clear.c +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, clr->surface = surfIndex; clr->value = clearValue; } + + /* Technically, the surface's contents are now known and cleared, + * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But + * it turns out it's quite painful to recognize when any particular + * surface goes from PIPE_SURFACE_STATUS_CLEAR to + * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because + * the drawing commands could be operating on numerous draw buffers, + * which we'd have to iterate through to set all their stati... 
+ * For now, we cheat a bit and set the surface's status to DEFINED + * right here. Later we should revisit this and set the status to + * CLEAR here, and find a better place to set the status to DEFINED. + */ + ps->status = PIPE_SURFACE_STATUS_DEFINED; } diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 6e2a5d2980..d9c3ff3f4d 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1997,81 +1997,79 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) * Z and/or stencil. We'll also convert the incoming fragment Z * value in fragZ_reg from a floating point value in [0.0..1.0] to * an unsigned integer value with the appropriate resolution. + * Note that even if depth or stencil is *not* enabled, if it's + * present in the buffer, we pull it out and put it back later; + * otherwise, we can inadvertently destroy the contents of + * buffers we're not supposed to touch (e.g., if the user is + * clearing the depth buffer but not the stencil buffer, a + * quad of constant depth is drawn over the surface; the stencil + * buffer must be maintained). 
*/ switch(zs_format) { case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ case PIPE_FORMAT_X8Z24_UNORM: - if (dsa->depth.enabled) { - /* We need the Z part at least */ - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* four 24-bit Z values in the low-order bits */ - spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - } - if (dsa->stencil[0].enabled) { - setup_optional_register(f, &fbS_reg_set, &fbS_reg); - /* four 8-bit Z values in the high-order bits */ - spe_rotmi(f, fbS_reg, fbZS_reg, -24); - } - break; + /* Pull out both Z and stencil */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + + /* four 24-bit Z values in the low-order bits */ + spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* four 8-bit stencil values in the high-order bits */ + spe_rotmi(f, fbS_reg, fbZS_reg, -24); + break; case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ case PIPE_FORMAT_Z24X8_UNORM: - if (dsa->depth.enabled) { - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* shift by 8 to get the upper 24-bit values */ - spe_rotmi(f, fbS_reg, fbZS_reg, -8); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - } - if (dsa->stencil[0].enabled) { - setup_optional_register(f, &fbS_reg_set, &fbS_reg); - /* 8-bit stencil in the low-order bits - mask them out */ - spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); - } - break; + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, 
&fbS_reg); + + /* shift by 8 to get the upper 24-bit values */ + spe_rotmi(f, fbS_reg, fbZS_reg, -8); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* 8-bit stencil in the low-order bits - mask them out */ + spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); + break; case PIPE_FORMAT_Z32_UNORM: - if (dsa->depth.enabled) { - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* Copy over 4 32-bit values */ - spe_move(f, fbZ_reg, fbZS_reg); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 32-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - } + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 32-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); /* No stencil, so can't do anything there */ - break; + break; case PIPE_FORMAT_Z16_UNORM: - if (dsa->depth.enabled) { - /* XXX Not sure this is correct, but it was here before, so we're - * going with it for now - */ - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* Copy over 4 32-bit values */ - spe_move(f, fbZ_reg, fbZS_reg); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 16-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -16); - } + /* XXX Not sure this is correct, but it was here before, so we're + * going with it for now + */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 16-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); /* No stencil */ - break; default: ASSERT(0); /* 
invalid format */ @@ -2118,39 +2116,19 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_comment(f, 0, "Store quad's depth/stencil values in tile"); if (zs_format == PIPE_FORMAT_S8Z24_UNORM || zs_format == PIPE_FORMAT_X8Z24_UNORM) { - if (fbS_reg_set && fbZ_reg_set) { - spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ - } - else if (fbS_reg_set) { - spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - } - else { - spe_move(f, fbZS_reg, fbZ_reg); - } + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || zs_format == PIPE_FORMAT_Z24X8_UNORM) { - if (fbS_reg_set && fbZ_reg_set) { - spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ - } - else if (fbS_reg_set) { - spe_move(f, fbZS_reg, fbS_reg); - } - else { - spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ - } + spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } else if (zs_format == PIPE_FORMAT_Z32_UNORM) { - if (fbZ_reg_set) { - spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ - } + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_Z16_UNORM) { - if (fbZ_reg_set) { - spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ - } + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_S8_UNORM) { ASSERT(0); /* XXX to do */ @@ -2163,6 +2141,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); } + /* Don't need these any more */ release_optional_register(f, &fbZ_reg_set, fbZ_reg); release_optional_register(f, &fbS_reg_set, fbS_reg); } -- cgit v1.2.3