diff options
Diffstat (limited to 'src/gallium/drivers/cell/spu')
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_command.c | 152 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_dcache.c | 4 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_debug.h | 60 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_funcs.c | 34 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_main.c | 9 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_main.h | 50 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 122 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_render.c | 34 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_texture.c | 208 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_texture.h | 34 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_tri.c | 162 | ||||
-rw-r--r-- | src/gallium/drivers/cell/spu/spu_tri.h | 2 |
12 files changed, 487 insertions, 384 deletions
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c index c28677ebf8..a6ed29ea63 100644 --- a/src/gallium/drivers/cell/spu/spu_command.c +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -44,7 +44,6 @@ #include "spu_tile.h" #include "spu_vertex_shader.h" #include "spu_dcache.h" -#include "spu_debug.h" #include "cell/common.h" @@ -77,9 +76,10 @@ static void release_buffer(uint buffer) { /* Evidently, using less than a 16-byte status doesn't work reliably */ - static const uint status[4] ALIGN16_ATTRIB - = {CELL_BUFFER_STATUS_FREE, 0, 0, 0}; - + static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE}; const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); uint *dst = spu.init.buffer_status + index; @@ -94,10 +94,33 @@ release_buffer(uint buffer) } +/** + * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory. + * There's a qword of status per SPU. + */ +static void +cmd_fence(struct cell_command_fence *fence_cmd) +{ + static const vector unsigned int status = {CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED}; + uint *dst = (uint *) fence_cmd->fence; + dst += 4 * spu.init.id; /* main store/memory address, not local store */ + + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_FENCE, /* tag */ + 0, /* tid */ + 0 /* rid */); +} + + static void cmd_clear_surface(const struct cell_command_clear_surface *clear) { - DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); if (clear->surface == 0) { spu.fb.color_clear_value = clear->value; @@ -165,14 +188,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear) #endif /* CLEAR_OPT */ - DEBUG_PRINTF("CLEAR SURF done\n"); + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n"); } static void cmd_release_verts(const struct cell_command_release_verts *release) { - DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf); + D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf); ASSERT(release->vertex_buf != ~0U); release_buffer(release->vertex_buf); } @@ -189,12 +212,13 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) { static int warned = 0; - DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n"); + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n"); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); /* Copy state info (for fallback case only) */ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); + memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color)); /* Parity twist! For now, always use the fallback code by default, * only switching to codegen when specifically requested. This @@ -228,7 +252,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) static void cmd_state_fragment_program(const struct cell_command_fragment_program *fp) { - DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n"); + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n"); /* Copy SPU code from batch buffer to spu buffer */ memcpy(spu.fragment_program_code, fp->code, SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); @@ -246,10 +270,11 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos) const float *constants = (const float *) &buffer[pos + 2]; uint i; - DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const); + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const); /* Expand each float to float[4] for SOA execution */ for (i = 0; i < num_const; i++) { + D_PRINTF(CELL_DEBUG_CMD, " const[%u] = %f\n", i, constants[i]); spu.constants[i] = spu_splats(constants[i]); } @@ -261,7 +286,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos) static void cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) { - DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", + D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", cmd->width, cmd->height, cmd->color_start, @@ -309,8 +334,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) */ static void update_tex_masks(struct spu_texture *texture, - const struct pipe_sampler_state *sampler, - uint unit) + const struct pipe_sampler_state *sampler) { uint i; @@ -337,11 +361,6 @@ update_tex_masks(struct spu_texture *texture, texture->level[i].scale_t = spu_splats(1.0f); } } - - /* XXX temporary hack */ - if (texture->target == PIPE_TEXTURE_CUBE) { - spu.sample_texture4[unit] = sample_texture4_cube; - } } @@ -350,18 +369,18 @@ cmd_state_sampler(const struct cell_command_sampler *sampler) { uint unit = sampler->unit; - DEBUG_PRINTF("SAMPLER [%u]\n", unit); + D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit); spu.sampler[unit] = sampler->state; switch (spu.sampler[unit].min_img_filter) { case PIPE_TEX_FILTER_LINEAR: - spu.min_sample_texture4[unit] = sample_texture4_bilinear; + spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear; break; case PIPE_TEX_FILTER_ANISO: /* fall-through, for now */ case PIPE_TEX_FILTER_NEAREST: - spu.min_sample_texture4[unit] = sample_texture4_nearest; + spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest; break; default: ASSERT(0); @@ -369,12 +388,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler) switch (spu.sampler[sampler->unit].mag_img_filter) { case PIPE_TEX_FILTER_LINEAR: - spu.mag_sample_texture4[unit] = sample_texture4_bilinear; + spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear; break; case PIPE_TEX_FILTER_ANISO: /* fall-through, for now */ case PIPE_TEX_FILTER_NEAREST: - spu.mag_sample_texture4[unit] = sample_texture4_nearest; + spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest; break; default: ASSERT(0); @@ -383,16 +402,16 @@ cmd_state_sampler(const struct cell_command_sampler *sampler) switch (spu.sampler[sampler->unit].min_mip_filter) { case PIPE_TEX_MIPFILTER_NEAREST: case PIPE_TEX_MIPFILTER_LINEAR: - spu.sample_texture4[unit] = sample_texture4_lod; + spu.sample_texture_2d[unit] = sample_texture_2d_lod; break; case PIPE_TEX_MIPFILTER_NONE: - spu.sample_texture4[unit] = spu.mag_sample_texture4[unit]; + spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit]; break; default: ASSERT(0); } - update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit); + update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); } @@ -402,9 +421,7 @@ cmd_state_texture(const struct cell_command_texture *texture) const uint unit = texture->unit; uint i; - //if (spu.init.id==0) Debug=1; - - DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit); + D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit); spu.texture[unit].max_level = 0; spu.texture[unit].target = texture->target; @@ -414,7 +431,7 @@ cmd_state_texture(const struct cell_command_texture *texture) uint height = texture->height[i]; uint depth = texture->depth[i]; - DEBUG_PRINTF(" LEVEL %u: at %p size[0] %u x %u\n", i, + D_PRINTF(CELL_DEBUG_CMD, " LEVEL %u: at %p size[0] %u x %u\n", i, texture->start[i], texture->width[i], texture->height[i]); spu.texture[unit].level[i].start = texture->start[i]; @@ -435,16 +452,14 @@ cmd_state_texture(const struct cell_command_texture *texture) spu.texture[unit].max_level = i; } - update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit); - - //Debug=0; + update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); } static void cmd_state_vertex_info(const struct vertex_info *vinfo) { - DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); + D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); ASSERT(vinfo->num_attribs >= 1); ASSERT(vinfo->num_attribs <= 8); memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); @@ -483,7 +498,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) static void cmd_finish(void) { - DEBUG_PRINTF("FINISH\n"); + D_PRINTF(CELL_DEBUG_CMD, "FINISH\n"); really_clear_tiles(0); /* wait for all outstanding DMAs to finish */ mfc_write_tag_mask(~0); @@ -508,7 +523,7 @@ cmd_batch(uint opcode) const unsigned usize = size / sizeof(buffer[0]); uint pos; - DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n", + D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n", buf, size, spu.init.buffers[buf]); ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); @@ -528,7 +543,7 @@ cmd_batch(uint opcode) wait_on_mask(1 << TAG_BATCH_BUFFER); /* Tell PPU we're done copying the buffer to local store */ - DEBUG_PRINTF("release batch buf %u\n", buf); + D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf); release_buffer(buf); /* @@ -586,6 +601,14 @@ cmd_batch(uint opcode) case CELL_CMD_STATE_FS_CONSTANTS: pos = cmd_state_fs_constants(buffer, pos); break; + case CELL_CMD_STATE_RASTERIZER: + { + struct cell_command_rasterizer *rast = + (struct cell_command_rasterizer *) &buffer[pos]; + spu.rasterizer = rast->rasterizer; + pos += sizeof(*rast) / 8; + } + break; case CELL_CMD_STATE_SAMPLER: { struct cell_command_sampler *sampler @@ -638,6 +661,14 @@ cmd_batch(uint opcode) cmd_finish(); pos += 1; break; + case CELL_CMD_FENCE: + { + struct cell_command_fence *fence_cmd = + (struct cell_command_fence *) &buffer[pos]; + cmd_fence(fence_cmd); + pos += sizeof(*fence_cmd) / 8; + } + break; case CELL_CMD_RELEASE_VERTS: { struct cell_command_release_verts *release @@ -661,10 +692,12 @@ cmd_batch(uint opcode) } } - DEBUG_PRINTF("BATCH complete\n"); + D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n"); } +#define PERF 0 + /** * Main loop for SPEs: Get a command, execute it, repeat. @@ -672,41 +705,29 @@ cmd_batch(uint opcode) void command_loop(void) { - struct cell_command cmd; int exitFlag = 0; + uint t0, t1; - DEBUG_PRINTF("Enter command loop\n"); - - ASSERT((sizeof(struct cell_command) & 0xf) == 0); - ASSERT_ALIGN16(&cmd); + D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n"); while (!exitFlag) { unsigned opcode; - int tag = 0; - DEBUG_PRINTF("Wait for cmd...\n"); + D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n"); + + if (PERF) + spu_write_decrementer(~0); /* read/wait from mailbox */ opcode = (unsigned int) spu_read_in_mbox(); + D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode); - DEBUG_PRINTF("got cmd 0x%x\n", opcode); - - /* command payload */ - mfc_get(&cmd, /* dest */ - (unsigned int) spu.init.cmd, /* src */ - sizeof(struct cell_command), /* bytes */ - tag, - 0, /* tid */ - 0 /* rid */); - wait_on_mask( 1 << tag ); - - /* - * NOTE: most commands should be contained in a batch buffer - */ + if (PERF) + t0 = spu_read_decrementer(); switch (opcode & CELL_CMD_OPCODE_MASK) { case CELL_CMD_EXIT: - DEBUG_PRINTF("EXIT\n"); + D_PRINTF(CELL_DEBUG_CMD, "EXIT\n"); exitFlag = 1; break; case CELL_CMD_VS_EXECUTE: @@ -721,9 +742,16 @@ command_loop(void) printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK); } + if (PERF) { + t1 = spu_read_decrementer(); + printf("wait mbox time: %gms batch time: %gms\n", + (~0u - t0) * spu.init.inv_timebase, + (t0 - t1) * spu.init.inv_timebase); + } } - DEBUG_PRINTF("Exit command loop\n"); + D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n"); - spu_dcache_report(); + if (spu.init.debug_flags & CELL_DEBUG_CACHE) + spu_dcache_report(); } diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c index 167404cdc5..a6d67634fd 100644 --- a/src/gallium/drivers/cell/spu/spu_dcache.c +++ b/src/gallium/drivers/cell/spu/spu_dcache.c @@ -36,7 +36,9 @@ #define CACHE_SET_TAGID(set) (((set) & 0x03) + TAG_DCACHE0) #define CACHE_LOG2NNWAY 2 #define CACHE_LOG2NSETS 6 -/*#define CACHE_STATS 1*/ +#ifdef DEBUG +#define CACHE_STATS 1 +#endif #include <cache-api.h> /* Yes folks, this is ugly. diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h deleted file mode 100644 index eeec052655..0000000000 --- a/src/gallium/drivers/cell/spu/spu_debug.h +++ /dev/null @@ -1,60 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#ifndef SPU_DEBUG_H -#define SPU_DEBUG_H - - -/* Set to 0 to disable all extraneous debugging code */ -#define DEBUG 1 - -#if DEBUG -extern boolean Debug; -extern boolean force_fragment_ops_fallback; - -/* These debug macros use the unusual construction ", ##__VA_ARGS__" - * which expands to the expected comma + args if variadic arguments - * are supplied, but swallows the comma if there are no variadic - * arguments (which avoids syntax errors that would otherwise occur). - */ -#define DEBUG_PRINTF(format,...) \ - if (Debug) \ - printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) -#define D_PRINTF(flag, format,...) \ - if (spu.init.debug_flags & (flag)) \ - printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) - -#else - -#define DEBUG_PRINTF(...) -#define D_PRINTF(...) - -#endif - - -#endif /* SPU_DEBUG_H */ diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c index 5c3ee305d4..3534b35000 100644 --- a/src/gallium/drivers/cell/spu/spu_funcs.c +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -43,6 +43,7 @@ #include "cell/common.h" #include "spu_main.h" #include "spu_funcs.h" +#include "spu_texture.h" /** For "return"-ing four vectors */ @@ -102,11 +103,34 @@ spu_log2(vector float x) static struct vec_4x4 -spu_txp(vector float s, vector float t, vector float r, vector float q, - unsigned unit) +spu_tex_2d(vector float s, vector float t, vector float r, vector float q, + unsigned unit) { struct vec_4x4 colors; - spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v); + (void) r; + (void) q; + spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); + return colors; +} + +static struct vec_4x4 +spu_tex_3d(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) r; + (void) q; + spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); + return colors; +} + +static struct vec_4x4 +spu_tex_cube(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) q; + sample_texture_cube(s, t, r, unit, colors.v); return colors; } @@ -147,7 +171,9 @@ return_function_info(void) export_func(&funcs, "spu_pow", &spu_pow); export_func(&funcs, "spu_exp2", &spu_exp2); export_func(&funcs, "spu_log2", &spu_log2); - export_func(&funcs, "spu_txp", &spu_txp); + export_func(&funcs, "spu_tex_2d", &spu_tex_2d); + export_func(&funcs, "spu_tex_3d", &spu_tex_3d); + export_func(&funcs, "spu_tex_cube", &spu_tex_cube); /* Send the function info back to the PPU / main memory */ mfc_put((void *) &funcs, /* src in local store */ diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 4becd0f92a..c8bb251905 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -40,7 +40,6 @@ #include "spu_per_fragment_op.h" #include "spu_texture.h" //#include "spu_test.h" -#include "spu_debug.h" #include "cell/common.h" @@ -53,12 +52,6 @@ helpful headers: struct spu_global spu; -#if DEBUG -boolean Debug = FALSE; -boolean force_fragment_ops_fallback = TRUE; -#endif - - static void one_time_init(void) { @@ -102,7 +95,7 @@ main(main_param_t speid, main_param_t argp) one_time_init(); - DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid); + D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid); D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); /* get initialization data */ diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index eff43b870c..668af10be2 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -36,12 +36,18 @@ #include "pipe/p_state.h" - -#define MAX_WIDTH 1024 -#define MAX_HEIGHT 1024 - - -#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */ +#if DEBUG +/* These debug macros use the unusual construction ", ##__VA_ARGS__" + * which expands to the expected comma + args if variadic arguments + * are supplied, but swallows the comma if there are no variadic + * arguments (which avoids syntax errors that would otherwise occur). + */ +#define D_PRINTF(flag, format,...) \ + if (spu.init.debug_flags & (flag)) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) +#else +#define D_PRINTF(...) +#endif /** @@ -64,12 +70,10 @@ typedef union { /** Function for sampling textures */ -typedef void (*spu_sample_texture4_func)(vector float s, - vector float t, - vector float r, - vector float q, - uint unit, uint level, uint face, - vector float colors[4]); +typedef void (*spu_sample_texture_2d_func)(vector float s, + vector float t, + uint unit, uint level, uint face, + vector float colors[4]); /** Function for performing per-fragment ops */ @@ -85,9 +89,9 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y, uint facing); /** Function for running fragment program */ -typedef void (*spu_fragment_program_func)(vector float *inputs, - vector float *outputs, - vector float *constants); +typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs, + vector float *outputs, + vector float *constants); struct spu_framebuffer @@ -145,7 +149,9 @@ struct spu_global struct spu_framebuffer fb; struct pipe_depth_stencil_alpha_state depth_stencil_alpha; struct pipe_blend_state blend; + struct pipe_blend_color blend_color; struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; + struct pipe_rasterizer_state rasterizer; struct spu_texture texture[PIPE_MAX_SAMPLERS]; struct vertex_info vertex_info; @@ -161,8 +167,8 @@ struct spu_global ubyte cur_ctile_status, cur_ztile_status; /** Status of all tiles in framebuffer */ - ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; - ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; /** Current fragment ops machine code, at 8-byte boundary */ uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB; @@ -175,9 +181,9 @@ struct spu_global spu_fragment_program_func fragment_program; /** Current texture sampler function */ - spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS]; - spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS]; - spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS]; /** Fragment program constants */ vector float constants[4 * CELL_MAX_CONSTANTS]; @@ -186,8 +192,6 @@ struct spu_global extern struct spu_global spu; -extern boolean Debug; - @@ -206,7 +210,7 @@ extern boolean Debug; #define TAG_DCACHE1 21 #define TAG_DCACHE2 22 #define TAG_DCACHE3 23 - +#define TAG_FENCE 24 static INLINE void diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index d252fa6dc1..f8ffc70492 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -40,6 +40,24 @@ #define LINEAR_QUAD_LAYOUT 1 +static INLINE vector float +spu_min(vector float a, vector float b) +{ + vector unsigned int m; + m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */ + return spu_sel(a, b, m); +} + + +static INLINE vector float +spu_max(vector float a, vector float b) +{ + vector unsigned int m; + m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */ + return spu_sel(b, a, m); +} + + /** * Called by rasterizer for each quad after the shader has run. Do * all the per-fragment operations including alpha test, z test, @@ -242,7 +260,7 @@ spu_fallback_fragment_ops(uint x, uint y, } /* - * Compute Src RGB terms + * Compute Src RGB terms (fragment color * factor) */ switch (spu.blend.rgb_src_factor) { case PIPE_BLENDFACTOR_ONE: @@ -265,13 +283,33 @@ spu_fallback_fragment_ops(uint x, uint y, term1g = spu_mul(fragG, fragA); term1b = spu_mul(fragB, fragA); break; + case PIPE_BLENDFACTOR_DST_COLOR: + term1r = spu_mul(fragR, fbRGBA[0]); + term1g = spu_mul(fragG, fbRGBA[1]); + term1b = spu_mul(fragB, fbRGBA[1]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term1r = spu_mul(fragR, fbRGBA[3]); + term1g = spu_mul(fragG, fbRGBA[3]); + term1b = spu_mul(fragB, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3])); + break; /* XXX more cases */ default: ASSERT(0); } /* - * Compute Src Alpha term + * Compute Src Alpha term (fragment alpha * factor) */ switch (spu.blend.alpha_src_factor) { case PIPE_BLENDFACTOR_ONE: @@ -283,19 +321,29 @@ spu_fallback_fragment_ops(uint x, uint y, case PIPE_BLENDFACTOR_SRC_ALPHA: term1a = spu_mul(fragA, fragA); break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term1a = spu_mul(fragA, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); + break; /* XXX more cases */ default: ASSERT(0); } /* - * Compute Dest RGB terms + * Compute Dest RGB terms (framebuffer color * factor) */ switch (spu.blend.rgb_dst_factor) { case PIPE_BLENDFACTOR_ONE: - term2r = fragR; - term2g = fragG; - term2b = fragB; + term2r = fbRGBA[0]; + term2g = fbRGBA[1]; + term2b = fbRGBA[2]; break; case PIPE_BLENDFACTOR_ZERO: term2r = @@ -319,17 +367,37 @@ spu_fallback_fragment_ops(uint x, uint y, term2g = spu_mul(fbRGBA[1], tmp); term2b = spu_mul(fbRGBA[2], tmp); break; - /* XXX more cases */ + case PIPE_BLENDFACTOR_DST_COLOR: + term2r = spu_mul(fbRGBA[0], fbRGBA[0]); + term2g = spu_mul(fbRGBA[1], fbRGBA[1]); + term2b = spu_mul(fbRGBA[2], fbRGBA[2]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term2r = spu_mul(fbRGBA[0], fbRGBA[3]); + term2g = spu_mul(fbRGBA[1], fbRGBA[3]); + term2b = spu_mul(fbRGBA[2], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ default: ASSERT(0); } /* - * Compute Dest Alpha term + * Compute Dest Alpha term (framebuffer alpha * factor) */ switch (spu.blend.alpha_dst_factor) { case PIPE_BLENDFACTOR_ONE: - term2a = fragA; + term2a = fbRGBA[3]; break; case PIPE_BLENDFACTOR_SRC_COLOR: term2a = spu_splats(0.0f); @@ -342,6 +410,16 @@ spu_fallback_fragment_ops(uint x, uint y, tmp = spu_sub(one, fragA); term2a = spu_mul(fbRGBA[3], tmp); break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term2a = spu_mul(fbRGBA[3], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3])); + break; /* XXX more cases */ default: ASSERT(0); @@ -361,7 +439,21 @@ spu_fallback_fragment_ops(uint x, uint y, fragG = spu_sub(term1g, term2g); fragB = spu_sub(term1b, term2b); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + fragR = spu_sub(term2r, term1r); + fragG = spu_sub(term2g, term1g); + fragB = spu_sub(term2b, term1b); + break; + case PIPE_BLEND_MIN: + fragR = spu_min(term1r, term2r); + fragG = spu_min(term1g, term2g); + fragB = spu_min(term1b, term2b); + break; + case PIPE_BLEND_MAX: + fragR = spu_max(term1r, term2r); + fragG = spu_max(term1g, term2g); + fragB = spu_max(term1b, term2b); + break; default: ASSERT(0); } @@ -376,7 +468,15 @@ spu_fallback_fragment_ops(uint x, uint y, case PIPE_BLEND_SUBTRACT: fragA = spu_sub(term1a, term2a); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + fragA = spu_sub(term2a, term1a); + break; + case PIPE_BLEND_MIN: + fragA = spu_min(term1a, term2a); + break; + case PIPE_BLEND_MAX: + fragA = spu_max(term1a, term2a); + break; default: ASSERT(0); } diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c index 82dbeb26b7..5515bb55c9 100644 --- a/src/gallium/drivers/cell/spu/spu_render.c +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -175,22 +175,14 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) const ubyte *vertices; const ushort *indexes; uint i, j; + uint num_tiles; - - if (Debug) { - printf("SPU %u: RENDER prim %u, num_vert=%u num_ind=%u " - "inline_vert=%u\n", - spu.init.id, - render->prim_type, - render->num_verts, - render->num_indexes, - render->inline_verts); - - /* - printf(" bound: %g, %g .. %g, %g\n", - render->xmin, render->ymin, render->xmax, render->ymax); - */ - } + D_PRINTF(CELL_DEBUG_CMD, + "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n", + render->prim_type, + render->num_verts, + render->num_indexes, + render->inline_verts); ASSERT(sizeof(*render) % 4 == 0); ASSERT(total_vertex_bytes % 16 == 0); @@ -251,6 +243,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */ + num_tiles = 0; + /** ** loop over tiles, rendering tris **/ @@ -264,6 +258,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) if (!my_tile(tx, ty)) continue; + num_tiles++; + spu.cur_ctile_status = spu.ctile_status[ty][tx]; spu.cur_ztile_status = spu.ztile_status[ty][tx]; @@ -279,7 +275,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) v1 = (const float *) (vertices + indexes[j+1] * vertex_size); v2 = (const float *) (vertices + indexes[j+2] * vertex_size); - drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding); + drawn += tri_draw(v0, v1, v2, tx, ty); } //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3); @@ -293,7 +289,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) spu.ztile_status[ty][tx] = spu.cur_ztile_status; } - if (Debug) - printf("SPU %u: RENDER done\n", - spu.init.id); + D_PRINTF(CELL_DEBUG_CMD, + "RENDER done (%u tiles hit)\n", + num_tiles); } diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index 42eb06a362..69784c8978 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -72,10 +72,10 @@ invalidate_tex_cache(void) * a time. */ static void -get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y, +get_four_texels(const struct spu_texture_level *tlevel, uint face, + vec_int4 x, vec_int4 y, vec_uint4 *texels) { - const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; unsigned texture_ea = (uintptr_t) tlevel->start; const vec_int4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */ const vec_int4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */ @@ -126,10 +126,9 @@ spu_clamp(vector signed int vec, vector signed int max) * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa). */ void -sample_texture4_nearest(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level, uint face, - vector float colors[4]) +sample_texture_2d_nearest(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) { const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; vector float ss = spu_mul(s, tlevel->scale_s); @@ -146,7 +145,7 @@ sample_texture4_nearest(vector float s, vector float t, is = spu_clamp(is, tlevel->max_s); it = spu_clamp(it, tlevel->max_t); - get_four_texels(unit, level, face, is, it, texels); + get_four_texels(tlevel, face, is, it, texels); /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */ spu_unpack_A8R8G8B8_transpose4(texels, colors); @@ -158,10 +157,9 @@ sample_texture4_nearest(vector float s, vector float t, * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa). */ void -sample_texture4_bilinear(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level, uint face, - vector float colors[4]) +sample_texture_2d_bilinear(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) { const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; @@ -190,14 +188,10 @@ sample_texture4_bilinear(vector float s, vector float t, /* get packed int texels */ vector unsigned int texels[16]; - get_four_texels(unit, level, face, is0, it0, texels + 0); /* upper-left */ - get_four_texels(unit, level, face, is1, it0, texels + 4); /* upper-right */ - get_four_texels(unit, level, face, is0, it1, texels + 8); /* lower-left */ - get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */ - - /* XXX possibly rework following code to compute the weighted sample - * colors with integer arithmetic for fewer int->float conversions. - */ + get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */ + get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ /* convert packed int texels to float colors */ vector float ftexels[16]; @@ -305,13 +299,13 @@ transpose(vector unsigned int *mOut0, /** - * Bilinear filtering, using int intead of float arithmetic + * Bilinear filtering, using int instead of float arithmetic for computing + * sample weights. */ void -sample_texture4_bilinear_2(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level, uint face, - vector float colors[4]) +sample_texture_2d_bilinear_int(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) { const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; @@ -320,19 +314,19 @@ sample_texture4_bilinear_2(vector float s, vector float t, vector float ss = spu_madd(s, tlevel->scale_s, half); vector float tt = spu_madd(t, tlevel->scale_t, half); - /* convert float coords to fixed-pt coords with 8 fraction bits */ - vector signed int is = spu_convts(ss, 8); - vector signed int it = spu_convts(tt, 8); + /* convert float coords to fixed-pt coords with 7 fraction bits */ + vector signed int is = spu_convts(ss, 7); /* XXX really need floor() here */ + vector signed int it = spu_convts(tt, 7); /* XXX really need floor() here */ - /* compute integer texel weights in [0, 255] */ - vector signed int sWeights0 = spu_and(is, 255); - vector signed int tWeights0 = spu_and(it, 255); - vector signed int sWeights1 = spu_sub(255, sWeights0); - vector signed int tWeights1 = spu_sub(255, tWeights0); + /* compute integer texel weights in [0, 127] */ + vector signed int sWeights0 = spu_and(is, 127); + vector signed int tWeights0 = spu_and(it, 127); + vector signed int sWeights1 = spu_sub(127, sWeights0); + vector signed int tWeights1 = spu_sub(127, tWeights0); - /* texel coords: is0 = is / 256, it0 = is / 256 */ - vector signed int is0 = spu_rlmask(is, -8); - vector signed int it0 = spu_rlmask(it, -8); + /* texel coords: is0 = is / 128, it0 = is / 128 */ + vector signed int is0 = spu_rlmask(is, -7); + vector signed int it0 = spu_rlmask(it, -7); /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */ vector signed int is1 = spu_add(is0, 1); @@ -352,10 +346,10 @@ sample_texture4_bilinear_2(vector float s, vector float t, /* get packed int texels */ vector unsigned int texels[16]; - get_four_texels(unit, level, face, is0, it0, texels + 0); /* upper-left */ - get_four_texels(unit, level, face, is1, it0, texels + 4); /* upper-right */ - get_four_texels(unit, level, face, is0, it1, texels + 8); /* lower-left */ - get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */ + get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */ + get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */ { @@ -383,36 +377,36 @@ sample_texture4_bilinear_2(vector float s, vector float t, vector unsigned int c0, c1, c2, c3, cSum; /* red */ - c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/ - c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/ - c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/ - c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/ + c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); - colors[0] = spu_convtf(cSum, 24); + colors[0] = spu_convtf(cSum, 22); /* green */ - c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/ - c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/ - c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/ - c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/ + c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); - colors[1] = spu_convtf(cSum, 24); + colors[1] = spu_convtf(cSum, 22); /* blue */ - c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/ - c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/ - c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/ - c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/ + c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); - colors[2] = spu_convtf(cSum, 24); + colors[2] = spu_convtf(cSum, 22); /* alpha */ - c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/ - c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/ - c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/ - c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/ + c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); - colors[3] = spu_convtf(cSum, 24); + colors[3] = spu_convtf(cSum, 22); } @@ -420,8 +414,8 @@ sample_texture4_bilinear_2(vector float s, vector float t, /** * Compute level of detail factor from texcoords. */ -static float -compute_lambda(uint unit, vector float s, vector float t) +static INLINE float +compute_lambda_2d(uint unit, vector float s, vector float t) { uint baseLevel = 0; float width = spu.texture[unit].level[baseLevel].width; @@ -430,30 +424,60 @@ compute_lambda(uint unit, vector float s, vector float t) float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0)); float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0)); float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0)); +#if 0 + /* ideal value */ float x = dsdx * dsdx + dtdx * dtdx; float y = dsdy * dsdy + dtdy * dtdy; float rho = x > y ? x : y; rho = sqrtf(rho); - float lambda = logf(rho) * 1.442695f; +#else + /* approximation */ + dsdx = fabsf(dsdx); + dsdy = fabsf(dsdy); + dtdx = fabsf(dtdx); + dtdy = fabsf(dtdy); + float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5; +#endif + float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */ return lambda; } +/** + * Blend two sets of colors according to weight. + */ +static void +blend_colors(vector float c0[4], const vector float c1[4], float weight) +{ + vector float t = spu_splats(weight); + vector float dc0 = spu_sub(c1[0], c0[0]); + vector float dc1 = spu_sub(c1[1], c0[1]); + vector float dc2 = spu_sub(c1[2], c0[2]); + vector float dc3 = spu_sub(c1[3], c0[3]); + c0[0] = spu_madd(dc0, t, c0[0]); + c0[1] = spu_madd(dc1, t, c0[1]); + c0[2] = spu_madd(dc2, t, c0[2]); + c0[3] = spu_madd(dc3, t, c0[3]); +} + /** - * Texture sampling with level of detail selection. + * Texture sampling with level of detail selection and possibly mipmap + * interpolation. */ void -sample_texture4_lod(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level_ignored, uint face, - vector float colors[4]) +sample_texture_2d_lod(vector float s, vector float t, + uint unit, uint level_ignored, uint face, + vector float colors[4]) { /* * Note that we're computing a lambda/lod here that's used for all * four pixels in the quad. */ - float lambda = compute_lambda(unit, s, t); + float lambda = compute_lambda_2d(unit, s, t); + + (void) face; + (void) level_ignored; /* apply lod bias */ lambda += spu.sampler[unit].lod_bias; @@ -466,15 +490,34 @@ sample_texture4_lod(vector float s, vector float t, if (lambda <= 0.0f) { /* magnify */ - spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors); + spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors); } else { /* minify */ - int level = (int) (lambda + 0.5f); - if (level > (int) spu.texture[unit].max_level) - level = spu.texture[unit].max_level; - spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors); - /* XXX to do: mipmap level interpolation */ + if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { + /* sample two mipmap levels and interpolate */ + int level = (int) lambda; + if (level > (int) spu.texture[unit].max_level) + level = spu.texture[unit].max_level; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); + if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { + /* sample second mipmap level */ + float weight = lambda - (float) level; + level++; + if (level <= (int) spu.texture[unit].max_level) { + vector float colors2[4]; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2); + blend_colors(colors, colors2, weight); + } + } + } + else { + /* sample one mipmap level */ + int level = (int) (lambda + 0.5f); + if (level > (int) spu.texture[unit].max_level) + level = spu.texture[unit].max_level; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); + } } } @@ -552,16 +595,13 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT) void -sample_texture4_cube(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level, uint face_ignored, - vector float colors[4]) +sample_texture_cube(vector float s, vector float t, vector float r, + uint unit, vector float colors[4]) { - static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f}; - uint p, faces[4]; + uint p, faces[4], level = 0; float newS[4], newT[4]; - /* Compute cube face referenced by the four sets of texcoords. + /* Compute cube faces referenced by the four sets of texcoords. * XXX we should SIMD-ize this. */ for (p = 0; p < 4; p++) { @@ -577,15 +617,15 @@ sample_texture4_cube(vector float s, vector float t, /* GOOD! All four texcoords refer to the same cube face */ s = (vector float) {newS[0], newS[1], newS[2], newS[3]}; t = (vector float) {newT[0], newT[1], newT[2], newT[3]}; - sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors); + spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors); } else { /* BAD! The four texcoords refer to different faces */ for (p = 0; p < 4; p++) { vector float c[4]; - sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]), - zero, zero, unit, level, faces[p], c); + spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]), + unit, level, faces[p], c); float red = spu_extract(c[0], p); float green = spu_extract(c[1], p); diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h index 387484c3ad..7b75b007b5 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.h +++ b/src/gallium/drivers/cell/spu/spu_texture.h @@ -37,37 +37,31 @@ invalidate_tex_cache(void); extern void -sample_texture4_nearest(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level, uint face, - vector float colors[4]); +sample_texture_2d_nearest(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); extern void -sample_texture4_bilinear(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level, uint face, - vector float colors[4]); - -extern void -sample_texture4_bilinear_2(vector float s, vector float t, - vector float r, vector float q, +sample_texture_2d_bilinear(vector float s, vector float t, uint unit, uint level, uint face, vector float colors[4]); +extern void +sample_texture_2d_bilinear_int(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + extern void -sample_texture4_lod(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level, uint face, - vector float colors[4]); +sample_texture_2d_lod(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); extern void -sample_texture4_cube(vector float s, vector float t, - vector float r, vector float q, - uint unit, uint level_ignored, uint face_ignored, - vector float colors[4]); +sample_texture_cube(vector float s, vector float t, vector float r, + uint unit, vector float colors[4]); #endif /* SPU_TEXTURE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 03f094373d..4caf7d6b61 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -43,11 +43,6 @@ /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ typedef vector unsigned int mask_t; -typedef union -{ - vector float v; - float f[4]; -} float4; /** @@ -91,9 +86,9 @@ struct edge { struct interp_coef { - float4 a0; - float4 dadx; - float4 dady; + vector float a0; + vector float dadx; + vector float dady; }; @@ -116,7 +111,7 @@ struct setup_stage { struct edge etop; struct edge emaj; - float oneOverArea; + float oneOverArea; /* XXX maybe make into vector? */ uint facing; @@ -152,14 +147,14 @@ eval_coeff(uint slot, float x, float y, vector float w, vector float result[4]) result[QUAD_TOP_LEFT] = result[QUAD_TOP_RIGHT] = result[QUAD_BOTTOM_LEFT] = - result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v; + result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0; break; case INTERP_LINEAR: { - vector float dadx = setup.coef[slot].dadx.v; - vector float dady = setup.coef[slot].dady.v; + vector float dadx = setup.coef[slot].dadx; + vector float dady = setup.coef[slot].dady; vector float topLeft = - spu_add(setup.coef[slot].a0.v, + spu_add(setup.coef[slot].a0, spu_add(spu_mul(spu_splats(x), dadx), spu_mul(spu_splats(y), dady))); @@ -171,10 +166,10 @@ eval_coeff(uint slot, float x, float y, vector float w, vector float result[4]) break; case INTERP_PERSPECTIVE: { - vector float dadx = setup.coef[slot].dadx.v; - vector float dady = setup.coef[slot].dady.v; + vector float dadx = setup.coef[slot].dadx; + vector float dady = setup.coef[slot].dady; vector float topLeft = - spu_add(setup.coef[slot].a0.v, + spu_add(setup.coef[slot].a0, spu_add(spu_mul(spu_splats(x), dadx), spu_mul(spu_splats(y), dady))); @@ -212,9 +207,9 @@ static INLINE vector float eval_z(float x, float y) { const uint slot = 0; - const float dzdx = setup.coef[slot].dadx.f[2]; - const float dzdy = setup.coef[slot].dady.f[2]; - const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy; + const float dzdx = spu_extract(setup.coef[slot].dadx, 2); + const float dzdy = spu_extract(setup.coef[slot].dady, 2); + const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy; const vector float topLeftv = spu_splats(topLeft); const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy }; return spu_add(topLeftv, derivs); @@ -226,9 +221,9 @@ static INLINE vector float eval_w(float x, float y) { const uint slot = 0; - const float dwdx = setup.coef[slot].dadx.f[3]; - const float dwdy = setup.coef[slot].dady.f[3]; - const float topLeft = setup.coef[slot].a0.f[3] + x * dwdx + y * dwdy; + const float dwdx = spu_extract(setup.coef[slot].dadx, 3); + const float dwdy = spu_extract(setup.coef[slot].dady, 3); + const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy; const vector float topLeftv = spu_splats(topLeft); const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy }; return spu_add(topLeftv, derivs); @@ -259,6 +254,7 @@ emit_quad( int x, int y, mask_t mask) vector float inputs[4*4], outputs[2*4]; vector float fragZ = eval_z((float) x, (float) y); vector float fragW = eval_w((float) x, (float) y); + vector unsigned int kill_mask; /* setup inputs */ #if 0 @@ -273,7 +269,9 @@ emit_quad( int x, int y, mask_t mask) ASSERT(spu.fragment_ops); /* Execute the current fragment program */ - spu.fragment_program(inputs, outputs, spu.constants); + kill_mask = spu.fragment_program(inputs, outputs, spu.constants); + + mask = spu_andc(mask, kill_mask); /* Execute per-fragment/quad operations, including: * alpha test, z test, stencil test, blend and framebuffer writing. @@ -404,30 +402,41 @@ flush_spans(void) static void print_vertex(const struct vertex_header *v) { - int i; - fprintf(stderr, "Vertex: (%p)\n", v); - for (i = 0; i < setup.quad.nr_attrs; i++) { - fprintf(stderr, " %d: %f %f %f %f\n", i, - v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]); + uint i; + fprintf(stderr, " Vertex: (%p)\n", v); + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + fprintf(stderr, " %d: %f %f %f %f\n", i, + spu_extract(v->data[i], 0), + spu_extract(v->data[i], 1), + spu_extract(v->data[i], 2), + spu_extract(v->data[i], 3)); } } #endif +/** + * Sort vertices from top to bottom. + * Compute area and determine front vs. back facing. + * Do coarse clip test against tile bounds + * \return FALSE if tri is totally outside tile, TRUE otherwise + */ static boolean setup_sort_vertices(const struct vertex_header *v0, const struct vertex_header *v1, const struct vertex_header *v2) { + float area, sign; + #if DEBUG_VERTS - fprintf(stderr, "Triangle:\n"); - print_vertex(v0); - print_vertex(v1); - print_vertex(v2); + if (spu.init.id==0) { + fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id); + print_vertex(v0); + print_vertex(v1); + print_vertex(v2); + } #endif - setup.vprovoke = v2; - /* determine bottom to top order of vertices */ { float y0 = spu_extract(v0->data[0], 1); @@ -439,18 +448,21 @@ setup_sort_vertices(const struct vertex_header *v0, setup.vmin = v0; setup.vmid = v1; setup.vmax = v2; + sign = -1.0f; } else if (y2 <= y0) { /* y2<=y0<=y1 */ setup.vmin = v2; setup.vmid = v0; setup.vmax = v1; + sign = -1.0f; } else { /* y0<=y2<=y1 */ setup.vmin = v0; setup.vmid = v2; setup.vmax = v1; + sign = 1.0f; } } else { @@ -459,18 +471,21 @@ setup_sort_vertices(const struct vertex_header *v0, setup.vmin = v1; setup.vmid = v0; setup.vmax = v2; + sign = 1.0f; } else if (y2 <= y1) { /* y2<=y1<=y0 */ setup.vmin = v2; setup.vmid = v1; setup.vmax = v0; + sign = 1.0f; } else { /* y1<=y2<=y0 */ setup.vmin = v1; setup.vmid = v2; setup.vmax = v0; + sign = -1.0f; } } } @@ -499,31 +514,16 @@ setup_sort_vertices(const struct vertex_header *v0, /* * Compute triangle's area. Use 1/area to compute partial * derivatives of attributes later. - * - * The area will be the same as prim->det, but the sign may be - * different depending on how the vertices get sorted above. - * - * To determine whether the primitive is front or back facing we - * use the prim->det value because its sign is correct. */ - { - const float area = (setup.emaj.dx * setup.ebot.dy - - setup.ebot.dx * setup.emaj.dy); - - setup.oneOverArea = 1.0f / area; - /* - _mesa_printf("%s one-over-area %f area %f det %f\n", - __FUNCTION__, setup.oneOverArea, area, prim->det ); - */ - } + area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy; -#if 0 - /* We need to know if this is a front or back-facing triangle for: - * - the GLSL gl_FrontFacing fragment attribute (bool) - * - two-sided stencil test - */ - setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW); -#endif + setup.oneOverArea = 1.0f / area; + + /* The product of area * sign indicates front/back orientation (0/1) */ + setup.facing = (area * sign > 0.0f) + ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW); + + setup.vprovoke = v2; return TRUE; } @@ -538,9 +538,9 @@ setup_sort_vertices(const struct vertex_header *v0, static INLINE void const_coeff4(uint slot) { - setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0}; - setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0}; - setup.coef[slot].a0.v = setup.vprovoke->data[slot]; + setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].a0 = setup.vprovoke->data[slot]; } @@ -564,13 +564,13 @@ tri_linear_coeff4(uint slot) vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), spu_mul(majda, spu_splats(setup.ebot.dx))); - setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea)); - setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea)); + setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea)); + setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea)); - vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx); - vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy); + vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx); + vector float tempy = spu_mul(setup.coef[slot].dady, yyyy); - setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy)); + setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy)); } @@ -608,13 +608,13 @@ tri_persp_coeff4(uint slot) vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), spu_mul(majda, spu_splats(setup.ebot.dx))); - setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea)); - setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea)); + setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea)); + setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea)); - vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx); - vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy); + vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx); + vector float tempy = spu_mul(setup.coef[slot].dady, yyyy); - setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy)); + setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy)); } @@ -750,27 +750,13 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines) } -static float -determinant(const float *v0, const float *v1, const float *v2) -{ - /* edge vectors e = v0 - v2, f = v1 - v2 */ - const float ex = v0[0] - v2[0]; - const float ey = v0[1] - v2[1]; - const float fx = v1[0] - v2[0]; - const float fy = v1[1] - v2[1]; - - /* det = cross(e,f).z */ - return ex * fy - ey * fx; -} - - /** * Draw triangle into tile at (tx, ty) (tile coords) * The tile data should have already been fetched. */ boolean tri_draw(const float *v0, const float *v1, const float *v2, - uint tx, uint ty, uint front_winding) + uint tx, uint ty) { setup.tx = tx; setup.ty = ty; @@ -781,12 +767,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, setup.cliprect_maxx = (tx + 1) * TILE_SIZE; setup.cliprect_maxy = (ty + 1) * TILE_SIZE; - /* Before we sort vertices, determine the facing of the triangle, - * which will be needed for front/back-face stencil application - */ - float det = determinant(v0, v1, v2); - setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW); - if (!setup_sort_vertices((struct vertex_header *) v0, (struct vertex_header *) v1, (struct vertex_header *) v2)) { diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h index abc3d35160..aa694dd7c9 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.h +++ b/src/gallium/drivers/cell/spu/spu_tri.h @@ -31,7 +31,7 @@ extern boolean -tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding); +tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty); #endif /* SPU_TRI_H */ |