diff options
author | Ben Skeggs <skeggsb@gmail.com> | 2008-02-09 16:15:14 +1100 |
---|---|---|
committer | Ben Skeggs <skeggsb@gmail.com> | 2008-02-09 16:15:14 +1100 |
commit | fb2760c5a64ada7b0f0a9635e7865b415a8aa286 (patch) | |
tree | c70e82ce993cfbbb789860516d667a6a44528668 /src/mesa/pipe/cell | |
parent | ae905056d4feb5a39d956a38ab377f4d78bf4065 (diff) | |
parent | 479b5e9b5d9e0e387332c6fbeaffffa7612a0c52 (diff) |
Merge branch 'upstream-gallium-0.1' into darktama-gallium-0.1
Diffstat (limited to 'src/mesa/pipe/cell')
38 files changed, 5172 insertions, 955 deletions
diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h index 0b63ed39be..4de514c358 100644 --- a/src/mesa/pipe/cell/common.h +++ b/src/mesa/pipe/cell/common.h @@ -51,16 +51,21 @@ /** for sanity checking */ #define ASSERT_ALIGN16(ptr) \ - assert((((unsigned long) (ptr)) & 0xf) == 0); + ASSERT((((unsigned long) (ptr)) & 0xf) == 0); /** round up value to next multiple of 4 */ #define ROUNDUP4(k) (((k) + 0x3) & ~0x3) +/** round up value to next multiple of 8 */ +#define ROUNDUP8(k) (((k) + 0x7) & ~0x7) + /** round up value to next multiple of 16 */ #define ROUNDUP16(k) (((k) + 0xf) & ~0xf) +#define CELL_MAX_SPUS 6 + #define TILE_SIZE 32 @@ -68,21 +73,27 @@ * The low byte of a mailbox word contains the command opcode. * Remaining higher bytes are command specific. */ -#define CELL_CMD_OPCODE_MASK 0xf +#define CELL_CMD_OPCODE_MASK 0xff #define CELL_CMD_EXIT 1 #define CELL_CMD_CLEAR_SURFACE 2 #define CELL_CMD_FINISH 3 #define CELL_CMD_RENDER 4 #define CELL_CMD_BATCH 5 +#define CELL_CMD_RELEASE_VERTS 6 #define CELL_CMD_STATE_FRAMEBUFFER 10 #define CELL_CMD_STATE_DEPTH_STENCIL 11 #define CELL_CMD_STATE_SAMPLER 12 -#define CELL_CMD_STATE_VERTEX_INFO 13 +#define CELL_CMD_STATE_TEXTURE 13 +#define CELL_CMD_STATE_VERTEX_INFO 14 +#define CELL_CMD_STATE_VIEWPORT 15 +#define CELL_CMD_STATE_VS_ARRAY_INFO 16 +#define CELL_CMD_STATE_BLEND 17 +#define CELL_CMD_VS_EXECUTE 18 -#define CELL_NUM_BATCH_BUFFERS 3 -#define CELL_BATCH_BUFFER_SIZE 1024 /**< 16KB would be the max */ +#define CELL_NUM_BUFFERS 4 +#define CELL_BUFFER_SIZE (4*1024) /**< 16KB would be the max */ #define CELL_BUFFER_STATUS_FREE 10 #define CELL_BUFFER_STATUS_USED 20 @@ -94,11 +105,11 @@ */ struct cell_command_framebuffer { - uint opcode; + uint64_t opcode; /**< CELL_CMD_FRAMEBUFFER */ int width, height; void *color_start, *depth_start; enum pipe_format color_format, depth_format; -} ALIGN16_ATTRIB; +}; /** @@ -106,38 +117,90 @@ struct cell_command_framebuffer */ struct cell_command_clear_surface { - 
uint opcode; + uint64_t opcode; /**< CELL_CMD_CLEAR_SURFACE */ uint surface; /**< Temporary: 0=color, 1=Z */ uint value; +}; + + +/** + * Array info used by the vertex shader's vertex puller. + */ +struct cell_array_info +{ + uint64_t base; /**< Base address of the 0th element. */ + uint attr; /**< Attribute that this state is for. */ + uint pitch; /**< Byte pitch from one entry to the next. */ + uint format; /**< Pipe format of each entry. */ +} ALIGN16_ATTRIB; + + +struct cell_shader_info +{ + unsigned num_outputs; + + uint64_t declarations; + unsigned num_declarations; + uint64_t instructions; + unsigned num_instructions; + uint64_t uniforms; + uint64_t immediates; + unsigned num_immediates; } ALIGN16_ATTRIB; -#define CELL_MAX_VBUF_SIZE (16 * 1024) -#define CELL_MAX_VBUF_INDEXES 1024 +#define SPU_VERTS_PER_BATCH 64 +struct cell_command_vs +{ + uint64_t opcode; /**< CELL_CMD_VS_EXECUTE */ + struct cell_shader_info shader; + unsigned num_elts; + unsigned elts[SPU_VERTS_PER_BATCH]; + uint64_t vOut[SPU_VERTS_PER_BATCH]; + float plane[12][4]; + unsigned nr_planes; + unsigned nr_attrs; +} ALIGN16_ATTRIB; struct cell_command_render { - uint opcode; /**< CELL_CMD_RENDER */ + uint64_t opcode; /**< CELL_CMD_RENDER */ uint prim_type; /**< PIPE_PRIM_x */ uint num_verts; uint vertex_size; /**< bytes per vertex */ - uint dummy; /* XXX this dummy field works around a compiler bug */ uint num_indexes; - const void *vertex_data; - const ushort *index_data; - float xmin, ymin, xmax, ymax; - boolean inline_indexes; + uint vertex_buf; /**< which cell->buffer[] contains the vertex data */ + float xmin, ymin, xmax, ymax; /* XXX another dummy field */ + uint min_index; boolean inline_verts; -} ALIGN16_ATTRIB; +}; + + +struct cell_command_release_verts +{ + uint64_t opcode; /**< CELL_CMD_RELEASE_VERTS */ + uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ +}; + + +struct cell_command_texture +{ + void *start; /**< Address in main memory */ + uint width, height; +}; /** XXX unions 
don't seem to work */ +/* XXX this should go away; all commands should be placed in batch buffers */ struct cell_command { +#if 0 struct cell_command_framebuffer fb; struct cell_command_clear_surface clear; struct cell_command_render render; +#endif + struct cell_command_vs vs; } ALIGN16_ATTRIB; @@ -147,7 +210,9 @@ struct cell_init_info unsigned id; unsigned num_spus; struct cell_command *cmd; - ubyte *batch_buffers[CELL_NUM_BATCH_BUFFERS]; + + /** Buffers for command batches, vertex/index data */ + ubyte *buffers[CELL_NUM_BUFFERS]; uint *buffer_status; /**< points at cell_context->buffer_status */ } ALIGN16_ATTRIB; diff --git a/src/mesa/pipe/cell/ppu/Makefile b/src/mesa/pipe/cell/ppu/Makefile index e7f2562da7..50060f5cd3 100644 --- a/src/mesa/pipe/cell/ppu/Makefile +++ b/src/mesa/pipe/cell/ppu/Makefile @@ -34,6 +34,7 @@ SOURCES = \ cell_surface.c \ cell_texture.c \ cell_vbuf.c \ + cell_vertex_shader.c \ cell_winsys.c diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c index c894ef8608..f45e5f25b6 100644 --- a/src/mesa/pipe/cell/ppu/cell_batch.c +++ b/src/mesa/pipe/cell/ppu/cell_batch.c @@ -31,12 +31,55 @@ #include "cell_spu.h" + +uint +cell_get_empty_buffer(struct cell_context *cell) +{ + uint buf = 0, tries = 0; + + /* Find a buffer that's marked as free by all SPUs */ + while (1) { + uint spu, num_free = 0; + + for (spu = 0; spu < cell->num_spus; spu++) { + if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) { + num_free++; + + if (num_free == cell->num_spus) { + /* found a free buffer, now mark status as used */ + for (spu = 0; spu < cell->num_spus; spu++) { + cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED; + } + /* + printf("PPU: ALLOC BUFFER %u\n", buf); + */ + return buf; + } + } + else { + break; + } + } + + /* try next buf */ + buf = (buf + 1) % CELL_NUM_BUFFERS; + + tries++; + if (tries == 100) { + /* + printf("PPU WAITING for buffer...\n"); + */ + } + } +} + + void cell_batch_flush(struct 
cell_context *cell) { static boolean flushing = FALSE; uint batch = cell->cur_batch; - const uint size = cell->batch_buffer_size[batch]; + const uint size = cell->buffer_size[batch]; uint spu, cmd_word; assert(!flushing); @@ -46,7 +89,7 @@ cell_batch_flush(struct cell_context *cell) flushing = TRUE; - assert(batch < CELL_NUM_BATCH_BUFFERS); + assert(batch < CELL_NUM_BUFFERS); /* printf("cell_batch_dispatch: buf %u at %p, size %u\n", @@ -68,28 +111,9 @@ cell_batch_flush(struct cell_context *cell) * array indicating that the PPU can re-use the buffer. */ + batch = cell_get_empty_buffer(cell); - /* Find a buffer that's marked as free by all SPUs */ - while (1) { - uint num_free = 0; - - batch = (batch + 1) % CELL_NUM_BATCH_BUFFERS; - - for (spu = 0; spu < cell->num_spus; spu++) { - if (cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_FREE) - num_free++; - } - - if (num_free == cell->num_spus) { - /* found a free buffer, now mark status as used */ - for (spu = 0; spu < cell->num_spus; spu++) { - cell->buffer_status[spu][batch][0] = CELL_BUFFER_STATUS_USED; - } - break; - } - } - - cell->batch_buffer_size[batch] = 0; /* empty */ + cell->buffer_size[batch] = 0; /* empty */ cell->cur_batch = batch; flushing = FALSE; @@ -99,61 +123,95 @@ cell_batch_flush(struct cell_context *cell) uint cell_batch_free_space(const struct cell_context *cell) { - uint free = CELL_BATCH_BUFFER_SIZE - - cell->batch_buffer_size[cell->cur_batch]; + uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch]; return free; } /** - * \param cmd command to append - * \param length command size in bytes + * Append data to current batch. 
*/ void -cell_batch_append(struct cell_context *cell, const void *cmd, uint length) +cell_batch_append(struct cell_context *cell, const void *data, uint bytes) { uint size; - assert(length % 4 == 0); - assert(cell->cur_batch >= 0); + ASSERT(bytes % 8 == 0); + ASSERT(bytes <= CELL_BUFFER_SIZE); + ASSERT(cell->cur_batch >= 0); + +#ifdef ASSERT + { + uint spu; + for (spu = 0; spu < cell->num_spus; spu++) { + ASSERT(cell->buffer_status[spu][cell->cur_batch][0] + == CELL_BUFFER_STATUS_USED); + } + } +#endif - size = cell->batch_buffer_size[cell->cur_batch]; + size = cell->buffer_size[cell->cur_batch]; - if (size + length > CELL_BATCH_BUFFER_SIZE) { + if (size + bytes > CELL_BUFFER_SIZE) { cell_batch_flush(cell); size = 0; } - assert(size + length <= CELL_BATCH_BUFFER_SIZE); + ASSERT(size + bytes <= CELL_BUFFER_SIZE); - memcpy(cell->batch_buffer[cell->cur_batch] + size, cmd, length); + memcpy(cell->buffer[cell->cur_batch] + size, data, bytes); - cell->batch_buffer_size[cell->cur_batch] = size + length; + cell->buffer_size[cell->cur_batch] = size + bytes; } void * cell_batch_alloc(struct cell_context *cell, uint bytes) { + return cell_batch_alloc_aligned(cell, bytes, 1); +} + + +void * +cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, + uint alignment) +{ void *pos; - uint size; + uint size, padbytes; - ASSERT(bytes % 4 == 0); + ASSERT(bytes % 8 == 0); + ASSERT(bytes <= CELL_BUFFER_SIZE); + ASSERT(alignment > 0); + ASSERT(cell->cur_batch >= 0); - assert(cell->cur_batch >= 0); +#ifdef ASSERT + { + uint spu; + for (spu = 0; spu < cell->num_spus; spu++) { + ASSERT(cell->buffer_status[spu][cell->cur_batch][0] + == CELL_BUFFER_STATUS_USED); + } + } +#endif - size = cell->batch_buffer_size[cell->cur_batch]; + size = cell->buffer_size[cell->cur_batch]; - if (size + bytes > CELL_BATCH_BUFFER_SIZE) { + padbytes = (alignment - (size % alignment)) % alignment; + + if (padbytes + size + bytes > CELL_BUFFER_SIZE) { cell_batch_flush(cell); size = 0; } + else { + size += 
padbytes; + } - assert(size + bytes <= CELL_BATCH_BUFFER_SIZE); + ASSERT(size % alignment == 0); + ASSERT(size + bytes <= CELL_BUFFER_SIZE); - pos = (void *) (cell->batch_buffer[cell->cur_batch] + size); + pos = (void *) (cell->buffer[cell->cur_batch] + size); - cell->batch_buffer_size[cell->cur_batch] = size + bytes; + cell->buffer_size[cell->cur_batch] = size + bytes; return pos; } diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h index c4ba7feb3d..a6eee0a8b1 100644 --- a/src/mesa/pipe/cell/ppu/cell_batch.h +++ b/src/mesa/pipe/cell/ppu/cell_batch.h @@ -35,6 +35,9 @@ struct cell_context; +extern uint +cell_get_empty_buffer(struct cell_context *cell); + extern void cell_batch_flush(struct cell_context *cell); @@ -42,10 +45,14 @@ extern uint cell_batch_free_space(const struct cell_context *cell); extern void -cell_batch_append(struct cell_context *cell, const void *cmd, uint length); +cell_batch_append(struct cell_context *cell, const void *data, uint bytes); extern void * cell_batch_alloc(struct cell_context *cell, uint bytes); +extern void * +cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, + uint alignment); + #endif /* CELL_BATCH_H */ diff --git a/src/mesa/pipe/cell/ppu/cell_clear.c b/src/mesa/pipe/cell/ppu/cell_clear.c index e01640b994..07b908eec5 100644 --- a/src/mesa/pipe/cell/ppu/cell_clear.c +++ b/src/mesa/pipe/cell/ppu/cell_clear.c @@ -48,9 +48,12 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, unsigned clearValue) { struct cell_context *cell = cell_context(pipe); - /*uint i;*/ uint surfIndex; + if (cell->dirty) + cell_update_derived(cell); + + if (!cell->cbuf_map[0]) cell->cbuf_map[0] = pipe_surface_map(ps); @@ -61,29 +64,7 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, surfIndex = 0; } -#if 0 - for (i = 0; i < cell->num_spus; i++) { -#if 1 - uint clr = clearValue; - if (surfIndex == 0) { - /* XXX debug: clear color varied per-SPU to visualize tiles */ 
- if ((clr & 0xff) == 0) - clr |= 64 + i * 8; - if ((clr & 0xff00) == 0) - clr |= (64 + i * 8) << 8; - if ((clr & 0xff0000) == 0) - clr |= (64 + i * 8) << 16; - if ((clr & 0xff000000) == 0) - clr |= (64 + i * 8) << 24; - } - cell_global.command[i].clear.value = clr; -#else - cell_global.command[i].clear.value = clearValue; -#endif - cell_global.command[i].clear.surface = surfIndex; - send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_CLEAR_SURFACE); - } -#else + { struct cell_command_clear_surface *clr = (struct cell_command_clear_surface *) @@ -92,9 +73,4 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, clr->surface = surfIndex; clr->value = clearValue; } -#endif - - /* XXX temporary */ - cell_flush(&cell->pipe, 0x0); - } diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c index 8cb0c48f40..bbe1fd7a11 100644 --- a/src/mesa/pipe/cell/ppu/cell_context.c +++ b/src/mesa/pipe/cell/ppu/cell_context.c @@ -39,6 +39,7 @@ #include "pipe/p_winsys.h" #include "pipe/cell/common.h" #include "pipe/draw/draw_context.h" +#include "pipe/draw/draw_private.h" #include "cell_clear.h" #include "cell_context.h" #include "cell_draw_arrays.h" @@ -156,6 +157,19 @@ cell_destroy_context( struct pipe_context *pipe ) } +static struct draw_context * +cell_draw_create(struct cell_context *cell) +{ + struct draw_context *draw = draw_create(); + + if (getenv("GALLIUM_CELL_VS")) { + /* plug in SPU-based vertex transformation code */ + draw->shader_queue_flush = cell_vertex_shader_queue_flush; + draw->driver_private = cell; + } + + return draw; +} struct pipe_context * @@ -242,7 +256,7 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws) cell_init_surface_functions(cell); - cell->draw = draw_create(); + cell->draw = cell_draw_create(cell); cell_init_vbuf(cell); draw_set_rasterize_stage(cell->draw, cell->vbuf); @@ -254,8 +268,9 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws) 
cell_start_spus(cell); - for (buf = 0; buf < CELL_NUM_BATCH_BUFFERS; buf++) { - cell->batch_buffer_size[buf] = 0; + /* init command, vertex/index buffer info */ + for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) { + cell->buffer_size[buf] = 0; /* init batch buffer status values, * mark 0th buffer as used, rest as free. diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h index 3bd88bfd5b..3b63419b5e 100644 --- a/src/mesa/pipe/cell/ppu/cell_context.h +++ b/src/mesa/pipe/cell/ppu/cell_context.h @@ -38,9 +38,6 @@ #include "pipe/cell/common.h" -#define CELL_MAX_SPUS 6 - - struct cell_vbuf_render; struct cell_vertex_shader_state @@ -76,7 +73,7 @@ struct cell_context struct pipe_framebuffer_state framebuffer; struct pipe_poly_stipple poly_stipple; struct pipe_scissor_state scissor; - struct pipe_texture *texture[PIPE_MAX_SAMPLERS]; + struct cell_texture *texture[PIPE_MAX_SAMPLERS]; struct pipe_viewport_state viewport; struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX]; struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX]; @@ -84,6 +81,9 @@ struct cell_context ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS]; ubyte *zsbuf_map; + struct pipe_surface *tex_surf; + uint *tex_map; + uint dirty; /** The primitive drawing context */ @@ -102,12 +102,14 @@ struct cell_context uint num_spus; - uint batch_buffer_size[CELL_NUM_BATCH_BUFFERS]; - ubyte batch_buffer[CELL_NUM_BATCH_BUFFERS][CELL_BATCH_BUFFER_SIZE] ALIGN16_ATTRIB; - int cur_batch; /**< which batch buffer is being filled */ + /** Buffers for command batches, vertex/index data */ + uint buffer_size[CELL_NUM_BUFFERS]; + ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB; + + int cur_batch; /**< which buffer is being filled w/ commands */ /** [4] to ensure 16-byte alignment for each status word */ - uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BATCH_BUFFERS][4] ALIGN16_ATTRIB; + uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB; }; @@ -124,6 +126,8 @@ 
cell_context(struct pipe_context *pipe) extern struct pipe_context * cell_create_context(struct pipe_winsys *ws, struct cell_winsys *cws); +extern void +cell_vertex_shader_queue_flush(struct draw_context *draw); diff --git a/src/mesa/pipe/cell/ppu/cell_flush.c b/src/mesa/pipe/cell/ppu/cell_flush.c index b98bb566b1..f62bc4650c 100644 --- a/src/mesa/pipe/cell/ppu/cell_flush.c +++ b/src/mesa/pipe/cell/ppu/cell_flush.c @@ -39,6 +39,9 @@ cell_flush(struct pipe_context *pipe, unsigned flags) { struct cell_context *cell = cell_context(pipe); + if (flags & PIPE_FLUSH_SWAPBUFFERS) + flags |= PIPE_FLUSH_WAIT; + draw_flush( cell->draw ); cell_flush_int(pipe, flags); } @@ -56,7 +59,7 @@ cell_flush_int(struct pipe_context *pipe, unsigned flags) flushing = TRUE; if (flags & PIPE_FLUSH_WAIT) { - uint *cmd = (uint *) cell_batch_alloc(cell, sizeof(uint)); + uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t)); *cmd = CELL_CMD_FINISH; } diff --git a/src/mesa/pipe/cell/ppu/cell_spu.c b/src/mesa/pipe/cell/ppu/cell_spu.c index 4627bc8d1f..7c83a47e57 100644 --- a/src/mesa/pipe/cell/ppu/cell_spu.c +++ b/src/mesa/pipe/cell/ppu/cell_spu.c @@ -111,8 +111,8 @@ cell_start_spus(struct cell_context *cell) cell_global.inits[i].id = i; cell_global.inits[i].num_spus = cell->num_spus; cell_global.inits[i].cmd = &cell_global.command[i]; - for (j = 0; j < CELL_NUM_BATCH_BUFFERS; j++) { - cell_global.inits[i].batch_buffers[j] = cell->batch_buffer[j]; + for (j = 0; j < CELL_NUM_BUFFERS; j++) { + cell_global.inits[i].buffers[j] = cell->buffer[j]; } cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0]; diff --git a/src/mesa/pipe/cell/ppu/cell_state_blend.c b/src/mesa/pipe/cell/ppu/cell_state_blend.c index 34ae0128ea..4fc60548c8 100644 --- a/src/mesa/pipe/cell/ppu/cell_state_blend.c +++ b/src/mesa/pipe/cell/ppu/cell_state_blend.c @@ -29,6 +29,7 @@ */ #include "pipe/p_util.h" +#include "pipe/draw/draw_context.h" #include "cell_context.h" #include "cell_state.h" @@ -38,9 +39,7 
@@ void * cell_create_blend_state(struct pipe_context *pipe, const struct pipe_blend_state *blend) { - struct pipe_blend_state *state = MALLOC(sizeof(struct pipe_blend_state)); - memcpy(state, blend, sizeof(struct pipe_blend_state)); - return state; + return mem_dup(blend, sizeof(*blend)); } @@ -49,6 +48,8 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend) { struct cell_context *cell = cell_context(pipe); + draw_flush(cell->draw); + cell->blend = (const struct pipe_blend_state *)blend; cell->dirty |= CELL_NEW_BLEND; @@ -68,6 +69,8 @@ cell_set_blend_color(struct pipe_context *pipe, { struct cell_context *cell = cell_context(pipe); + draw_flush(cell->draw); + cell->blend_color = *blend_color; cell->dirty |= CELL_NEW_BLEND; @@ -80,10 +83,7 @@ void * cell_create_depth_stencil_alpha_state(struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *depth_stencil) { - struct pipe_depth_stencil_alpha_state *state = - MALLOC(sizeof(struct pipe_depth_stencil_alpha_state)); - memcpy(state, depth_stencil, sizeof(struct pipe_depth_stencil_alpha_state)); - return state; + return mem_dup(depth_stencil, sizeof(*depth_stencil)); } @@ -93,6 +93,8 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe, { struct cell_context *cell = cell_context(pipe); + draw_flush(cell->draw); + cell->depth_stencil = (const struct pipe_depth_stencil_alpha_state *) depth_stencil; diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c index dbca900c35..5d2a786449 100644 --- a/src/mesa/pipe/cell/ppu/cell_state_emit.c +++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c @@ -30,6 +30,18 @@ #include "cell_state.h" #include "cell_state_emit.h" #include "cell_batch.h" +#include "cell_texture.h" + + +static void +emit_state_cmd(struct cell_context *cell, uint cmd, + const void *state, uint state_size) +{ + uint64_t *dst = (uint64_t *) + cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size)); + *dst = cmd; + memcpy(dst + 1, state, 
state_size); +} @@ -50,23 +62,42 @@ cell_emit_state(struct cell_context *cell) fb->height = cell->framebuffer.cbufs[0]->height; } + if (cell->dirty & CELL_NEW_BLEND) { + emit_state_cmd(cell, CELL_CMD_STATE_BLEND, + cell->blend, + sizeof(struct pipe_blend_state)); + } + if (cell->dirty & CELL_NEW_DEPTH_STENCIL) { - uint cmd = CELL_CMD_STATE_DEPTH_STENCIL; - cell_batch_append(cell, &cmd, 4); - cell_batch_append(cell, cell->depth_stencil, - sizeof(struct pipe_depth_stencil_alpha_state)); + emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, + cell->depth_stencil, + sizeof(struct pipe_depth_stencil_alpha_state)); } if (cell->dirty & CELL_NEW_SAMPLER) { - uint cmd = CELL_CMD_STATE_SAMPLER; - cell_batch_append(cell, &cmd, 4); - cell_batch_append(cell, cell->sampler[0], - sizeof(struct pipe_sampler_state)); + emit_state_cmd(cell, CELL_CMD_STATE_SAMPLER, + cell->sampler[0], sizeof(struct pipe_sampler_state)); + } + + if (cell->dirty & CELL_NEW_TEXTURE) { + struct cell_command_texture texture; + if (cell->texture[0]) { + texture.start = cell->texture[0]->tiled_data; + texture.width = cell->texture[0]->base.width[0]; + texture.height = cell->texture[0]->base.height[0]; + } + else { + texture.start = NULL; + texture.width = 0; + texture.height = 0; + } + + emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE, + &texture, sizeof(struct cell_command_texture)); } if (cell->dirty & CELL_NEW_VERTEX_INFO) { - uint cmd = CELL_CMD_STATE_VERTEX_INFO; - cell_batch_append(cell, &cmd, 4); - cell_batch_append(cell, &cell->vertex_info, sizeof(struct vertex_info)); + emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO, + &cell->vertex_info, sizeof(struct vertex_info)); } } diff --git a/src/mesa/pipe/cell/ppu/cell_state_fs.c b/src/mesa/pipe/cell/ppu/cell_state_fs.c index 81c2ac14dd..96a52273b0 100644 --- a/src/mesa/pipe/cell/ppu/cell_state_fs.c +++ b/src/mesa/pipe/cell/ppu/cell_state_fs.c @@ -45,7 +45,7 @@ void * cell_create_fs_state(struct pipe_context *pipe, const struct pipe_shader_state *templ) { - 
struct cell_context *cell = cell_context(pipe); + /*struct cell_context *cell = cell_context(pipe);*/ struct cell_fragment_shader_state *state; state = CALLOC_STRUCT(cell_fragment_shader_state); @@ -94,8 +94,6 @@ cell_bind_fs_state(struct pipe_context *pipe, void *fs) void cell_delete_fs_state(struct pipe_context *pipe, void *fs) { - struct cell_context *cell = cell_context(pipe); - struct cell_fragment_shader_state *state = (struct cell_fragment_shader_state *) fs; diff --git a/src/mesa/pipe/cell/ppu/cell_state_sampler.c b/src/mesa/pipe/cell/ppu/cell_state_sampler.c index ae1eeb4620..ade6cc8338 100644 --- a/src/mesa/pipe/cell/ppu/cell_state_sampler.c +++ b/src/mesa/pipe/cell/ppu/cell_state_sampler.c @@ -30,21 +30,17 @@ */ #include "pipe/p_util.h" +#include "pipe/draw/draw_context.h" #include "cell_context.h" #include "cell_state.h" -#if 0 #include "cell_texture.h" -#include "cell_tile_cache.h" -#endif void * cell_create_sampler_state(struct pipe_context *pipe, const struct pipe_sampler_state *sampler) { - struct pipe_sampler_state *state = MALLOC( sizeof(struct pipe_sampler_state) ); - memcpy(state, sampler, sizeof(struct pipe_sampler_state)); - return state; + return mem_dup(sampler, sizeof(*sampler)); } void @@ -53,6 +49,8 @@ cell_bind_sampler_state(struct pipe_context *pipe, { struct cell_context *cell = cell_context(pipe); + draw_flush(cell->draw); + assert(unit < PIPE_MAX_SAMPLERS); cell->sampler[unit] = (struct pipe_sampler_state *)sampler; @@ -76,7 +74,11 @@ cell_set_sampler_texture(struct pipe_context *pipe, { struct cell_context *cell = cell_context(pipe); + draw_flush(cell->draw); + cell->texture[sampler] = texture; + cell_update_texture_mapping(cell); + cell->dirty |= CELL_NEW_TEXTURE; } diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c index 0a8190d983..df178d9ca2 100644 --- a/src/mesa/pipe/cell/ppu/cell_texture.c +++ b/src/mesa/pipe/cell/ppu/cell_texture.c @@ -79,31 +79,30 @@ cell_texture_layout(struct 
cell_texture * spt) } -void -cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt) +struct pipe_texture * +cell_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat) { - struct cell_texture *spt = REALLOC(*pt, sizeof(struct pipe_texture), - sizeof(struct cell_texture)); + struct cell_texture *spt = CALLOC_STRUCT(cell_texture); + if (!spt) + return NULL; - if (spt) { - memset(&spt->base + 1, 0, - sizeof(struct cell_texture) - sizeof(struct pipe_texture)); + spt->base = *templat; - cell_texture_layout(spt); + cell_texture_layout(spt); - spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32, - PIPE_BUFFER_USAGE_PIXEL, - spt->buffer_size); + spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32, + PIPE_BUFFER_USAGE_PIXEL, + spt->buffer_size); - if (!spt->buffer) { - FREE(spt); - spt = NULL; - } + if (!spt->buffer) { + FREE(spt); + return NULL; } - *pt = &spt->base; + return &spt->base; } + void cell_texture_release(struct pipe_context *pipe, struct pipe_texture **pt) { @@ -163,3 +162,91 @@ cell_get_tex_surface(struct pipe_context *pipe, } return ps; } + + + +static void +tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src) +{ + const uint tile_size2 = tile_size * tile_size; + const uint h_t = h / tile_size, w_t = w / tile_size; + + uint it, jt; /* tile counters */ + uint i, j; /* intra-tile counters */ + + for (it = 0; it < h_t; it++) { + for (jt = 0; jt < w_t; jt++) { + /* fill in tile (i, j) */ + uint *tdst = dst + (it * w_t + jt) * tile_size2; + for (i = 0; i < tile_size; i++) { + for (j = 0; j < tile_size; j++) { + const uint srci = it * tile_size + i; + const uint srcj = jt * tile_size + j; + *tdst++ = src[srci * h + srcj]; + } + } + } + } +} + + + +/** + * Convert linear texture image data to tiled format for SPU usage. 
+ */ +static void +cell_tile_texture(struct cell_context *cell, + struct cell_texture *texture) +{ + uint face = 0, level = 0, zslice = 0; + struct pipe_surface *surf; + const uint w = texture->base.width[0], h = texture->base.height[0]; + const uint *src; + + /* temporary restrictions: */ + assert(w >= TILE_SIZE); + assert(h >= TILE_SIZE); + assert(w % TILE_SIZE == 0); + assert(h % TILE_SIZE == 0); + + surf = cell_get_tex_surface(&cell->pipe, &texture->base, face, level, zslice); + ASSERT(surf); + + src = (const uint *) pipe_surface_map(surf); + + if (texture->tiled_data) { + align_free(texture->tiled_data); + } + texture->tiled_data = align_malloc(w * h * 4, 16); + + tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src); + + pipe_surface_unmap(surf); + + pipe_surface_reference(&surf, NULL); +} + + + +void +cell_update_texture_mapping(struct cell_context *cell) +{ + uint face = 0, level = 0, zslice = 0; + + if (cell->texture[0]) + cell_tile_texture(cell, cell->texture[0]); +#if 0 + if (cell->tex_surf && cell->tex_map) { + pipe_surface_unmap(cell->tex_surf); + cell->tex_map = NULL; + } + + /* XXX free old surface */ + + cell->tex_surf = cell_get_tex_surface(&cell->pipe, + &cell->texture[0]->base, + face, level, zslice); + + cell->tex_map = pipe_surface_map(cell->tex_surf); +#endif +} diff --git a/src/mesa/pipe/cell/ppu/cell_texture.h b/src/mesa/pipe/cell/ppu/cell_texture.h index ef5808c086..0264fed88e 100644 --- a/src/mesa/pipe/cell/ppu/cell_texture.h +++ b/src/mesa/pipe/cell/ppu/cell_texture.h @@ -46,6 +46,8 @@ struct cell_texture */ struct pipe_buffer *buffer; unsigned long buffer_size; + + void *tiled_data; /* XXX this may be temporary */ /*ALIGN16*/ }; @@ -58,8 +60,9 @@ cell_texture(struct pipe_texture *pt) -extern void -cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt); +extern struct pipe_texture * +cell_texture_create(struct pipe_context *pipe, + const struct pipe_texture *templat); extern void cell_texture_release(struct 
pipe_context *pipe, struct pipe_texture **pt); @@ -70,4 +73,8 @@ cell_get_tex_surface(struct pipe_context *pipe, unsigned face, unsigned level, unsigned zslice); +extern void +cell_update_texture_mapping(struct cell_context *cell); + + #endif /* CELL_TEXTURE */ diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c index ee572b3a51..e9fafe492e 100644 --- a/src/mesa/pipe/cell/ppu/cell_vbuf.c +++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c @@ -39,8 +39,7 @@ #include "pipe/draw/draw_vbuf.h" -/** Allow prim indexes, verts to be inlined after RENDER command */ -#define ALLOW_INLINE_INDEXES 1 +/** Allow vertex data to be inlined after RENDER command */ #define ALLOW_INLINE_VERTS 1 @@ -52,9 +51,10 @@ struct cell_vbuf_render { struct vbuf_render base; struct cell_context *cell; - uint prim; - uint vertex_size; - void *vertex_buffer; + uint prim; /**< PIPE_PRIM_x */ + uint vertex_size; /**< in bytes */ + void *vertex_buffer; /**< just for debug, really */ + uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ }; @@ -81,14 +81,46 @@ cell_vbuf_allocate_vertices(struct vbuf_render *vbr, { struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/ - assert(!cvbr->vertex_buffer); - cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16); + + assert(cvbr->vertex_buf == ~0); + cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell); + cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf]; cvbr->vertex_size = vertex_size; return cvbr->vertex_buffer; } static void +cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, + unsigned vertex_size, unsigned vertices_used) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + struct cell_context *cell = cvbr->cell; + + /* + printf("%s vertex_buf = %u count = %u\n", + __FUNCTION__, cvbr->vertex_buf, vertices_used); + */ + + /* Tell SPUs they can release the vert buf */ + if (cvbr->vertex_buf != ~0U) { + struct 
cell_command_release_verts *release + = (struct cell_command_release_verts *) + cell_batch_alloc(cell, sizeof(struct cell_command_release_verts)); + release->opcode = CELL_CMD_RELEASE_VERTS; + release->vertex_buf = cvbr->vertex_buf; + } + + cvbr->vertex_buf = ~0; + cell_flush_int(&cell->pipe, 0x0); + + assert(vertices == cvbr->vertex_buffer); + cvbr->vertex_buffer = NULL; +} + + + +static void cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) { struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); @@ -106,17 +138,24 @@ cell_vbuf_draw(struct vbuf_render *vbr, struct cell_context *cell = cvbr->cell; float xmin, ymin, xmax, ymax; uint i; - uint nr_vertices = 0; + uint nr_vertices = 0, min_index = ~0; const void *vertices = cvbr->vertex_buffer; const uint vertex_size = cvbr->vertex_size; for (i = 0; i < nr_indices; i++) { if (indices[i] > nr_vertices) nr_vertices = indices[i]; + if (indices[i] < min_index) + min_index = indices[i]; } nr_vertices++; #if 0 + /*if (min_index > 0)*/ + printf("%s min_index = %u\n", __FUNCTION__, min_index); +#endif + +#if 0 printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n", nr_indices, nr_vertices); printf(" "); @@ -137,7 +176,7 @@ cell_vbuf_draw(struct vbuf_render *vbr, /* compute x/y bounding box */ xmin = ymin = 1e50; xmax = ymax = -1e50; - for (i = 0; i < nr_vertices; i++) { + for (i = min_index; i < nr_vertices; i++) { const float *v = (float *) ((ubyte *) vertices + i * vertex_size); if (v[0] < xmin) xmin = v[0]; @@ -148,83 +187,68 @@ cell_vbuf_draw(struct vbuf_render *vbr, if (v[1] > ymax) ymax = v[1]; } +#if 0 + printf("PPU Bounds %g, %g .. 
%g, %g\n", xmin, ymin, xmax, ymax); + fflush(stdout); +#endif if (cvbr->prim != PIPE_PRIM_TRIANGLES) return; /* only render tris for now */ /* build/insert batch RENDER command */ { - const uint index_bytes = ROUNDUP4(nr_indices * 2); + const uint index_bytes = ROUNDUP8(nr_indices * 2); const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size; + const uint batch_size = sizeof(struct cell_command_render) + index_bytes; struct cell_command_render *render = (struct cell_command_render *) - cell_batch_alloc(cell, sizeof(*render)); + cell_batch_alloc(cell, batch_size); + render->opcode = CELL_CMD_RENDER; render->prim_type = cvbr->prim; render->num_indexes = nr_indices; - if (ALLOW_INLINE_INDEXES && - index_bytes <= cell_batch_free_space(cell)) { - /* indices inlined, right after render cmd */ - void *dst = cell_batch_alloc(cell, index_bytes); - memcpy(dst, indices, nr_indices * 2); - render->inline_indexes = TRUE; - render->index_data = NULL; - } - else { - /* indices in separate buffer */ - render->inline_indexes = FALSE; - render->index_data = indices; - ASSERT_ALIGN16(render->index_data); - } + render->min_index = min_index; + + /* append indices after render command */ + memcpy(render + 1, indices, nr_indices * 2); + /* if there's room, append vertices after the indices, else leave + * vertices in the original/separate buffer. 
+ */ render->vertex_size = 4 * cell->vertex_info.size; render->num_verts = nr_vertices; if (ALLOW_INLINE_VERTS && - render->inline_indexes && - vertex_bytes <= cell_batch_free_space(cell)) { - /* vertex data inlined, after indices */ - void *dst = cell_batch_alloc(cell, vertex_bytes); + min_index == 0 && + vertex_bytes + 16 <= cell_batch_free_space(cell)) { + /* vertex data inlined, after indices, at 16-byte boundary */ + void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16); memcpy(dst, vertices, vertex_bytes); render->inline_verts = TRUE; - render->vertex_data = NULL; + render->vertex_buf = ~0; } else { + /* vertex data in separate buffer */ render->inline_verts = FALSE; - render->vertex_data = vertices; - ASSERT_ALIGN16(render->vertex_data); + ASSERT(cvbr->vertex_buf >= 0); + render->vertex_buf = cvbr->vertex_buf; } - render->xmin = xmin; render->ymin = ymin; render->xmax = xmax; render->ymax = ymax; } -#if 01 - /* XXX this is temporary */ +#if 0 + /* helpful for debug */ cell_flush_int(&cell->pipe, PIPE_FLUSH_WAIT); #endif } static void -cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, - unsigned vertex_size, unsigned vertices_used) -{ - struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); - - /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/ - align_free(vertices); - - assert(vertices == cvbr->vertex_buffer); - cvbr->vertex_buffer = NULL; -} - - -static void cell_vbuf_destroy(struct vbuf_render *vbr) { struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); @@ -244,8 +268,15 @@ cell_init_vbuf(struct cell_context *cell) cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render); - cell->vbuf_render->base.max_indices = CELL_MAX_VBUF_INDEXES; - cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_MAX_VBUF_SIZE; + /* The max number of indexes is what can fit into a batch buffer, + * minus the render and release-verts commands. 
+ */ + cell->vbuf_render->base.max_indices + = (CELL_BUFFER_SIZE + - sizeof(struct cell_command_render) + - sizeof(struct cell_command_release_verts)) + / sizeof(ushort); + cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE; cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info; cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices; @@ -255,6 +286,9 @@ cell_init_vbuf(struct cell_context *cell) cell->vbuf_render->base.destroy = cell_vbuf_destroy; cell->vbuf_render->cell = cell; +#if 1 + cell->vbuf_render->vertex_buf = ~0; +#endif cell->vbuf = draw_vbuf_stage(cell->draw, &cell->vbuf_render->base); } diff --git a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c new file mode 100644 index 0000000000..80dd500b34 --- /dev/null +++ b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c @@ -0,0 +1,120 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file cell_vertex_shader.c + * Vertex shader interface routines for Cell. + * + * \author Ian Romanick <idr@us.ibm.com> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/p_winsys.h" + +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_spu.h" +#include "cell_batch.h" + +#include "pipe/cell/common.h" +#include "pipe/draw/draw_context.h" +#include "pipe/draw/draw_private.h" + +/** + * Run the vertex shader on all vertices in the vertex queue. + * Called by the draw module when the vertex cache needs to be flushed. + */ +void +cell_vertex_shader_queue_flush(struct draw_context *draw) +{ + struct cell_context *const cell = + (struct cell_context *) draw->driver_private; + struct cell_command_vs *const vs = &cell_global.command[0].vs; + uint64_t *batch; + struct cell_array_info *array_info; + unsigned i, j; + + assert(draw->vs.queue_nr != 0); + + /* XXX: do this on statechange: + */ + draw_update_vertex_fetch(draw); + + for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { + batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info)); + + batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO; + + array_info = (struct cell_array_info *) &batch[1]; + assert(draw->vertex_fetch.src_ptr[i] != NULL); + array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i]; + array_info->attr = i; + array_info->pitch = draw->vertex_fetch.pitch[i]; + array_info->format = draw->vertex_element[i].src_format; + } + + batch = cell_batch_alloc(cell, sizeof(batch[0]) + + sizeof(struct pipe_viewport_state)); + batch[0] = CELL_CMD_STATE_VIEWPORT; + (void) memcpy(&batch[1], &draw->viewport, + sizeof(struct pipe_viewport_state)); + + 
cell_batch_flush(cell); + + vs->opcode = CELL_CMD_VS_EXECUTE; + vs->shader.num_outputs = draw->num_vs_outputs; + vs->shader.declarations = (uintptr_t) draw->machine.Declarations; + vs->shader.num_declarations = draw->machine.NumDeclarations; + vs->shader.instructions = (uintptr_t) draw->machine.Instructions; + vs->shader.num_instructions = draw->machine.NumInstructions; + vs->shader.uniforms = (uintptr_t) draw->user.constants; + vs->shader.immediates = (uintptr_t) draw->machine.Imms; + vs->shader.num_immediates = draw->machine.ImmLimit / 4; + vs->nr_attrs = draw->vertex_fetch.nr_attrs; + + (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane)); + vs->nr_planes = draw->nr_planes; + + for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) { + const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i); + + for (j = 0; j < n; j++) { + vs->elts[j] = draw->vs.queue[i + j].elt; + vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].dest; + } + + for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) { + vs->elts[j] = vs->elts[0]; + vs->vOut[j] = vs->vOut[0]; + } + + vs->num_elts = n; + send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE); + + cell_flush_int(& cell->pipe, PIPE_FLUSH_WAIT); + } + + draw->vs.queue_nr = 0; +} diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile index 417ae1b072..f202971d73 100644 --- a/src/mesa/pipe/cell/spu/Makefile +++ b/src/mesa/pipe/cell/spu/Makefile @@ -17,8 +17,15 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o SOURCES = \ spu_main.c \ + spu_blend.c \ + spu_render.c \ + spu_texture.c \ spu_tile.c \ - spu_tri.c + spu_tri.c \ + spu_exec.c \ + spu_util.c \ + spu_vertex_fetch.c \ + spu_vertex_shader.c SPU_OBJECTS = $(SOURCES:.c=.o) \ diff --git a/src/mesa/pipe/cell/spu/spu_blend.c b/src/mesa/pipe/cell/spu/spu_blend.c new file mode 100644 index 0000000000..23ec0eeb45 --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_blend.c @@ -0,0 +1,62 @@ 
+/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#include "spu_main.h" +#include "spu_blend.h" +#include "spu_colorpack.h" + + +void +blend_quad(uint itx, uint ity, vector float colors[4]) +{ + /* simple SRC_ALPHA, ONE_MINUS_SRC_ALPHA blending */ + vector float fbc00 = spu_unpack_color(spu.ctile.ui[ity][itx]); + vector float fbc01 = spu_unpack_color(spu.ctile.ui[ity][itx+1]); + vector float fbc10 = spu_unpack_color(spu.ctile.ui[ity+1][itx]); + vector float fbc11 = spu_unpack_color(spu.ctile.ui[ity+1][itx+1]); + + vector float alpha00 = spu_splats(spu_extract(colors[0], 3)); + vector float alpha01 = spu_splats(spu_extract(colors[1], 3)); + vector float alpha10 = spu_splats(spu_extract(colors[2], 3)); + vector float alpha11 = spu_splats(spu_extract(colors[3], 3)); + + vector float one_minus_alpha00 = spu_sub(spu_splats(1.0f), alpha00); + vector float one_minus_alpha01 = spu_sub(spu_splats(1.0f), alpha01); + vector float one_minus_alpha10 = spu_sub(spu_splats(1.0f), alpha10); + vector float one_minus_alpha11 = spu_sub(spu_splats(1.0f), alpha11); + + colors[0] = spu_add(spu_mul(colors[0], alpha00), + spu_mul(fbc00, one_minus_alpha00)); + colors[1] = spu_add(spu_mul(colors[1], alpha01), + spu_mul(fbc01, one_minus_alpha01)); + colors[2] = spu_add(spu_mul(colors[2], alpha10), + spu_mul(fbc10, one_minus_alpha10)); + colors[3] = spu_add(spu_mul(colors[3], alpha11), + spu_mul(fbc11, one_minus_alpha11)); +} + diff --git a/src/mesa/pipe/cell/spu/spu_blend.h b/src/mesa/pipe/cell/spu/spu_blend.h new file mode 100644 index 0000000000..2b594b578b --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_blend.h @@ -0,0 +1,37 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef SPU_BLEND_H +#define SPU_BLEND_H + + +extern void +blend_quad(uint itx, uint ity, vector float colors[4]); + + +#endif /* SPU_BLEND_H */ diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h new file mode 100644 index 0000000000..e9fee8a3a6 --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_colorpack.h @@ -0,0 +1,110 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + + +#ifndef SPU_COLORPACK_H +#define SPU_COLORPACK_H + + +#include <spu_intrinsics.h> + + +static INLINE unsigned int +spu_pack_R8G8B8A8(vector float rgba) +{ + vector unsigned int out = spu_convtu(rgba, 32); + + out = spu_shuffle(out, out, ((vector unsigned char) { + 0, 4, 8, 12, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }) ); + + return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_A8R8G8B8(vector float rgba) +{ + vector unsigned int out = spu_convtu(rgba, 32); + out = spu_shuffle(out, out, ((vector unsigned char) { + 12, 0, 4, 8, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}) ); + return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_B8G8R8A8(vector float rgba) +{ + vector unsigned int out = spu_convtu(rgba, 32); + out = spu_shuffle(out, out, ((vector unsigned char) { + 8, 4, 0, 12, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}) ); + return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle) +{ + vector unsigned int out = spu_convtu(rgba, 32); + out = spu_shuffle(out, out, shuffle); + return spu_extract(out, 0); +} + + +static INLINE vector float +spu_unpack_color(uint color) +{ + vector unsigned int color_u4 = spu_splats(color); + color_u4 = spu_shuffle(color_u4, color_u4, + ((vector unsigned char) { + 0, 0, 0, 0, + 5, 5, 5, 5, + 10, 10, 10, 10, + 15, 15, 15, 15}) ); + return spu_convtf(color_u4, 32); +} + + +static INLINE vector float +spu_unpack_A8R8G8B8(uint color) +{ + vector unsigned int color_u4 = spu_splats(color); + color_u4 = spu_shuffle(color_u4, color_u4, + ((vector unsigned char) { + 5, 5, 5, 5, + 10, 10, 10, 10, + 15, 15, 15, 15, + 0, 0, 0, 0}) ); + + return spu_convtf(color_u4, 32); +} + + +#endif /* SPU_COLORPACK_H */ diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c new file mode 100644 index 0000000000..e51008b9b3 --- /dev/null +++ 
b/src/mesa/pipe/cell/spu/spu_exec.c @@ -0,0 +1,1948 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * TGSI interpretor/executor. + * + * Flow control information: + * + * Since we operate on 'quads' (4 pixels or 4 vertices in parallel) + * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special + * care since a condition may be true for some quad components but false + * for other components. + * + * We basically execute all statements (even if they're in the part of + * an IF/ELSE clause that's "not taken") and use a special mask to + * control writing to destination registers. This is the ExecMask. + * See store_dest(). 
+ * + * The ExecMask is computed from three other masks (CondMask, LoopMask and + * ContMask) which are controlled by the flow control instructions (namely: + * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT). + * + * + * Authors: + * Michal Krol + * Brian Paul + */ + +#include <libmisc.h> +#include <spu_mfcio.h> +#include <transpose_matrix4x4.h> +#include <simdmath/ceilf4.h> +#include <simdmath/cosf4.h> +#include <simdmath/divf4.h> +#include <simdmath/floorf4.h> +#include <simdmath/log2f4.h> +#include <simdmath/powf4.h> +#include <simdmath/sinf4.h> +#include <simdmath/sqrtf4.h> +#include <simdmath/truncf4.h> + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/tgsi/util/tgsi_parse.h" +#include "pipe/tgsi/util/tgsi_util.h" +#include "spu_exec.h" +#include "spu_main.h" +#include "spu_vertex_shader.h" + +#define TILE_TOP_LEFT 0 +#define TILE_TOP_RIGHT 1 +#define TILE_BOTTOM_LEFT 2 +#define TILE_BOTTOM_RIGHT 3 + +/* + * Shorthand locations of various utility registers (_I = Index, _C = Channel) + */ +#define TEMP_0_I TGSI_EXEC_TEMP_00000000_I +#define TEMP_0_C TGSI_EXEC_TEMP_00000000_C +#define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I +#define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C +#define TEMP_80_I TGSI_EXEC_TEMP_80000000_I +#define TEMP_80_C TGSI_EXEC_TEMP_80000000_C +#define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I +#define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C +#define TEMP_1_I TGSI_EXEC_TEMP_ONE_I +#define TEMP_1_C TGSI_EXEC_TEMP_ONE_C +#define TEMP_2_I TGSI_EXEC_TEMP_TWO_I +#define TEMP_2_C TGSI_EXEC_TEMP_TWO_C +#define TEMP_128_I TGSI_EXEC_TEMP_128_I +#define TEMP_128_C TGSI_EXEC_TEMP_128_C +#define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I +#define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C +#define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I +#define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C +#define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I +#define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C +#define TEMP_PRIMITIVE_I 
TGSI_EXEC_TEMP_PRIMITIVE_I +#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C +#define TEMP_R0 TGSI_EXEC_TEMP_R0 + +#define FOR_EACH_CHANNEL(CHAN)\ + for (CHAN = 0; CHAN < 4; CHAN++) + +#define IS_CHANNEL_ENABLED(INST, CHAN)\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + +#define IS_CHANNEL_ENABLED2(INST, CHAN)\ + ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN))) + +#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\ + FOR_EACH_CHANNEL( CHAN )\ + if (IS_CHANNEL_ENABLED( INST, CHAN )) + +#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\ + FOR_EACH_CHANNEL( CHAN )\ + if (IS_CHANNEL_ENABLED2( INST, CHAN )) + + +/** The execution mask depends on the conditional mask and the loop mask */ +#define UPDATE_EXEC_MASK(MACH) \ + MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask + + +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 + + + +/** + * Initialize machine state by expanding tokens to full instructions, + * allocating temporary storage, setting up constants, etc. + * After this, we can call spu_exec_machine_run() many times. + */ +void +spu_exec_machine_init(struct spu_exec_machine *mach, + uint numSamplers, + struct spu_sampler *samplers, + unsigned processor) +{ + qword zero; + qword not_zero; + uint i; + + mach->Samplers = samplers; + mach->Processor = processor; + mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS]; + + zero = si_xor(zero, zero); + not_zero = si_xori(zero, 0xff); + + /* Setup constants. 
*/ + mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero; + mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero; + mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1); + mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31); + + mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f); + mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f); + mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f); + mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f); +} + + +static INLINE qword +micro_abs(qword src) +{ + return si_rotmi(si_shli(src, 1), -1); +} + +static INLINE qword +micro_ceil(qword src) +{ + return (qword) _ceilf4((vec_float4) src); +} + +static INLINE qword +micro_cos(qword src) +{ + return (qword) _cosf4((vec_float4) src); +} + +static const qword br_shuf = { + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, +}; + +static const qword bl_shuf = { + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, +}; + +static const qword tl_shuf = { + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT 
+ 3, +}; + +static qword +micro_ddx(qword src) +{ + qword bottom_right = si_shufb(src, src, br_shuf); + qword bottom_left = si_shufb(src, src, bl_shuf); + + return si_fs(bottom_right, bottom_left); +} + +static qword +micro_ddy(qword src) +{ + qword top_left = si_shufb(src, src, tl_shuf); + qword bottom_left = si_shufb(src, src, bl_shuf); + + return si_fs(top_left, bottom_left); +} + +static INLINE qword +micro_div(qword src0, qword src1) +{ + return (qword) _divf4((vec_float4) src0, (vec_float4) src1); +} + +static qword +micro_flr(qword src) +{ + return (qword) _floorf4((vec_float4) src); +} + +static qword +micro_frc(qword src) +{ + return si_fs(src, (qword) _floorf4((vec_float4) src)); +} + +static INLINE qword +micro_ge(qword src0, qword src1) +{ + return si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); +} + +static qword +micro_lg2(qword src) +{ + return (qword) _log2f4((vec_float4) src); +} + +static INLINE qword +micro_lt(qword src0, qword src1) +{ + const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); + + return si_xori(tmp, 0xff); +} + +static INLINE qword +micro_max(qword src0, qword src1) +{ + return si_selb(src1, src0, si_fcgt(src0, src1)); +} + +static INLINE qword +micro_min(qword src0, qword src1) +{ + return si_selb(src0, src1, si_fcgt(src0, src1)); +} + +static qword +micro_neg(qword src) +{ + return si_xor(src, (qword) spu_splats(0x80000000)); +} + +static qword +micro_set_sign(qword src) +{ + return si_or(src, (qword) spu_splats(0x80000000)); +} + +static qword +micro_pow(qword src0, qword src1) +{ + return (qword) _powf4((vec_float4) src0, (vec_float4) src1); +} + +static qword +micro_rnd(qword src) +{ + const qword half = (qword) spu_splats(0.5f); + + /* May be able to use _roundf4. There may be some difference, though. 
+ */ + return (qword) _floorf4((vec_float4) si_fa(src, half)); +} + +static INLINE qword +micro_ishr(qword src0, qword src1) +{ + return si_rotma(src0, si_sfi(src1, 0)); +} + +static qword +micro_trunc(qword src) +{ + return (qword) _truncf4((vec_float4) src); +} + +static qword +micro_sin(qword src) +{ + return (qword) _sinf4((vec_float4) src); +} + +static INLINE qword +micro_sqrt(qword src) +{ + return (qword) _sqrtf4((vec_float4) src); +} + +static void +fetch_src_file_channel( + const struct spu_exec_machine *mach, + const uint file, + const uint swizzle, + const union spu_exec_channel *index, + union spu_exec_channel *chan ) +{ + switch( swizzle ) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch( file ) { + case TGSI_FILE_CONSTANT: { + unsigned char buffer[32] ALIGN16_ATTRIB; + unsigned i; + + for (i = 0; i < 4; i++) { + const float *ptr = mach->Consts[index->i[i]]; + const uint64_t addr = (uint64_t)(uintptr_t) ptr; + const unsigned size = ((addr & 0x0f) == 0) ? 
16 : 32; + + mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0); + wait_on_mask(1 << TAG_VERTEX_BUFFER); + + (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) + + (sizeof(float) * swizzle)], sizeof(float)); + } + break; + } + + case TGSI_FILE_INPUT: + chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_TEMPORARY: + chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_IMMEDIATE: + assert( index->i[0] < (int) mach->ImmLimit ); + assert( index->i[1] < (int) mach->ImmLimit ); + assert( index->i[2] < (int) mach->ImmLimit ); + assert( index->i[3] < (int) mach->ImmLimit ); + + chan->f[0] = mach->Imms[index->i[0]][swizzle]; + chan->f[1] = mach->Imms[index->i[1]][swizzle]; + chan->f[2] = mach->Imms[index->i[2]][swizzle]; + chan->f[3] = mach->Imms[index->i[3]][swizzle]; + break; + + case TGSI_FILE_ADDRESS: + chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_OUTPUT: + /* vertex/fragment output vars can be read too */ + chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + default: + assert( 0 ); + } + break; + + case TGSI_EXTSWIZZLE_ZERO: + *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]; + break; + + case TGSI_EXTSWIZZLE_ONE: + 
*chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]; + break; + + default: + assert( 0 ); + } +} + +static void +fetch_source( + const struct spu_exec_machine *mach, + union spu_exec_channel *chan, + const struct tgsi_full_src_register *reg, + const uint chan_index ) +{ + union spu_exec_channel index; + uint swizzle; + + index.i[0] = + index.i[1] = + index.i[2] = + index.i[3] = reg->SrcRegister.Index; + + if (reg->SrcRegister.Indirect) { + union spu_exec_channel index2; + union spu_exec_channel indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle(&reg->SrcRegisterInd, + CHAN_X); + fetch_src_file_channel( + mach, + reg->SrcRegisterInd.File, + swizzle, + &index2, + &indir_index ); + + index.q = si_a(index.q, indir_index.q); + } + + if( reg->SrcRegister.Dimension ) { + switch( reg->SrcRegister.File ) { + case TGSI_FILE_INPUT: + index.q = si_mpyi(index.q, 17); + break; + case TGSI_FILE_CONSTANT: + index.q = si_shli(index.q, 12); + break; + default: + assert( 0 ); + } + + index.i[0] += reg->SrcRegisterDim.Index; + index.i[1] += reg->SrcRegisterDim.Index; + index.i[2] += reg->SrcRegisterDim.Index; + index.i[3] += reg->SrcRegisterDim.Index; + + if (reg->SrcRegisterDim.Indirect) { + union spu_exec_channel index2; + union spu_exec_channel indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterDimInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X ); + fetch_src_file_channel( + mach, + reg->SrcRegisterDimInd.File, + swizzle, + &index2, + &indir_index ); + + index.q = si_a(index.q, indir_index.q); + } + } + + swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index ); + fetch_src_file_channel( + mach, + reg->SrcRegister.File, + swizzle, + &index, + chan ); + + switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) { + case TGSI_UTIL_SIGN_CLEAR: + chan->q = micro_abs(chan->q); + 
break; + + case TGSI_UTIL_SIGN_SET: + chan->q = micro_set_sign(chan->q); + break; + + case TGSI_UTIL_SIGN_TOGGLE: + chan->q = micro_neg(chan->q); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + } + + if (reg->SrcRegisterExtMod.Complement) { + chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q); + } +} + +static void +store_dest( + struct spu_exec_machine *mach, + const union spu_exec_channel *chan, + const struct tgsi_full_dst_register *reg, + const struct tgsi_full_instruction *inst, + uint chan_index ) +{ + union spu_exec_channel *dst; + + switch( reg->DstRegister.File ) { + case TGSI_FILE_NULL: + return; + + case TGSI_FILE_OUTPUT: + dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] + + reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_TEMPORARY: + dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_ADDRESS: + dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index]; + break; + + default: + assert( 0 ); + return; + } + + switch (inst->Instruction.Saturate) + { + case TGSI_SAT_NONE: + if (mach->ExecMask & 0x1) + dst->i[0] = chan->i[0]; + if (mach->ExecMask & 0x2) + dst->i[1] = chan->i[1]; + if (mach->ExecMask & 0x4) + dst->i[2] = chan->i[2]; + if (mach->ExecMask & 0x8) + dst->i[3] = chan->i[3]; + break; + + case TGSI_SAT_ZERO_ONE: + /* XXX need to obey ExecMask here */ + dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q); + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + assert( 0 ); + break; + + default: + assert( 0 ); + } +} + +#define FETCH(VAL,INDEX,CHAN)\ + fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN) + +#define STORE(VAL,INDEX,CHAN)\ + store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN ) + + +/** + * Execute ARB-style KIL which is predicated by a src register. + * Kill fragment if any of the four values is less than zero. 
+ */ +static void +exec_kilp(struct spu_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + uint uniquemask; + uint chan_index; + uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ + union spu_exec_channel r[1]; + + /* This mask stores component bits that were already tested. Note that + * we test if the value is less than zero, so 1.0 and 0.0 need not be + * tested. */ + uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE); + + for (chan_index = 0; chan_index < 4; chan_index++) + { + uint swizzle; + uint i; + + /* unswizzle channel */ + swizzle = tgsi_util_get_full_src_register_extswizzle ( + &inst->FullSrcRegisters[0], + chan_index); + + /* check if the component has not been already tested */ + if (uniquemask & (1 << swizzle)) + continue; + uniquemask |= 1 << swizzle; + + FETCH(&r[0], 0, chan_index); + for (i = 0; i < 4; i++) + if (r[0].f[i] < 0.0f) + kilmask |= 1 << i; + } + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; +} + + +/* + * Fetch a texel using STR texture coordinates. 
+ */ +static void +fetch_texel( struct spu_sampler *sampler, + const union spu_exec_channel *s, + const union spu_exec_channel *t, + const union spu_exec_channel *p, + float lodbias, /* XXX should be float[4] */ + union spu_exec_channel *r, + union spu_exec_channel *g, + union spu_exec_channel *b, + union spu_exec_channel *a ) +{ + qword rgba[4]; + qword out[4]; + + sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba); + + _transpose_matrix4x4(out, rgba); + r->q = out[0]; + g->q = out[1]; + b->q = out[2]; + a->q = out[3]; +} + + +static void +exec_tex(struct spu_exec_machine *mach, + const struct tgsi_full_instruction *inst, + boolean biasLod) +{ + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + union spu_exec_channel r[8]; + uint chan_index; + float lodBias; + + /* printf("Sampler %u unit %u\n", sampler, unit); */ + + switch (inst->InstructionExtTexture.Texture) { + case TGSI_TEXTURE_1D: + + FETCH(&r[0], 0, CHAN_X); + + switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) { + case TGSI_EXTSWIZZLE_W: + FETCH(&r[1], 0, CHAN_W); + r[0].q = micro_div(r[0].q, r[1].q); + break; + + case TGSI_EXTSWIZZLE_ONE: + break; + + default: + assert (0); + } + + if (biasLod) { + FETCH(&r[1], 0, CHAN_W); + lodBias = r[2].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */ + &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */ + break; + + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) { + case TGSI_EXTSWIZZLE_W: + FETCH(&r[3], 0, CHAN_W); + r[0].q = micro_div(r[0].q, r[3].q); + r[1].q = micro_div(r[1].q, r[3].q); + r[2].q = micro_div(r[2].q, r[3].q); + break; + + case TGSI_EXTSWIZZLE_ONE: + break; + + default: + assert (0); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + 
+ fetch_texel(&mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, /* inputs */ + &r[0], &r[1], &r[2], &r[3]); /* outputs */ + break; + + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) { + case TGSI_EXTSWIZZLE_W: + FETCH(&r[3], 0, CHAN_W); + r[0].q = micro_div(r[0].q, r[3].q); + r[1].q = micro_div(r[1].q, r[3].q); + r[2].q = micro_div(r[2].q, r[3].q); + break; + + case TGSI_EXTSWIZZLE_ONE: + break; + + default: + assert (0); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, + &r[0], &r[1], &r[2], &r[3]); + break; + + default: + assert (0); + } + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[chan_index], 0, chan_index ); + } +} + + + +static void +constant_interpolation( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + unsigned i; + + for( i = 0; i < QUAD_SIZE; i++ ) { + mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan]; + } +} + +static void +linear_interpolation( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + mach->Inputs[attrib].xyzw[chan].f[0] = a0; + mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx; + mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady; + mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady; +} + +static void +perspective_interpolation( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + 
const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + const float *w = mach->QuadPos.xyzw[3].f; + /* divide by W here */ + mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0]; + mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1]; + mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2]; + mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3]; +} + + +typedef void (* interpolation_func)( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ); + +static void +exec_declaration(struct spu_exec_machine *mach, + const struct tgsi_full_declaration *decl) +{ + if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { + if( decl->Declaration.File == TGSI_FILE_INPUT ) { + unsigned first, last, mask; + interpolation_func interp; + + assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE ); + + first = decl->u.DeclarationRange.First; + last = decl->u.DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + switch( decl->Interpolation.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + interp = constant_interpolation; + break; + + case TGSI_INTERPOLATE_LINEAR: + interp = linear_interpolation; + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + interp = perspective_interpolation; + break; + + default: + assert( 0 ); + } + + if( mask == TGSI_WRITEMASK_XYZW ) { + unsigned i, j; + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + interp( mach, i, j ); + } + } + } + else { + unsigned i, j; + + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + for( i = first; i <= last; i++ ) { + interp( mach, i, j ); + } + } + } + } + } + } +} + +static void +exec_instruction( + struct spu_exec_machine *mach, + const struct tgsi_full_instruction *inst, + int *pc ) +{ + uint chan_index; + union spu_exec_channel r[8]; + + (*pc)++; + + switch (inst->Instruction.Opcode) { + case 
TGSI_OPCODE_ARL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = si_cflts(r[0].q, 0); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOV: + /* TGSI_OPCODE_SWZ */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LIT: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_X ); + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[1], 0, CHAN_Y ); + r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + + FETCH( &r[2], 0, CHAN_W ); + r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q); + r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q); + r[1].q = micro_pow(r[1].q, r[2].q); + + /* r0 = (r0 > 0.0) ? 
r1 : 0.0 + */ + r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q, + r[0].q); + STORE( &r[0], 0, CHAN_Z ); + } + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_sqrt(r[0].q); + r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXP: + assert (0); + break; + + case TGSI_OPCODE_LOG: + assert (0); + break; + + case TGSI_OPCODE_MUL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) + { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = si_fm(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_ADD: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_fa(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + + FETCH( &r[1], 0, CHAN_Z ); + FETCH( &r[2], 1, CHAN_Z ); + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH(&r[1], 0, 
CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 0, CHAN_W); + FETCH(&r[2], 1, CHAN_W); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DST: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + FETCH( &r[0], 0, CHAN_Y ); + FETCH( &r[1], 1, CHAN_Y); + r[0].q = si_fm(r[0].q, r[1].q); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_Z ); + STORE( &r[0], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + FETCH( &r[0], 1, CHAN_W ); + STORE( &r[0], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MIN: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = micro_min(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_MAX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = micro_max(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = micro_ge(r[0].q, r[1].q); + r[0].q = si_xori(r[0].q, 0xff); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = micro_ge(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + FOR_EACH_ENABLED_CHANNEL( *inst, 
chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + FETCH( &r[2], 2, chan_index ); + r[0].q = si_fma(r[0].q, r[1].q, r[2].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SUB: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = si_fs(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_LERP: + /* TGSI_OPCODE_LRP */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + r[1].q = si_fs(r[1].q, r[2].q); + r[0].q = si_fma(r[0].q, r[1].q, r[2].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_CND: + assert (0); + break; + + case TGSI_OPCODE_CND0: + assert (0); + break; + + case TGSI_OPCODE_DOT2ADD: + /* TGSI_OPCODE_DP2A */ + assert (0); + break; + + case TGSI_OPCODE_INDEX: + assert (0); + break; + + case TGSI_OPCODE_NEGATE: + assert (0); + break; + + case TGSI_OPCODE_FRAC: + /* TGSI_OPCODE_FRC */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_frc(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CLAMP: + assert (0); + break; + + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_flr(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_ROUND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_rnd(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXPBASE2: + /* TGSI_OPCODE_EX2 */ + FETCH(&r[0], 0, CHAN_X); + + r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LOGBASE2: + /* TGSI_OPCODE_LG2 */ + FETCH( 
&r[0], 0, CHAN_X ); + r[0].q = micro_lg2(r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_POWER: + /* TGSI_OPCODE_POW */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + r[0].q = micro_pow(r[0].q, r[1].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CROSSPRODUCT: + /* TGSI_OPCODE_XPD */ + FETCH(&r[0], 0, CHAN_Y); + FETCH(&r[1], 1, CHAN_Z); + FETCH(&r[3], 0, CHAN_Z); + FETCH(&r[4], 1, CHAN_Y); + + /* r2 = (r0 * r1) - (r3 * r5) + */ + r[2].q = si_fm(r[3].q, r[5].q); + r[2].q = si_fms(r[0].q, r[1].q, r[2].q); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &r[2], 0, CHAN_X ); + } + + FETCH(&r[2], 1, CHAN_X); + FETCH(&r[5], 0, CHAN_X); + + /* r3 = (r3 * r2) - (r1 * r5) + */ + r[1].q = si_fm(r[1].q, r[5].q); + r[3].q = si_fms(r[3].q, r[2].q, r[1].q); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + STORE( &r[3], 0, CHAN_Y ); + } + + /* r5 = (r5 * r4) - (r0 * r2) + */ + r[0].q = si_fm(r[0].q, r[2].q); + r[5].q = si_fms(r[5].q, r[4].q, r[0].q); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + STORE( &r[5], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MULTIPLYMATRIX: + assert (0); + break; + + case TGSI_OPCODE_ABS: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + + r[0].q = micro_abs(r[0].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_RCC: + assert (0); + break; + + case TGSI_OPCODE_DPH: + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 1, CHAN_W); + + r[0].q = si_fa(r[0].q, r[1].q); 
+ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_COS: + FETCH(&r[0], 0, CHAN_X); + + r[0].q = micro_cos(r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_ddx(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDY: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_ddy(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_KILP: + exec_kilp (mach, inst); + break; + + case TGSI_OPCODE_KIL: + /* for enabled ExecMask bits, set the killed bit */ + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask; + break; + + case TGSI_OPCODE_PK2H: + assert (0); + break; + + case TGSI_OPCODE_PK2US: + assert (0); + break; + + case TGSI_OPCODE_PK4B: + assert (0); + break; + + case TGSI_OPCODE_PK4UB: + assert (0); + break; + + case TGSI_OPCODE_RFL: + assert (0); + break; + + case TGSI_OPCODE_SEQ: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_fceq(r[0].q, r[1].q); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SFL: + assert (0); + break; + + case TGSI_OPCODE_SGT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_fcgt(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SIN: + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_sin(r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_fcgt(r[0].q, r[1].q); + r[0].q 
= si_xori(r[0].q, 0xff); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SNE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_fceq(r[0].q, r[1].q); + r[0].q = si_xori(r[0].q, 0xff); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_STR: + assert (0); + break; + + case TGSI_OPCODE_TEX: + /* simple texture lookup */ + /* src[0] = texcoord */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, FALSE); + break; + + case TGSI_OPCODE_TXB: + /* Texture lookup with lod bias */ + /* src[0] = texcoord (src[0].w = load bias) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE); + break; + + case TGSI_OPCODE_TXD: + /* Texture lookup with explict partial derivatives */ + /* src[0] = texcoord */ + /* src[1] = d[strq]/dx */ + /* src[2] = d[strq]/dy */ + /* src[3] = sampler unit */ + assert (0); + break; + + case TGSI_OPCODE_TXL: + /* Texture lookup with explit LOD */ + /* src[0] = texcoord (src[0].w = load bias) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE); + break; + + case TGSI_OPCODE_UP2H: + assert (0); + break; + + case TGSI_OPCODE_UP2US: + assert (0); + break; + + case TGSI_OPCODE_UP4B: + assert (0); + break; + + case TGSI_OPCODE_UP4UB: + assert (0); + break; + + case TGSI_OPCODE_X2D: + assert (0); + break; + + case TGSI_OPCODE_ARA: + assert (0); + break; + + case TGSI_OPCODE_ARR: + assert (0); + break; + + case TGSI_OPCODE_BRA: + assert (0); + break; + + case TGSI_OPCODE_CAL: + /* skip the call if no execution channels are enabled */ + if (mach->ExecMask) { + /* do the call */ + + /* push the Cond, Loop, Cont stacks */ + assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + 
mach->ContStack[mach->ContStackTop++] = mach->ContMask; + + assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING); + mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask; + + /* note that PC was already incremented above */ + mach->CallStack[mach->CallStackTop++] = *pc; + *pc = inst->InstructionExtLabel.Label; + } + break; + + case TGSI_OPCODE_RET: + mach->FuncMask &= ~mach->ExecMask; + UPDATE_EXEC_MASK(mach); + + if (mach->ExecMask == 0x0) { + /* really return now (otherwise, keep executing */ + + if (mach->CallStackTop == 0) { + /* returning from main() */ + *pc = -1; + return; + } + *pc = mach->CallStack[--mach->CallStackTop]; + + /* pop the Cond, Loop, Cont stacks */ + assert(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + assert(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + assert(mach->FuncStackTop > 0); + mach->FuncMask = mach->FuncStack[--mach->FuncStackTop]; + + UPDATE_EXEC_MASK(mach); + } + break; + + case TGSI_OPCODE_SSG: + assert (0); + break; + + case TGSI_OPCODE_CMP: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + /* r0 = (r0 < 0.0) ? 
r1 : r2 + */ + r[3].q = si_xor(r[3].q, r[3].q); + r[0].q = micro_lt(r[0].q, r[3].q); + r[0].q = si_selb(r[1].q, r[2].q, r[0].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_SCS: + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( &r[0], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { + r[1].q = micro_cos(r[0].q); + STORE( &r[1], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + r[1].q = micro_sin(r[0].q); + STORE( &r[1], 0, CHAN_Y ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_NRM: + assert (0); + break; + + case TGSI_OPCODE_DIV: + assert( 0 ); + break; + + case TGSI_OPCODE_DP2: + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_IF: + /* push CondMask */ + assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + FETCH( &r[0], 0, CHAN_X ); + /* update CondMask */ + if( ! r[0].u[0] ) { + mach->CondMask &= ~0x1; + } + if( ! r[0].u[1] ) { + mach->CondMask &= ~0x2; + } + if( ! r[0].u[2] ) { + mach->CondMask &= ~0x4; + } + if( ! 
r[0].u[3] ) { + mach->CondMask &= ~0x8; + } + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ELSE */ + break; + + case TGSI_OPCODE_ELSE: + /* invert CondMask wrt previous mask */ + { + uint prevMask; + assert(mach->CondStackTop > 0); + prevMask = mach->CondStack[mach->CondStackTop - 1]; + mach->CondMask = ~mach->CondMask & prevMask; + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ENDIF */ + } + break; + + case TGSI_OPCODE_ENDIF: + /* pop CondMask */ + assert(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_END: + /* halt execution */ + *pc = -1; + break; + + case TGSI_OPCODE_REP: + assert (0); + break; + + case TGSI_OPCODE_ENDREP: + assert (0); + break; + + case TGSI_OPCODE_PUSHA: + assert (0); + break; + + case TGSI_OPCODE_POPA: + assert (0); + break; + + case TGSI_OPCODE_CEIL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_ceil(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_I2F: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = si_csflt(r[0].q, 0); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_NOT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = si_xorbi(r[0].q, 0xff); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_TRUNC: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_trunc(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_shl(r[0].q, r[1].q); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); 
+ r[0].q = micro_ishr(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_AND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_and(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_OR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_or(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOD: + assert (0); + break; + + case TGSI_OPCODE_XOR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_xor(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SAD: + assert (0); + break; + + case TGSI_OPCODE_TXF: + assert (0); + break; + + case TGSI_OPCODE_TXQ: + assert (0); + break; + + case TGSI_OPCODE_EMIT: + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++; + break; + + case TGSI_OPCODE_ENDPRIM: + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0; + break; + + case TGSI_OPCODE_LOOP: + /* fall-through (for now) */ + case TGSI_OPCODE_BGNLOOP2: + /* push LoopMask and ContMasks */ + assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + break; + + case TGSI_OPCODE_ENDLOOP: + /* fall-through (for now at least) */ + case TGSI_OPCODE_ENDLOOP2: + /* Restore ContMask, but don't pop */ + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[mach->ContStackTop - 1]; + if (mach->LoopMask) { + /* repeat loop: jump to instruction just past BGNLOOP */ + *pc 
= inst->InstructionExtLabel.Label + 1; + } + else { + /* exit loop: pop LoopMask */ + assert(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + /* pop ContMask */ + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + } + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BRK: + /* turn off loop channels for each enabled exec channel */ + mach->LoopMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_CONT: + /* turn off cont channels for each enabled exec channel */ + mach->ContMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BGNSUB: + /* no-op */ + break; + + case TGSI_OPCODE_ENDSUB: + /* no-op */ + break; + + case TGSI_OPCODE_NOISE1: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE2: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE3: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE4: + assert( 0 ); + break; + + case TGSI_OPCODE_NOP: + break; + + default: + assert( 0 ); + } +} + + +/** + * Run TGSI interpreter. 
+ * \return bitmask of "alive" quad components + */ +uint +spu_exec_machine_run( struct spu_exec_machine *mach ) +{ + uint i; + int pc = 0; + + mach->CondMask = 0xf; + mach->LoopMask = 0xf; + mach->ContMask = 0xf; + mach->FuncMask = 0xf; + mach->ExecMask = 0xf; + + mach->CondStackTop = 0; /* temporarily subvert this assertion */ + assert(mach->CondStackTop == 0); + assert(mach->LoopStackTop == 0); + assert(mach->ContStackTop == 0); + assert(mach->CallStackTop == 0); + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; + + if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) { + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0; + mach->Primitives[0] = 0; + } + + + /* execute declarations (interpolants) */ + if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { + for (i = 0; i < mach->NumDeclarations; i++) { + uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB; + struct tgsi_full_declaration decl; + unsigned long decl_addr = (unsigned long) (mach->Declarations+i); + unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f); + + mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0); + wait_on_mask(1 << TAG_INSTRUCTION_FETCH); + + memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl)); + exec_declaration( mach, &decl ); + } + } + + /* execute instructions, until pc is set to -1 */ + while (pc != -1) { + uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB; + struct tgsi_full_instruction inst; + unsigned long inst_addr = (unsigned long) (mach->Instructions + pc); + unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f); + + assert(pc < mach->NumInstructions); + mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0); + wait_on_mask(1 << TAG_INSTRUCTION_FETCH); + + memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst)); + exec_instruction( mach, & inst, &pc ); + } + +#if 0 + /* we scale from 
floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */ + if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) { + /* + * Scale back depth component. + */ + for (i = 0; i < 4; i++) + mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF; + } +#endif + + return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; +} + + diff --git a/src/mesa/pipe/cell/spu/spu_exec.h b/src/mesa/pipe/cell/spu/spu_exec.h new file mode 100644 index 0000000000..b4c7661ef6 --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_exec.h @@ -0,0 +1,172 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#if !defined SPU_EXEC_H +#define SPU_EXEC_H + +#include "pipe/p_compiler.h" +#include "pipe/tgsi/exec/tgsi_exec.h" + +#if defined __cplusplus +extern "C" { +#endif + +/** + * Registers may be treated as float, signed int or unsigned int. + */ +union spu_exec_channel +{ + float f[QUAD_SIZE]; + int i[QUAD_SIZE]; + unsigned u[QUAD_SIZE]; + qword q; +}; + +/** + * A vector[RGBA] of channels[4 pixels] + */ +struct spu_exec_vector +{ + union spu_exec_channel xyzw[NUM_CHANNELS]; +}; + +/** + * For fragment programs, information for computing fragment input + * values from plane equation of the triangle/line. + */ +struct spu_interp_coef +{ + float a0[NUM_CHANNELS]; /* in an xyzw layout */ + float dadx[NUM_CHANNELS]; + float dady[NUM_CHANNELS]; +}; + + +struct softpipe_tile_cache; /**< Opaque to TGSI */ + +/** + * Information for sampling textures, which must be implemented + * by code outside the TGSI executor. + */ +struct spu_sampler +{ + const struct pipe_sampler_state *state; + struct pipe_texture *texture; + /** Get samples for four fragments in a quad */ + void (*get_samples)(struct spu_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]); + void *pipe; /*XXX temporary*/ + struct softpipe_tile_cache *cache; +}; + + +/** + * Run-time virtual machine state for executing TGSI shader. + */ +struct spu_exec_machine +{ + /* + * 32 program temporaries + * 4 internal temporaries + * 1 address + */ + struct spu_exec_vector Temps[TGSI_EXEC_NUM_TEMPS + + TGSI_EXEC_NUM_ADDRS + 1] + ALIGN16_ATTRIB; + + struct spu_exec_vector *Addrs; + + struct spu_sampler *Samplers; + + float Imms[TGSI_EXEC_NUM_IMMEDIATES][4]; + unsigned ImmLimit; + float (*Consts)[4]; + struct spu_exec_vector *Inputs; + struct spu_exec_vector *Outputs; + unsigned Processor; + + /* GEOMETRY processor only. 
*/ + unsigned *Primitives; + + /* FRAGMENT processor only. */ + const struct spu_interp_coef *InterpCoefs; + struct spu_exec_vector QuadPos; + + /* Conditional execution masks */ + uint CondMask; /**< For IF/ELSE/ENDIF */ + uint LoopMask; /**< For BGNLOOP/ENDLOOP */ + uint ContMask; /**< For loop CONT statements */ + uint FuncMask; /**< For function calls */ + uint ExecMask; /**< = CondMask & LoopMask */ + + /** Condition mask stack (for nested conditionals) */ + uint CondStack[TGSI_EXEC_MAX_COND_NESTING]; + int CondStackTop; + + /** Loop mask stack (for nested loops) */ + uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int LoopStackTop; + + /** Loop continue mask stack (see comments in tgsi_exec.c) */ + uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int ContStackTop; + + /** Function execution mask stack (for executing subroutine code) */ + uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING]; + int FuncStackTop; + + /** Function call stack for saving/restoring the program counter */ + uint CallStack[TGSI_EXEC_MAX_CALL_NESTING]; + int CallStackTop; + + struct tgsi_full_instruction *Instructions; + uint NumInstructions; + + struct tgsi_full_declaration *Declarations; + uint NumDeclarations; +}; + + +extern void +spu_exec_machine_init(struct spu_exec_machine *mach, + uint numSamplers, + struct spu_sampler *samplers, + unsigned processor); + +extern uint +spu_exec_machine_run( struct spu_exec_machine *mach ); + + +#if defined __cplusplus +} /* extern "C" */ +#endif + +#endif /* SPU_EXEC_H */ diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c index 0c83900a18..e375197fe6 100644 --- a/src/mesa/pipe/cell/spu/spu_main.c +++ b/src/mesa/pipe/cell/spu/spu_main.c @@ -31,11 +31,13 @@ #include <stdio.h> #include <libmisc.h> -#include <spu_mfcio.h> #include "spu_main.h" -#include "spu_tri.h" +#include "spu_render.h" +#include "spu_texture.h" #include "spu_tile.h" +//#include "spu_test.h" +#include "spu_vertex_shader.h" #include "pipe/cell/common.h" #include 
"pipe/p_defines.h" @@ -46,28 +48,37 @@ helpful headers: /opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h */ -static boolean Debug = FALSE; +boolean Debug = FALSE; struct spu_global spu; +struct spu_vs_context draw; -void -wait_on_mask(unsigned tagMask) +/** + * Tell the PPU that this SPU has finished copying a buffer to + * local store and that it may be reused by the PPU. + * This is done by writting a 16-byte batch-buffer-status block back into + * main memory (in cell_context->buffer_status[]). + */ +static void +release_buffer(uint buffer) { - mfc_write_tag_mask( tagMask ); - /* wait for completion of _any_ DMAs specified by tagMask */ - mfc_read_tag_status_any(); -} + /* Evidently, using less than a 16-byte status doesn't work reliably */ + static const uint status[4] ALIGN16_ATTRIB + = {CELL_BUFFER_STATUS_FREE, 0, 0, 0}; + const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); + uint *dst = spu.init.buffer_status + index; -static void -wait_on_mask_all(unsigned tagMask) -{ - mfc_write_tag_mask( tagMask ); - /* wait for completion of _any_ DMAs specified by tagMask */ - mfc_read_tag_status_all(); -} + ASSERT(buffer < CELL_NUM_BUFFERS); + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_MISC, /* tag is unimportant */ + 0, /* tid */ + 0 /* rid */); +} /** @@ -81,24 +92,24 @@ really_clear_tiles(uint surfaceIndex) uint i; if (surfaceIndex == 0) { - clear_c_tile(&ctile); + clear_c_tile(&spu.ctile); for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { uint tx = i % spu.fb.width_tiles; uint ty = i / spu.fb.width_tiles; - if (tile_status[ty][tx] == TILE_STATUS_CLEAR) { - put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0); + if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) { + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); } } } else { - clear_z_tile(&ztile); + clear_z_tile(&spu.ztile); for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { uint 
tx = i % spu.fb.width_tiles; uint ty = i / spu.fb.width_tiles; - if (tile_status_z[ty][tx] == TILE_STATUS_CLEAR) - put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 1); + if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR) + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1); } } @@ -122,11 +133,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear) #if CLEAR_OPT /* set all tile's status to CLEAR */ if (clear->surface == 0) { - memset(tile_status, TILE_STATUS_CLEAR, sizeof(tile_status)); + memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); spu.fb.color_clear_value = clear->value; } else { - memset(tile_status_z, TILE_STATUS_CLEAR, sizeof(tile_status_z)); + memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); spu.fb.depth_clear_value = clear->value; } return; @@ -134,11 +145,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear) if (clear->surface == 0) { spu.fb.color_clear_value = clear->value; - clear_c_tile(&ctile); + clear_c_tile(&spu.ctile); } else { spu.fb.depth_clear_value = clear->value; - clear_z_tile(&ztile); + clear_z_tile(&spu.ztile); } /* @@ -150,9 +161,9 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear) uint tx = i % spu.fb.width_tiles; uint ty = i / spu.fb.width_tiles; if (clear->surface == 0) - put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0); + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); else - put_tile(tx, ty, &ztile, TAG_SURFACE_CLEAR, 1); + put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); /* XXX we don't want this here, but it fixes bad tile results */ } @@ -165,229 +176,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear) } -/** - * Given a rendering command's bounding box (in pixels) compute the - * location of the corresponding screen tile bounding box. 
- */ -static INLINE void -tile_bounding_box(const struct cell_command_render *render, - uint *txmin, uint *tymin, - uint *box_num_tiles, uint *box_width_tiles) -{ -#if 1 - /* Debug: full-window bounding box */ - uint txmax = spu.fb.width_tiles - 1; - uint tymax = spu.fb.height_tiles - 1; - *txmin = 0; - *tymin = 0; - *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; - *box_width_tiles = spu.fb.width_tiles; - (void) render; - (void) txmax; - (void) tymax; -#else - uint txmax, tymax, box_height_tiles; - - *txmin = (uint) render->xmin / TILE_SIZE; - *tymin = (uint) render->ymin / TILE_SIZE; - txmax = (uint) render->xmax / TILE_SIZE; - tymax = (uint) render->ymax / TILE_SIZE; - *box_width_tiles = txmax - *txmin + 1; - box_height_tiles = tymax - *tymin + 1; - *box_num_tiles = *box_width_tiles * box_height_tiles; -#endif -#if 0 - printf("Render bounds: %g, %g ... %g, %g\n", - render->xmin, render->ymin, render->xmax, render->ymax); - printf("Render tiles: %u, %u .. %u, %u\n", *txmin, *tymin, txmax, tymax); -#endif -} - - -/** - * Render primitives - * \param pos_incr returns value indicating how may words to skip after - * this command in the batch buffer - */ static void -cmd_render(const struct cell_command_render *render, uint *pos_incr) +cmd_release_verts(const struct cell_command_release_verts *release) { - /* we'll DMA into these buffers */ - ubyte vertex_data[CELL_MAX_VBUF_SIZE] ALIGN16_ATTRIB; - ushort index_data[CELL_MAX_VBUF_INDEXES] ALIGN16_ATTRIB; - const uint vertex_size = render->vertex_size; /* in bytes */ - const uint total_vertex_bytes = render->num_verts * vertex_size; - const ubyte *vertices; - const ushort *indexes; - uint mask; - uint i, j; - - - if (Debug) { - printf("SPU %u: RENDER prim %u, num_vert=%u num_ind=%u " - "inline_vert=%u inline_ind=%u\n", - spu.init.id, - render->prim_type, - render->num_verts, - render->num_indexes, - render->inline_verts, - render->inline_indexes); - - /* - printf(" bound: %g, %g .. 
%g, %g\n", - render->xmin, render->ymin, render->xmax, render->ymax); - */ - printf("SPU %u: indices at %p vertices at %p\n", - spu.init.id, - render->index_data, render->vertex_data); - } - - ASSERT(sizeof(*render) % 4 == 0); - ASSERT_ALIGN16(render->vertex_data); - ASSERT_ALIGN16(render->index_data); - - - /** - ** Get vertex, index buffers if not inlined - **/ - if (!render->inline_verts) { - ASSERT(total_vertex_bytes % 16 == 0); - - mfc_get(vertex_data, /* dest */ - (unsigned int) render->vertex_data, /* src */ - total_vertex_bytes, /* size */ - TAG_VERTEX_BUFFER, - 0, /* tid */ - 0 /* rid */); - - vertices = vertex_data; - } - - if (!render->inline_indexes) { - uint total_index_bytes; - - *pos_incr = 0; - - total_index_bytes = render->num_indexes * sizeof(ushort); - if (total_index_bytes < 16) - total_index_bytes = 16; - else - total_index_bytes = ROUNDUP16(total_index_bytes); - - indexes = index_data; - - /* get index data from main memory */ - mfc_get(index_data, /* dest */ - (unsigned int) render->index_data, /* src */ - total_index_bytes, - TAG_INDEX_BUFFER, - 0, /* tid */ - 0 /* rid */); - } - - - /** - ** Get pointers to inlined indexes, verts, if present - **/ - if (render->inline_indexes) { - /* indexes are right after the render command in the batch buffer */ - indexes = (ushort *) (render + 1); - *pos_incr = (render->num_indexes * 2 + 3) / 4; - - if (render->inline_verts) { - /* vertices are after indexes, if inlined */ - vertices = (const ubyte *) (render + 1) + *pos_incr * 4; - *pos_incr = *pos_incr + total_vertex_bytes / 4; - } - } - - - /* wait for vertex and/or index buffers if not inlined */ - mask = 0x0; - if (!render->inline_verts) - mask |= (1 << TAG_VERTEX_BUFFER); - if (!render->inline_indexes) - mask |= (1 << TAG_INDEX_BUFFER); - wait_on_mask_all(mask); - - - /** - ** find tiles which intersect the prim bounding box - **/ - uint txmin, tymin, box_width_tiles, box_num_tiles; -#if 0 - tile_bounding_box(render, &txmin, &tymin, - 
&box_num_tiles, &box_width_tiles); -#else - txmin = 0; - tymin = 0; - box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; - box_width_tiles = spu.fb.width_tiles; -#endif - - /* make sure any pending clears have completed */ - wait_on_mask(1 << TAG_SURFACE_CLEAR); - - - /** - ** loop over tiles, rendering tris - **/ - for (i = spu.init.id; i < box_num_tiles; i += spu.init.num_spus) { - const uint tx = txmin + i % box_width_tiles; - const uint ty = tymin + i / box_width_tiles; - - ASSERT(tx < spu.fb.width_tiles); - ASSERT(ty < spu.fb.height_tiles); - - /* Start fetching color/z tiles. We'll wait for completion when - * we need read/write to them later in triangle rasterization. - */ - if (spu.depth_stencil.depth.enabled) { - if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) { - get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1); - } - } - - if (tile_status[ty][tx] != TILE_STATUS_CLEAR) { - get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0); - } - - ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES); - ASSERT(render->num_indexes % 3 == 0); - - /* loop over tris */ - for (j = 0; j < render->num_indexes; j += 3) { - const float *v0, *v1, *v2; - - v0 = (const float *) (vertices + indexes[j+0] * vertex_size); - v1 = (const float *) (vertices + indexes[j+1] * vertex_size); - v2 = (const float *) (vertices + indexes[j+2] * vertex_size); - - tri_draw(v0, v1, v2, tx, ty); - } - - /* write color/z tiles back to main framebuffer, if dirtied */ - if (tile_status[ty][tx] == TILE_STATUS_DIRTY) { - put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0); - tile_status[ty][tx] = TILE_STATUS_DEFINED; - } - if (spu.depth_stencil.depth.enabled) { - if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) { - put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1); - tile_status_z[ty][tx] = TILE_STATUS_DEFINED; - } - } - - /* XXX move these... 
*/ - wait_on_mask(1 << TAG_WRITE_TILE_COLOR); - if (spu.depth_stencil.depth.enabled) { - wait_on_mask(1 << TAG_WRITE_TILE_Z); - } - } - if (Debug) - printf("SPU %u: RENDER done\n", - spu.init.id); + printf("SPU %u: RELEASE VERTS %u\n", + spu.init.id, release->vertex_buf); + ASSERT(release->vertex_buf != ~0U); + release_buffer(release->vertex_buf); } @@ -421,6 +217,29 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) spu.fb.zsize = 2; else spu.fb.zsize = 0; + + if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM) + spu.color_shuffle = ((vector unsigned char) { + 12, 0, 4, 8, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}); + else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM) + spu.color_shuffle = ((vector unsigned char) { + 8, 4, 0, 12, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}); + else + ASSERT(0); +} + + +static void +cmd_state_blend(const struct pipe_blend_state *state) +{ + if (Debug) + printf("SPU %u: BLEND: enabled %d\n", + spu.init.id, + state->blend_enable); + + memcpy(&spu.blend, state, sizeof(*state)); } @@ -444,19 +263,53 @@ cmd_state_sampler(const struct pipe_sampler_state *state) spu.init.id); memcpy(&spu.sampler[0], state, sizeof(*state)); + if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR) + spu.sample_texture = sample_texture_bilinear; + else + spu.sample_texture = sample_texture_nearest; } static void -cmd_state_vertex_info(const struct vertex_info *vinfo) +cmd_state_texture(const struct cell_command_texture *texture) { if (Debug) + printf("SPU %u: TEXTURE at %p size %u x %u\n", + spu.init.id, texture->start, texture->width, texture->height); + + memcpy(&spu.texture, texture, sizeof(*texture)); + spu.tex_size = (vector float) + { spu.texture.width, spu.texture.height, 0.0, 0.0}; + spu.tex_size_mask = (vector unsigned int) + { spu.texture.width - 1, spu.texture.height - 1, 0, 0 }; +} + + +static void +cmd_state_vertex_info(const struct vertex_info *vinfo) +{ + if (Debug) { printf("SPU %u: VERTEX_INFO num_attribs=%u\n", 
spu.init.id, vinfo->num_attribs); + } + ASSERT(vinfo->num_attribs >= 1); + ASSERT(vinfo->num_attribs <= 8); memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); } +static void +cmd_state_vs_array_info(const struct cell_array_info *vs_info) +{ + const unsigned attr = vs_info->attr; + + ASSERT(attr < PIPE_ATTRIB_MAX); + draw.vertex_fetch.src_ptr[attr] = vs_info->base; + draw.vertex_fetch.pitch[attr] = vs_info->pitch; + draw.vertex_fetch.format[attr] = vs_info->format; + draw.vertex_fetch.dirty = 1; +} + static void cmd_finish(void) @@ -473,38 +326,6 @@ cmd_finish(void) /** - * Tell the PPU that this SPU has finished copying a batch buffer to - * local store and that it may be reused by the PPU. - * This is done by writting a 16-byte batch-buffer-status block back into - * main memory (in cell_contex->buffer_status[]). - */ -static void -release_batch_buffer(uint buffer) -{ - /* Evidently, using less than a 16-byte status doesn't work reliably */ - static const uint status[4] ALIGN16_ATTRIB - = {CELL_BUFFER_STATUS_FREE, 0, 0, 0}; - - const uint index = 4 * (spu.init.id * CELL_NUM_BATCH_BUFFERS + buffer); - uint *dst = spu.init.buffer_status + index; - - ASSERT(buffer < CELL_NUM_BATCH_BUFFERS); - - /* - printf("SPU %u: Set batch status buf=%u, index %u, at %p to FREE\n", - spu.init.id, buffer, index, dst); - */ - - mfc_put((void *) &status, /* src in local memory */ - (unsigned int) dst, /* dst in main memory */ - sizeof(status), /* size */ - TAG_MISC, /* tag is unimportant */ - 0, /* tid */ - 0 /* rid */); -} - - -/** * Execute a batch of commands * The opcode param encodes the location of the buffer and its size. 
*/ @@ -513,24 +334,24 @@ cmd_batch(uint opcode) { const uint buf = (opcode >> 8) & 0xff; uint size = (opcode >> 16); - uint buffer[CELL_BATCH_BUFFER_SIZE / 4] ALIGN16_ATTRIB; - const uint usize = size / sizeof(uint); + uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB; + const unsigned usize = size / sizeof(buffer[0]); uint pos; if (Debug) printf("SPU %u: BATCH buffer %u, len %u, from %p\n", - spu.init.id, buf, size, spu.init.batch_buffers[buf]); + spu.init.id, buf, size, spu.init.buffers[buf]); ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); - ASSERT_ALIGN16(spu.init.batch_buffers[buf]); + ASSERT_ALIGN16(spu.init.buffers[buf]); size = ROUNDUP16(size); - ASSERT_ALIGN16(spu.init.batch_buffers[buf]); + ASSERT_ALIGN16(spu.init.buffers[buf]); mfc_get(buffer, /* dest */ - (unsigned int) spu.init.batch_buffers[buf], /* src */ + (unsigned int) spu.init.buffers[buf], /* src */ size, TAG_BATCH_BUFFER, 0, /* tid */ @@ -538,7 +359,9 @@ cmd_batch(uint opcode) wait_on_mask(1 << TAG_BATCH_BUFFER); /* Tell PPU we're done copying the buffer to local store */ - release_batch_buffer(buf); + if (Debug) + printf("SPU %u: release batch buf %u\n", spu.init.id, buf); + release_buffer(buf); for (pos = 0; pos < usize; /* no incr */) { switch (buffer[pos]) { @@ -547,7 +370,7 @@ cmd_batch(uint opcode) struct cell_command_framebuffer *fb = (struct cell_command_framebuffer *) &buffer[pos]; cmd_state_framebuffer(fb); - pos += sizeof(*fb) / 4; + pos += sizeof(*fb) / 8; } break; case CELL_CMD_CLEAR_SURFACE: @@ -555,7 +378,7 @@ cmd_batch(uint opcode) struct cell_command_clear_surface *clr = (struct cell_command_clear_surface *) &buffer[pos]; cmd_clear_surface(clr); - pos += sizeof(*clr) / 4; + pos += sizeof(*clr) / 8; } break; case CELL_CMD_RENDER: @@ -564,28 +387,54 @@ cmd_batch(uint opcode) = (struct cell_command_render *) &buffer[pos]; uint pos_incr; cmd_render(render, &pos_incr); - pos += sizeof(*render) / 4 + pos_incr; + pos += pos_incr; + } + break; + case 
CELL_CMD_RELEASE_VERTS: + { + struct cell_command_release_verts *release + = (struct cell_command_release_verts *) &buffer[pos]; + cmd_release_verts(release); + pos += sizeof(*release) / 8; } break; case CELL_CMD_FINISH: cmd_finish(); pos += 1; break; + case CELL_CMD_STATE_BLEND: + cmd_state_blend((struct pipe_blend_state *) + &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8); + break; case CELL_CMD_STATE_DEPTH_STENCIL: cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *) &buffer[pos+1]); - pos += (1 + sizeof(struct pipe_depth_stencil_alpha_state) / 4); + pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8); break; case CELL_CMD_STATE_SAMPLER: cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]); - pos += (1 + sizeof(struct pipe_sampler_state) / 4); + pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8); + break; + case CELL_CMD_STATE_TEXTURE: + cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8); break; case CELL_CMD_STATE_VERTEX_INFO: cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); - pos += (1 + sizeof(struct vertex_info) / 4); + pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8); + break; + case CELL_CMD_STATE_VIEWPORT: + (void) memcpy(& draw.viewport, &buffer[pos+1], + sizeof(struct pipe_viewport_state)); + pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8); + break; + case CELL_CMD_STATE_VS_ARRAY_INFO: + cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); break; default: - printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]); + printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]); ASSERT(0); break; } @@ -633,31 +482,22 @@ main_loop(void) 0 /* rid */); wait_on_mask( 1 << tag ); + /* + * NOTE: most commands should be contained in a batch buffer + */ + switch (opcode & 
CELL_CMD_OPCODE_MASK) { case CELL_CMD_EXIT: if (Debug) printf("SPU %u: EXIT\n", spu.init.id); exitFlag = 1; break; - case CELL_CMD_STATE_FRAMEBUFFER: - cmd_state_framebuffer(&cmd.fb); - break; - case CELL_CMD_CLEAR_SURFACE: - cmd_clear_surface(&cmd.clear); - break; - case CELL_CMD_RENDER: - { - uint pos_incr; - cmd_render(&cmd.render, &pos_incr); - assert(pos_incr == 0); - } + case CELL_CMD_VS_EXECUTE: + spu_execute_vertex_shader(&draw, &cmd.vs); break; case CELL_CMD_BATCH: cmd_batch(opcode); break; - case CELL_CMD_FINISH: - cmd_finish(); - break; default: printf("Bad opcode!\n"); } @@ -673,11 +513,13 @@ main_loop(void) static void one_time_init(void) { - memset(tile_status, TILE_STATUS_DEFINED, sizeof(tile_status)); - memset(tile_status_z, TILE_STATUS_DEFINED, sizeof(tile_status_z)); + memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status)); + memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status)); + invalidate_tex_cache(); } + /* In some versions of the SDK the SPE main takes 'unsigned long' as a * parameter. In others it takes 'unsigned long long'. Use a define to * select between the two. 
@@ -698,6 +540,9 @@ main(main_param_t speid, main_param_t argp) (void) speid; + ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4); + ASSERT(sizeof(struct cell_command_render) % 8 == 0); + one_time_init(); if (Debug) @@ -711,6 +556,10 @@ main(main_param_t speid, main_param_t argp) 0 /* rid */); wait_on_mask( 1 << tag ); +#if 0 + if (spu.init.id==0) + spu_test_misc(); +#endif main_loop(); diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h index 5bc5d9fa99..1710a17512 100644 --- a/src/mesa/pipe/cell/spu/spu_main.h +++ b/src/mesa/pipe/cell/spu/spu_main.h @@ -29,11 +29,33 @@ #define SPU_MAIN_H +#include <spu_mfcio.h> + #include "pipe/cell/common.h" #include "pipe/draw/draw_vertex.h" #include "pipe/p_state.h" + +#define MAX_WIDTH 1024 +#define MAX_HEIGHT 1024 + + +typedef union { + ushort us[TILE_SIZE][TILE_SIZE]; + uint ui[TILE_SIZE][TILE_SIZE]; + vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4]; + vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2]; +} tile_t; + + +#define TILE_STATUS_CLEAR 1 +#define TILE_STATUS_DEFINED 2 /**< defined in FB, but not in local store */ +#define TILE_STATUS_CLEAN 3 /**< in local store, but not changed */ +#define TILE_STATUS_DIRTY 4 /**< modified locally, but not put back yet */ +#define TILE_STATUS_GETTING 5 /**< mfc_get() called but not yet arrived */ + + struct spu_framebuffer { void *color_start; /**< addr of color surface in main memory */ void *depth_start; /**< addr of depth surface in main memory */ @@ -57,18 +79,42 @@ struct spu_global struct cell_init_info init; struct spu_framebuffer fb; + struct pipe_blend_state blend_stencil; struct pipe_depth_stencil_alpha_state depth_stencil; struct pipe_blend_state blend; struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; + struct cell_command_texture texture; struct vertex_info vertex_info; /* XXX more state to come */ + + /** current color and Z tiles */ + tile_t ctile ALIGN16_ATTRIB; + tile_t ztile ALIGN16_ATTRIB; + + /** Current tiles' status */ + 
ubyte cur_ctile_status, cur_ztile_status; + + /** Status of all tiles in framebuffer */ + ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + + + /** for converting RGBA to PIPE_FORMAT_x colors */ + vector unsigned char color_shuffle; + + vector float tex_size; + vector unsigned int tex_size_mask; /**< == int(size - 1) */ + + vector float (*sample_texture)(vector float texcoord); + } ALIGN16_ATTRIB; extern struct spu_global spu; +extern boolean Debug; @@ -84,10 +130,30 @@ extern struct spu_global spu; #define TAG_INDEX_BUFFER 16 #define TAG_BATCH_BUFFER 17 #define TAG_MISC 18 +#define TAG_TEXTURE_TILE 19 +#define TAG_INSTRUCTION_FETCH 20 + + + +static INLINE void +wait_on_mask(unsigned tagMask) +{ + mfc_write_tag_mask( tagMask ); + /* wait for completion of _any_ DMAs specified by tagMask */ + mfc_read_tag_status_any(); +} + + +static INLINE void +wait_on_mask_all(unsigned tagMask) +{ + mfc_write_tag_mask( tagMask ); + /* wait for completion of _all_ DMAs specified by tagMask */ + mfc_read_tag_status_all(); +} + + -extern void -wait_on_mask(unsigned tag); static INLINE void diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c new file mode 100644 index 0000000000..932fb500b3 --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_render.c @@ -0,0 +1,301 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include <stdio.h> +#include <libmisc.h> +#include <spu_mfcio.h> + +#include "spu_main.h" +#include "spu_render.h" +#include "spu_tri.h" +#include "spu_tile.h" +#include "pipe/cell/common.h" + + + +/** + * Given a rendering command's bounding box (in pixels) compute the + * location of the corresponding screen tile bounding box. 
+ */ +static INLINE void +tile_bounding_box(const struct cell_command_render *render, + uint *txmin, uint *tymin, + uint *box_num_tiles, uint *box_width_tiles) +{ +#if 0 + /* Debug: full-window bounding box */ + uint txmax = spu.fb.width_tiles - 1; + uint tymax = spu.fb.height_tiles - 1; + *txmin = 0; + *tymin = 0; + *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + *box_width_tiles = spu.fb.width_tiles; + (void) render; + (void) txmax; + (void) tymax; +#else + uint txmax, tymax, box_height_tiles; + + *txmin = (uint) render->xmin / TILE_SIZE; + *tymin = (uint) render->ymin / TILE_SIZE; + txmax = (uint) render->xmax / TILE_SIZE; + tymax = (uint) render->ymax / TILE_SIZE; + if (txmax >= spu.fb.width_tiles) + txmax = spu.fb.width_tiles-1; + if (tymax >= spu.fb.height_tiles) + tymax = spu.fb.height_tiles-1; + *box_width_tiles = txmax - *txmin + 1; + box_height_tiles = tymax - *tymin + 1; + *box_num_tiles = *box_width_tiles * box_height_tiles; +#endif +#if 0 + printf("SPU %u: bounds: %g, %g ... %g, %g\n", spu.init.id, + render->xmin, render->ymin, render->xmax, render->ymax); + printf("SPU %u: tiles: %u, %u .. 
%u, %u\n", + spu.init.id, *txmin, *tymin, txmax, tymax); + ASSERT(render->xmin <= render->xmax); + ASSERT(render->ymin <= render->ymax); +#endif +} + + +/** Check if the tile at (tx,ty) belongs to this SPU */ +static INLINE boolean +my_tile(uint tx, uint ty) +{ + return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id; +} + + +/** + * Start fetching non-clear color/Z tiles from main memory + */ +static INLINE void +get_cz_tiles(uint tx, uint ty) +{ + if (spu.depth_stencil.depth.enabled) { + if (spu.cur_ztile_status != TILE_STATUS_CLEAR) { + //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty); + get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1); + spu.cur_ztile_status = TILE_STATUS_GETTING; + } + } + + if (spu.cur_ctile_status != TILE_STATUS_CLEAR) { + //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty); + get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0); + spu.cur_ctile_status = TILE_STATUS_GETTING; + } +} + + +/** + * Start putting dirty color/Z tiles back to main memory + */ +static INLINE void +put_cz_tiles(uint tx, uint ty) +{ + if (spu.cur_ztile_status == TILE_STATUS_DIRTY) { + /* tile was modified and needs to be written back */ + //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty); + put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1); + spu.cur_ztile_status = TILE_STATUS_DEFINED; + } + else if (spu.cur_ztile_status == TILE_STATUS_GETTING) { + /* tile was never used */ + spu.cur_ztile_status = TILE_STATUS_DEFINED; + //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty); + } + + if (spu.cur_ctile_status == TILE_STATUS_DIRTY) { + /* tile was modified and needs to be written back */ + //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty); + put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0); + spu.cur_ctile_status = TILE_STATUS_DEFINED; + } + else if (spu.cur_ctile_status == TILE_STATUS_GETTING) { + /* tile was never used */ + spu.cur_ctile_status = TILE_STATUS_DEFINED; + 
//printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty); + } +} + + +/** + * Wait for 'put' of color/z tiles to complete. + */ +static INLINE void +wait_put_cz_tiles(void) +{ + wait_on_mask(1 << TAG_WRITE_TILE_COLOR); + if (spu.depth_stencil.depth.enabled) { + wait_on_mask(1 << TAG_WRITE_TILE_Z); + } +} + + +/** + * Render primitives + * \param pos_incr returns value indicating how may words to skip after + * this command in the batch buffer + */ +void +cmd_render(const struct cell_command_render *render, uint *pos_incr) +{ + /* we'll DMA into these buffers */ + ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB; + const uint vertex_size = render->vertex_size; /* in bytes */ + /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size; + uint index_bytes; + const ubyte *vertices; + const ushort *indexes; + uint i, j; + + + if (Debug) { + printf("SPU %u: RENDER prim %u, num_vert=%u num_ind=%u " + "inline_vert=%u\n", + spu.init.id, + render->prim_type, + render->num_verts, + render->num_indexes, + render->inline_verts); + + /* + printf(" bound: %g, %g .. 
%g, %g\n", + render->xmin, render->ymin, render->xmax, render->ymax); + */ + } + + ASSERT(sizeof(*render) % 4 == 0); + ASSERT(total_vertex_bytes % 16 == 0); + ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES); + ASSERT(render->num_indexes % 3 == 0); + + + /* indexes are right after the render command in the batch buffer */ + indexes = (const ushort *) (render + 1); + index_bytes = ROUNDUP8(render->num_indexes * 2); + *pos_incr = index_bytes / 8 + sizeof(*render) / 8; + + + if (render->inline_verts) { + /* Vertices are after indexes in batch buffer at next 16-byte addr */ + vertices = (const ubyte *) render + (*pos_incr * 8); + vertices = (const ubyte *) align_pointer((void *) vertices, 16); + ASSERT_ALIGN16(vertices); + *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8; + } + else { + /* Begin DMA fetch of vertex buffer */ + ubyte *src = spu.init.buffers[render->vertex_buf]; + ubyte *dest = vertex_data; + + /* skip vertex data we won't use */ +#if 01 + src += render->min_index * vertex_size; + dest += render->min_index * vertex_size; + total_vertex_bytes -= render->min_index * vertex_size; +#endif + ASSERT(total_vertex_bytes % 16 == 0); + ASSERT_ALIGN16(dest); + ASSERT_ALIGN16(src); + + mfc_get(dest, /* in vertex_data[] array */ + (unsigned int) src, /* src in main memory */ + total_vertex_bytes, /* size */ + TAG_VERTEX_BUFFER, + 0, /* tid */ + 0 /* rid */); + + vertices = vertex_data; + + wait_on_mask(1 << TAG_VERTEX_BUFFER); + } + + + /** + ** find tiles which intersect the prim bounding box + **/ + uint txmin, tymin, box_width_tiles, box_num_tiles; + tile_bounding_box(render, &txmin, &tymin, + &box_num_tiles, &box_width_tiles); + + + /* make sure any pending clears have completed */ + wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */ + + + /** + ** loop over tiles, rendering tris + **/ + for (i = 0; i < box_num_tiles; i++) { + const uint tx = txmin + i % box_width_tiles; + const uint ty = tymin + i / box_width_tiles; + + ASSERT(tx < 
spu.fb.width_tiles); + ASSERT(ty < spu.fb.height_tiles); + + if (!my_tile(tx, ty)) + continue; + + spu.cur_ctile_status = spu.ctile_status[ty][tx]; + spu.cur_ztile_status = spu.ztile_status[ty][tx]; + + get_cz_tiles(tx, ty); + + uint drawn = 0; + + /* loop over tris */ + for (j = 0; j < render->num_indexes; j += 3) { + const float *v0, *v1, *v2; + + v0 = (const float *) (vertices + indexes[j+0] * vertex_size); + v1 = (const float *) (vertices + indexes[j+1] * vertex_size); + v2 = (const float *) (vertices + indexes[j+2] * vertex_size); + + drawn += tri_draw(v0, v1, v2, tx, ty); + } + + //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3); + + /* write color/z tiles back to main framebuffer, if dirtied */ + put_cz_tiles(tx, ty); + + wait_put_cz_tiles(); /* XXX seems unnecessary... */ + + spu.ctile_status[ty][tx] = spu.cur_ctile_status; + spu.ztile_status[ty][tx] = spu.cur_ztile_status; + } + + if (Debug) + printf("SPU %u: RENDER done\n", + spu.init.id); +} + + diff --git a/src/mesa/pipe/cell/spu/spu_render.h b/src/mesa/pipe/cell/spu/spu_render.h new file mode 100644 index 0000000000..fbcdc5ec31 --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_render.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef SPU_RENDER_H +#define SPU_RENDER_H + +#include "pipe/cell/common.h" + +extern void +cmd_render(const struct cell_command_render *render, uint *pos_incr); + +#endif /* SPU_RENDER_H */ + diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c new file mode 100644 index 0000000000..3962aaa4a9 --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_texture.c @@ -0,0 +1,217 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_compiler.h" +#include "spu_main.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_colorpack.h" + + +/** + * Number of texture tiles to cache. + * Note that this will probably be the largest consumer of SPU local store/ + * memory for this driver! + */ +#define CACHE_SIZE 16 + +static tile_t tex_tiles[CACHE_SIZE] ALIGN16_ATTRIB; + +static vector unsigned int tex_tile_xy[CACHE_SIZE]; + + + +/** + * Mark all tex cache entries as invalid. + */ +void +invalidate_tex_cache(void) +{ + /* XXX memset? 
*/ + uint i; + for (i = 0; i < CACHE_SIZE; i++) { + tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U }); + } +} + + +/** + * Return the cache pos/index which corresponds to tile (tx,ty) + */ +static INLINE uint +cache_pos(vector unsigned int txty) +{ + uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE; + return pos; +} + + +/** + * Make sure the tile for texel (i,j) is present, return its position/index + * in the cache. + */ +static uint +get_tex_tile(vector unsigned int ij) +{ + /* tile address: tx,ty */ + const vector unsigned int txty = spu_rlmask(ij, -5); /* divide by 32 */ + const uint pos = cache_pos(txty); + + if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) || + (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) { + + /* texture cache miss, fetch tile from main memory */ + const uint tiles_per_row = spu.texture.width / TILE_SIZE; + const uint bytes_per_tile = sizeof(tile_t); + const void *src = (const ubyte *) spu.texture.start + + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile; + + printf("SPU %u: tex cache miss at %d, %d pos=%u old=%d,%d\n", + spu.init.id, + spu_extract(txty,0), + spu_extract(txty,1), + pos, + spu_extract(tex_tile_xy[pos],0), + spu_extract(tex_tile_xy[pos],1)); + + ASSERT_ALIGN16(tex_tiles[pos].ui); + ASSERT_ALIGN16(src); + + mfc_get(tex_tiles[pos].ui, /* dest */ + (unsigned int) src, + bytes_per_tile, /* size */ + TAG_TEXTURE_TILE, + 0, /* tid */ + 0 /* rid */); + + wait_on_mask(1 << TAG_TEXTURE_TILE); + + tex_tile_xy[pos] = txty; + } + else { +#if 0 + printf("SPU %u: tex cache HIT at %d, %d\n", + spu.init.id, tx, ty); +#endif + } + + return pos; +} + + +/** + * Get texture sample at texcoord. + * XXX this is extremely primitive for now. 
+ */ +vector float +sample_texture_nearest(vector float texcoord) +{ + vector float tc = spu_mul(texcoord, spu.tex_size); + vector unsigned int itc = spu_convtu(tc, 0); /* convert to int */ + itc = spu_and(itc, spu.tex_size_mask); /* mask (GL_REPEAT) */ + vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */ + uint pos = get_tex_tile(itc); + uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)]; + return spu_unpack_A8R8G8B8(texel); +} + + +vector float +sample_texture_bilinear(vector float texcoord) +{ + static const vector unsigned int offset10 = {1, 0, 0, 0}; + static const vector unsigned int offset01 = {0, 1, 0, 0}; + + vector float tc = spu_mul(texcoord, spu.tex_size); + tc = spu_add(tc, spu_splats(-0.5f)); /* half texel bias */ + + /* integer texcoords S,T: */ + vector unsigned int itc00 = spu_convtu(tc, 0); /* convert to int */ + vector unsigned int itc01 = spu_add(itc00, offset01); + vector unsigned int itc10 = spu_add(itc00, offset10); + vector unsigned int itc11 = spu_add(itc10, offset01); + + /* mask (GL_REPEAT) */ + itc00 = spu_and(itc00, spu.tex_size_mask); + itc01 = spu_and(itc01, spu.tex_size_mask); + itc10 = spu_and(itc10, spu.tex_size_mask); + itc11 = spu_and(itc11, spu.tex_size_mask); + + /* intra tile addr */ + vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1); + vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1); + vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1); + vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1); + + /* get tile cache positions */ + uint pos00 = get_tex_tile(itc00); + uint pos01, pos10, pos11; + if ((spu_extract(ij00, 0) < TILE_SIZE-1) && + (spu_extract(ij00, 1) < TILE_SIZE-1)) { + /* all texels are in the same tile */ + pos01 = pos10 = pos11 = pos00; + } + else { + pos01 = get_tex_tile(itc01); + pos10 = get_tex_tile(itc10); + pos11 = get_tex_tile(itc11); + } + + /* get texels from tiles and convert to float[4] */ + vector float texel00 = 
spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]); + vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]); + vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]); + vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]); + + /* Compute weighting factors in [0,1] + * Multiply texcoord by 1024, AND with 1023, convert back to float. + */ + vector float tc1024 = spu_mul(tc, spu_splats(1024.0f)); + vector signed int itc1024 = spu_convts(tc1024, 0); + itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1)); + vector float weight = spu_convtf(itc1024, 10); + + /* smeared frac and 1-frac */ + vector float sfrac = spu_splats(spu_extract(weight, 0)); + vector float tfrac = spu_splats(spu_extract(weight, 1)); + vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac); + vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac); + + /* multiply the samples (colors) by the S/T weights */ + texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1); + texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1); + texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac ); + texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac ); + + /* compute sum of weighted samples */ + vector float texel_sum = spu_add(texel00, texel01); + texel_sum = spu_add(texel_sum, texel10); + texel_sum = spu_add(texel_sum, texel11); + + return texel_sum; +} diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h new file mode 100644 index 0000000000..95eb87080f --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_texture.h @@ -0,0 +1,47 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef SPU_TEXTURE_H +#define SPU_TEXTURE_H + + +#include "pipe/p_compiler.h" + + +extern void +invalidate_tex_cache(void); + + +extern vector float +sample_texture_nearest(vector float texcoord); + + +extern vector float +sample_texture_bilinear(vector float texcoord); + + +#endif /* SPU_TEXTURE_H */ diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c index ca1352f9f8..12dc246328 100644 --- a/src/mesa/pipe/cell/spu/spu_tile.c +++ b/src/mesa/pipe/cell/spu/spu_tile.c @@ -28,15 +28,7 @@ #include "spu_tile.h" - - - -tile_t ctile ALIGN16_ATTRIB; -tile_t ztile ALIGN16_ATTRIB; - -ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; -ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; - +#include "spu_main.h" void @@ -55,7 +47,7 @@ get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf) printf("get_tile: dest: %p src: 0x%x size: %d\n", tile, (unsigned int) src, bytesPerTile); */ - mfc_get(tile->t32, /* dest in local memory */ + mfc_get(tile->ui, /* dest in local memory */ (unsigned int) src, /* src in main memory */ bytesPerTile, tag, @@ -81,7 +73,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf) spu.init.id, tile, (unsigned int) dst, bytesPerTile); */ - mfc_put((void *) tile->t32, /* src in local memory */ + mfc_put((void *) tile->ui, /* src in local memory */ (unsigned int) dst, /* dst in main memory */ bytesPerTile, tag, diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h index f83dc009c2..e53340a55a 100644 --- a/src/mesa/pipe/cell/spu/spu_tile.h +++ b/src/mesa/pipe/cell/spu/spu_tile.h @@ -35,27 +35,6 @@ #include "pipe/cell/common.h" -#define MAX_WIDTH 1024 -#define MAX_HEIGHT 1024 - - -typedef union { - ushort t16[TILE_SIZE][TILE_SIZE]; - uint t32[TILE_SIZE][TILE_SIZE]; -} tile_t; - - -extern tile_t ctile ALIGN16_ATTRIB; -extern tile_t ztile 
ALIGN16_ATTRIB; - - -#define TILE_STATUS_CLEAR 1 -#define TILE_STATUS_DEFINED 2 /**< defined pixel data */ -#define TILE_STATUS_DIRTY 3 /**< modified, but not put back yet */ - -extern ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; -extern ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; - void get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf); @@ -68,7 +47,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf); static INLINE void clear_c_tile(tile_t *ctile) { - memset32((uint*) ctile->t32, + memset32((uint*) ctile->ui, spu.fb.color_clear_value, TILE_SIZE * TILE_SIZE); } @@ -78,12 +57,13 @@ static INLINE void clear_z_tile(tile_t *ztile) { if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) { - memset16((ushort*) ztile->t16, + memset16((ushort*) ztile->us, spu.fb.depth_clear_value, TILE_SIZE * TILE_SIZE); } else { - memset32((uint*) ztile->t32, + ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM); + memset32((uint*) ztile->ui, spu.fb.depth_clear_value, TILE_SIZE * TILE_SIZE); } diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c index 3d0d106c10..be9624cf7d 100644 --- a/src/mesa/pipe/cell/spu/spu_tri.c +++ b/src/mesa/pipe/cell/spu/spu_tri.c @@ -32,22 +32,33 @@ #include "pipe/p_compiler.h" #include "pipe/p_format.h" #include "pipe/p_util.h" +#include "spu_blend.h" +#include "spu_colorpack.h" #include "spu_main.h" +#include "spu_texture.h" #include "spu_tile.h" #include "spu_tri.h" +#include "spu_ztest.h" + + +/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ +typedef vector unsigned int mask_t; + +typedef union +{ + vector float v; + float f[4]; +} float4; /** * Simplified types taken from other parts of Gallium */ struct vertex_header { - float data[0][4]; + vector float data[1]; }; -struct prim_header { - struct vertex_header *v[3]; -}; /* XXX fix this */ @@ -82,9 +93,9 @@ struct edge { struct interp_coef { - float a0[4]; - float 
dadx[4]; - float dady[4]; + float4 a0; + float4 dadx; + float4 dady; }; @@ -133,6 +144,12 @@ struct setup_stage { }; + +static struct setup_stage setup; + + + + #if 0 /** * Basically a cast wrapper. @@ -145,33 +162,33 @@ static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) #if 0 /** - * Clip setup->quad against the scissor/surface bounds. + * Clip setup.quad against the scissor/surface bounds. */ static INLINE void quad_clip(struct setup_stage *setup) { - const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect; + const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect; const int minx = (int) cliprect->minx; const int maxx = (int) cliprect->maxx; const int miny = (int) cliprect->miny; const int maxy = (int) cliprect->maxy; - if (setup->quad.x0 >= maxx || - setup->quad.y0 >= maxy || - setup->quad.x0 + 1 < minx || - setup->quad.y0 + 1 < miny) { + if (setup.quad.x0 >= maxx || + setup.quad.y0 >= maxy || + setup.quad.x0 + 1 < minx || + setup.quad.y0 + 1 < miny) { /* totally clipped */ - setup->quad.mask = 0x0; + setup.quad.mask = 0x0; return; } - if (setup->quad.x0 < minx) - setup->quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT); - if (setup->quad.y0 < miny) - setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT); - if (setup->quad.x0 == maxx - 1) - setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT); - if (setup->quad.y0 == maxy - 1) - setup->quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT); + if (setup.quad.x0 < minx) + setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT); + if (setup.quad.y0 < miny) + setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT); + if (setup.quad.x0 == maxx - 1) + setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT); + if (setup.quad.y0 == maxy - 1) + setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT); } #endif @@ -183,9 +200,9 @@ static INLINE void clip_emit_quad(struct setup_stage *setup) { quad_clip(setup); - if (setup->quad.mask) { - struct softpipe_context *sp = 
setup->softpipe; - sp->quad.first->run(sp->quad.first, &setup->quad); + if (setup.quad.mask) { + struct softpipe_context *sp = setup.softpipe; + sp->quad.first->run(sp->quad.first, &setup.quad); } } #endif @@ -196,200 +213,145 @@ clip_emit_quad(struct setup_stage *setup) * Eg: four colors will be compute. */ static INLINE void -eval_coeff( struct setup_stage *setup, uint slot, - float x, float y, float result[4][4]) +eval_coeff(uint slot, float x, float y, vector float result[4]) { - uint i; - const float *dadx = setup->coef[slot].dadx; - const float *dady = setup->coef[slot].dady; - - /* loop over XYZW comps */ - for (i = 0; i < 4; i++) { - result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i]; - result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i]; - result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i]; - result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i]; + switch (spu.vertex_info.interp_mode[slot]) { + case INTERP_CONSTANT: + result[QUAD_TOP_LEFT] = + result[QUAD_TOP_RIGHT] = + result[QUAD_BOTTOM_LEFT] = + result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v; + break; + + case INTERP_LINEAR: + /* fall-through, for now */ + default: + { + register vector float dadx = setup.coef[slot].dadx.v; + register vector float dady = setup.coef[slot].dady.v; + register vector float topLeft + = spu_add(setup.coef[slot].a0.v, + spu_add(spu_mul(spu_splats(x), dadx), + spu_mul(spu_splats(y), dady))); + + result[QUAD_TOP_LEFT] = topLeft; + result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx); + result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady); + result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady); + } } } -static INLINE void -eval_z( struct setup_stage *setup, - float x, float y, float result[4]) +static INLINE vector float +eval_z(float x, float y) { const uint slot = 0; - const uint i = 2; - const float *dadx = setup->coef[slot].dadx; - const float *dady = setup->coef[slot].dady; - - result[QUAD_TOP_LEFT] = setup->coef[slot].a0[i] + x 
* dadx[i] + y * dady[i]; - result[QUAD_TOP_RIGHT] = result[0] + dadx[i]; - result[QUAD_BOTTOM_LEFT] = result[0] + dady[i]; - result[QUAD_BOTTOM_RIGHT] = result[0] + dadx[i] + dady[i]; + const float dzdx = setup.coef[slot].dadx.f[2]; + const float dzdy = setup.coef[slot].dady.f[2]; + const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy; + const vector float topLeftv = spu_splats(topLeft); + const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy }; + return spu_add(topLeftv, derivs); } -static INLINE uint -pack_color(const float color[4]) +static INLINE mask_t +do_depth_test(int x, int y, mask_t quadmask) { - uint r = (uint) (color[0] * 255.0); - uint g = (uint) (color[1] * 255.0); - uint b = (uint) (color[2] * 255.0); - uint a = (uint) (color[3] * 255.0); - r = MIN2(r, 255); - g = MIN2(g, 255); - b = MIN2(b, 255); - a = MIN2(a, 255); - switch (spu.fb.color_format) { - case PIPE_FORMAT_A8R8G8B8_UNORM: - return (a << 24) | (r << 16) | (g << 8) | b; - case PIPE_FORMAT_B8G8R8A8_UNORM: - return (b << 24) | (g << 16) | (r << 8) | a; - default: - ASSERT(0); - return 0; - } -} - - -static uint -do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask) -{ - int ix = x - setup->cliprect_minx; - int iy = y - setup->cliprect_miny; - float zvals[4]; - - eval_z(setup, (float) x, (float) y, zvals); - - if (tile_status_z[setup->ty][setup->tx] == TILE_STATUS_CLEAR) { - /* now, _really_ clear the tile */ - clear_z_tile(&ztile); - } - else { - /* make sure we've got the tile from main mem */ - wait_on_mask(1 << TAG_READ_TILE_Z); - } - tile_status_z[setup->ty][setup->tx] = TILE_STATUS_DIRTY; + float4 zvals; + mask_t mask; + zvals.v = eval_z((float) x, (float) y); if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) { - const float zscale = 65535.0; - if (mask & MASK_TOP_LEFT) { - uint z = (uint) (zvals[0] * zscale); - if (z < ztile.t16[iy][ix]) - ztile.t16[iy][ix] = z; - else - mask &= ~MASK_TOP_LEFT; - } - - if (mask & MASK_TOP_RIGHT) { - 
uint z = (uint) (zvals[1] * zscale); - if (z < ztile.t16[iy][ix+1]) - ztile.t16[iy][ix+1] = z; - else - mask &= ~MASK_TOP_RIGHT; - } - - if (mask & MASK_BOTTOM_LEFT) { - uint z = (uint) (zvals[2] * zscale); - if (z < ztile.t16[iy+1][ix]) - ztile.t16[iy+1][ix] = z; - else - mask &= ~MASK_BOTTOM_LEFT; - } - - if (mask & MASK_BOTTOM_RIGHT) { - uint z = (uint) (zvals[3] * zscale); - if (z < ztile.t16[iy+1][ix+1]) - ztile.t16[iy+1][ix+1] = z; - else - mask &= ~MASK_BOTTOM_RIGHT; - } + int ix = (x - setup.cliprect_minx) / 4; + int iy = (y - setup.cliprect_miny) / 2; + mask = spu_z16_test_less(zvals.v, &spu.ztile.us8[iy][ix], x>>1, quadmask); } else { - const float zscale = (float) 0xffffffff; - ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM); - if (mask & MASK_TOP_LEFT) { - uint z = (uint) (zvals[0] * zscale); - if (z < ztile.t32[iy][ix]) - ztile.t32[iy][ix] = z; - else - mask &= ~MASK_TOP_LEFT; - } - - if (mask & MASK_TOP_RIGHT) { - uint z = (uint) (zvals[1] * zscale); - if (z < ztile.t32[iy][ix+1]) - ztile.t32[iy][ix+1] = z; - else - mask &= ~MASK_TOP_RIGHT; - } - - if (mask & MASK_BOTTOM_LEFT) { - uint z = (uint) (zvals[2] * zscale); - if (z < ztile.t32[iy+1][ix]) - ztile.t32[iy+1][ix] = z; - else - mask &= ~MASK_BOTTOM_LEFT; - } - - if (mask & MASK_BOTTOM_RIGHT) { - uint z = (uint) (zvals[3] * zscale); - if (z < ztile.t32[iy+1][ix+1]) - ztile.t32[iy+1][ix+1] = z; - else - mask &= ~MASK_BOTTOM_RIGHT; - } + int ix = (x - setup.cliprect_minx) / 2; + int iy = (y - setup.cliprect_miny) / 2; + mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask); } + if (spu_extract(spu_orx(mask), 0)) + spu.cur_ztile_status = TILE_STATUS_DIRTY; + return mask; } /** * Emit a quad (pass to next stage). No clipping is done. + * Note: about 1/5 to 1/7 of the time, mask is zero and this function + * should be skipped. But adding the test for that slows things down + * overall. 
*/ static INLINE void -emit_quad( struct setup_stage *setup, int x, int y, unsigned mask ) +emit_quad( int x, int y, mask_t mask ) { #if 0 - struct softpipe_context *sp = setup->softpipe; - setup->quad.x0 = x; - setup->quad.y0 = y; - setup->quad.mask = mask; - sp->quad.first->run(sp->quad.first, &setup->quad); + struct softpipe_context *sp = setup.softpipe; + setup.quad.x0 = x; + setup.quad.y0 = y; + setup.quad.mask = mask; + sp->quad.first->run(sp->quad.first, &setup.quad); #else - /* Cell: "write" quad fragments to the tile by setting prim color */ - const int ix = x - setup->cliprect_minx; - const int iy = y - setup->cliprect_miny; - float colors[4][4]; - - eval_coeff(setup, 1, (float) x, (float) y, colors); if (spu.depth_stencil.depth.enabled) { - mask &= do_depth_test(setup, x, y, mask); + mask = do_depth_test(x, y, mask); } - if (mask) { - if (tile_status[setup->ty][setup->tx] == TILE_STATUS_CLEAR) { - /* now, _really_ clear the tile */ - clear_c_tile(&ctile); + /* If any bits in mask are set... 
*/ + if (spu_extract(spu_orx(mask), 0)) { + const int ix = x - setup.cliprect_minx; + const int iy = y - setup.cliprect_miny; + const vector unsigned char shuffle = spu.color_shuffle; + vector float colors[4]; + + spu.cur_ctile_status = TILE_STATUS_DIRTY; + + if (spu.texture.start) { + /* texture mapping */ + vector float texcoords[4]; + eval_coeff(2, (float) x, (float) y, texcoords); + + if (spu_extract(mask, 0)) + colors[0] = spu.sample_texture(texcoords[0]); + if (spu_extract(mask, 1)) + colors[1] = spu.sample_texture(texcoords[1]); + if (spu_extract(mask, 2)) + colors[2] = spu.sample_texture(texcoords[2]); + if (spu_extract(mask, 3)) + colors[3] = spu.sample_texture(texcoords[3]); } else { - /* make sure we've got the tile from main mem */ - wait_on_mask(1 << TAG_READ_TILE_COLOR); + /* simple shading */ + eval_coeff(1, (float) x, (float) y, colors); } - tile_status[setup->ty][setup->tx] = TILE_STATUS_DIRTY; - - if (mask & MASK_TOP_LEFT) - ctile.t32[iy][ix] = pack_color(colors[QUAD_TOP_LEFT]); - if (mask & MASK_TOP_RIGHT) - ctile.t32[iy][ix+1] = pack_color(colors[QUAD_TOP_RIGHT]); - if (mask & MASK_BOTTOM_LEFT) - ctile.t32[iy+1][ix] = pack_color(colors[QUAD_BOTTOM_LEFT]); - if (mask & MASK_BOTTOM_RIGHT) - ctile.t32[iy+1][ix+1] = pack_color(colors[QUAD_BOTTOM_RIGHT]); + +#if 1 + if (spu.blend.blend_enable) + blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors); +#endif + + if (spu_extract(mask, 0)) + spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle); + if (spu_extract(mask, 1)) + spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle); + if (spu_extract(mask, 2)) + spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle); + if (spu_extract(mask, 3)) + spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle); + +#if 0 + /* SIMD_Z with swizzled color buffer (someday) */ + vector unsigned int uicolors = *((vector unsigned int *) &colors); + spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, 
mask); +#endif } + #endif } @@ -407,26 +369,19 @@ static INLINE int block( int x ) /** * Compute mask which indicates which pixels in the 2x2 quad are actually inside * the triangle's bounds. - * - * this is pretty nasty... may need to rework flush_spans again to - * fix it, if possible. + * The mask is a uint4 vector and each element will be 0 or 0xffffffff. */ -static unsigned calculate_mask( struct setup_stage *setup, int x ) +static INLINE mask_t calculate_mask( int x ) { - unsigned mask = 0x0; - - if (x >= setup->span.left[0] && x < setup->span.right[0]) - mask |= MASK_TOP_LEFT; - - if (x >= setup->span.left[1] && x < setup->span.right[1]) - mask |= MASK_BOTTOM_LEFT; - - if (x+1 >= setup->span.left[0] && x+1 < setup->span.right[0]) - mask |= MASK_TOP_RIGHT; - - if (x+1 >= setup->span.left[1] && x+1 < setup->span.right[1]) - mask |= MASK_BOTTOM_RIGHT; - + /* This is a little tricky. + * Use & instead of && to avoid branches. + * Use negation to convert true/false to ~0/0 values. + */ + mask_t mask; + mask = spu_insert(-((x >= setup.span.left[0]) & (x < setup.span.right[0])), mask, 0); + mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1); + mask = spu_insert(-((x >= setup.span.left[1]) & (x < setup.span.right[1])), mask, 2); + mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3); return mask; } @@ -434,144 +389,175 @@ static unsigned calculate_mask( struct setup_stage *setup, int x ) /** * Render a horizontal span of quads */ -static void flush_spans( struct setup_stage *setup ) +static void flush_spans( void ) { int minleft, maxright; int x; - switch (setup->span.y_flags) { + switch (setup.span.y_flags) { case 0x3: /* both odd and even lines written (both quad rows) */ - minleft = MIN2(setup->span.left[0], setup->span.left[1]); - maxright = MAX2(setup->span.right[0], setup->span.right[1]); + minleft = MIN2(setup.span.left[0], setup.span.left[1]); + maxright = MAX2(setup.span.right[0], 
setup.span.right[1]); break; case 0x1: /* only even line written (quad top row) */ - minleft = setup->span.left[0]; - maxright = setup->span.right[0]; + minleft = setup.span.left[0]; + maxright = setup.span.right[0]; break; case 0x2: /* only odd line written (quad bottom row) */ - minleft = setup->span.left[1]; - maxright = setup->span.right[1]; + minleft = setup.span.left[1]; + maxright = setup.span.right[1]; break; default: return; } + + /* OK, we're very likely to need the tile data now. + * clear or finish waiting if needed. + */ + if (spu.cur_ctile_status == TILE_STATUS_GETTING) { + /* wait for mfc_get() to complete */ + //printf("SPU: %u: waiting for ctile\n", spu.init.id); + wait_on_mask(1 << TAG_READ_TILE_COLOR); + spu.cur_ctile_status = TILE_STATUS_CLEAN; + } + else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) { + //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty); + clear_c_tile(&spu.ctile); + spu.cur_ctile_status = TILE_STATUS_DIRTY; + } + ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); + + if (spu.depth_stencil.depth.enabled) { + if (spu.cur_ztile_status == TILE_STATUS_GETTING) { + /* wait for mfc_get() to complete */ + //printf("SPU: %u: waiting for ztile\n", spu.init.id); + wait_on_mask(1 << TAG_READ_TILE_Z); + spu.cur_ztile_status = TILE_STATUS_CLEAN; + } + else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) { + //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty); + clear_z_tile(&spu.ztile); + spu.cur_ztile_status = TILE_STATUS_DIRTY; + } + ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED); + } + /* XXX this loop could be moved into the above switch cases and * calculate_mask() could be simplified a bit... 
*/ for (x = block(minleft); x <= block(maxright); x += 2) { - emit_quad( setup, x, setup->span.y, - calculate_mask( setup, x ) ); +#if 1 + emit_quad( x, setup.span.y, calculate_mask( x ) ); +#endif } - setup->span.y = 0; - setup->span.y_flags = 0; - setup->span.right[0] = 0; - setup->span.right[1] = 0; + setup.span.y = 0; + setup.span.y_flags = 0; + setup.span.right[0] = 0; + setup.span.right[1] = 0; } #if DEBUG_VERTS -static void print_vertex(const struct setup_stage *setup, - const struct vertex_header *v) +static void print_vertex(const struct vertex_header *v) { int i; fprintf(stderr, "Vertex: (%p)\n", v); - for (i = 0; i < setup->quad.nr_attrs; i++) { + for (i = 0; i < setup.quad.nr_attrs; i++) { fprintf(stderr, " %d: %f %f %f %f\n", i, v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]); } } #endif -static boolean setup_sort_vertices( struct setup_stage *setup, - const struct prim_header *prim ) + +static boolean setup_sort_vertices(const struct vertex_header *v0, + const struct vertex_header *v1, + const struct vertex_header *v2) { - const struct vertex_header *v0 = prim->v[0]; - const struct vertex_header *v1 = prim->v[1]; - const struct vertex_header *v2 = prim->v[2]; #if DEBUG_VERTS fprintf(stderr, "Triangle:\n"); - print_vertex(setup, v0); - print_vertex(setup, v1); - print_vertex(setup, v2); + print_vertex(v0); + print_vertex(v1); + print_vertex(v2); #endif - setup->vprovoke = v2; + setup.vprovoke = v2; /* determine bottom to top order of vertices */ { - float y0 = v0->data[0][1]; - float y1 = v1->data[0][1]; - float y2 = v2->data[0][1]; + float y0 = spu_extract(v0->data[0], 1); + float y1 = spu_extract(v1->data[0], 1); + float y2 = spu_extract(v2->data[0], 1); if (y0 <= y1) { if (y1 <= y2) { /* y0<=y1<=y2 */ - setup->vmin = v0; - setup->vmid = v1; - setup->vmax = v2; + setup.vmin = v0; + setup.vmid = v1; + setup.vmax = v2; } else if (y2 <= y0) { /* y2<=y0<=y1 */ - setup->vmin = v2; - setup->vmid = v0; - setup->vmax = v1; + setup.vmin = v2; + 
setup.vmid = v0; + setup.vmax = v1; } else { /* y0<=y2<=y1 */ - setup->vmin = v0; - setup->vmid = v2; - setup->vmax = v1; + setup.vmin = v0; + setup.vmid = v2; + setup.vmax = v1; } } else { if (y0 <= y2) { /* y1<=y0<=y2 */ - setup->vmin = v1; - setup->vmid = v0; - setup->vmax = v2; + setup.vmin = v1; + setup.vmid = v0; + setup.vmax = v2; } else if (y2 <= y1) { /* y2<=y1<=y0 */ - setup->vmin = v2; - setup->vmid = v1; - setup->vmax = v0; + setup.vmin = v2; + setup.vmid = v1; + setup.vmax = v0; } else { /* y1<=y2<=y0 */ - setup->vmin = v1; - setup->vmid = v2; - setup->vmax = v0; + setup.vmin = v1; + setup.vmid = v2; + setup.vmax = v0; } } } /* Check if triangle is completely outside the tile bounds */ - if (setup->vmin->data[0][1] > setup->cliprect_maxy) + if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy) return FALSE; - if (setup->vmax->data[0][1] < setup->cliprect_miny) + if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny) return FALSE; - if (setup->vmin->data[0][0] < setup->cliprect_minx && - setup->vmid->data[0][0] < setup->cliprect_minx && - setup->vmax->data[0][0] < setup->cliprect_minx) + if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx && + spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx && + spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx) return FALSE; - if (setup->vmin->data[0][0] > setup->cliprect_maxx && - setup->vmid->data[0][0] > setup->cliprect_maxx && - setup->vmax->data[0][0] > setup->cliprect_maxx) + if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx && + spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx && + spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx) return FALSE; - setup->ebot.dx = setup->vmid->data[0][0] - setup->vmin->data[0][0]; - setup->ebot.dy = setup->vmid->data[0][1] - setup->vmin->data[0][1]; - setup->emaj.dx = setup->vmax->data[0][0] - setup->vmin->data[0][0]; - setup->emaj.dy = setup->vmax->data[0][1] - setup->vmin->data[0][1]; - setup->etop.dx = 
setup->vmax->data[0][0] - setup->vmid->data[0][0]; - setup->etop.dy = setup->vmax->data[0][1] - setup->vmid->data[0][1]; + setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0); + setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1); + setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0); + setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1); + setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0); + setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1); /* * Compute triangle's area. Use 1/area to compute partial @@ -584,13 +570,13 @@ static boolean setup_sort_vertices( struct setup_stage *setup, * use the prim->det value because its sign is correct. */ { - const float area = (setup->emaj.dx * setup->ebot.dy - - setup->ebot.dx * setup->emaj.dy); + const float area = (setup.emaj.dx * setup.ebot.dy - + setup.ebot.dx * setup.emaj.dy); - setup->oneoverarea = 1.0f / area; + setup.oneoverarea = 1.0f / area; /* _mesa_printf("%s one-over-area %f area %f det %f\n", - __FUNCTION__, setup->oneoverarea, area, prim->det ); + __FUNCTION__, setup.oneoverarea, area, prim->det ); */ } @@ -599,56 +585,52 @@ static boolean setup_sort_vertices( struct setup_stage *setup, * - the GLSL gl_FrontFacing fragment attribute (bool) * - two-sided stencil test */ - setup->quad.facing = (prim->det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW); + setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW); #endif return TRUE; } -#if 0 /** * Compute a0 for a constant-valued coefficient (GL_FLAT shading). - * The value value comes from vertex->data[slot][i]. - * The result will be put into setup->coef[slot].a0[i]. + * The value value comes from vertex->data[slot]. 
+ * The result will be put into setup.coef[slot].a0. * \param slot which attribute slot - * \param i which component of the slot (0..3) */ -static void const_coeff( struct setup_stage *setup, - unsigned slot, - unsigned i ) +static INLINE void +const_coeff(uint slot) { - assert(slot < PIPE_MAX_SHADER_INPUTS); - assert(i <= 3); - - setup->coef[slot].dadx[i] = 0; - setup->coef[slot].dady[i] = 0; - - /* need provoking vertex info! - */ - setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i]; + setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].a0.v = setup.vprovoke->data[slot]; } -#endif /** * Compute a0, dadx and dady for a linearly interpolated coefficient, * for a triangle. */ -static void tri_linear_coeff( struct setup_stage *setup, - uint slot, uint firstComp, uint lastComp ) +static INLINE void +tri_linear_coeff(uint slot, uint firstComp, uint lastComp) { uint i; + const float *vmin_d = (float *) &setup.vmin->data[slot]; + const float *vmid_d = (float *) &setup.vmid->data[slot]; + const float *vmax_d = (float *) &setup.vmax->data[slot]; + const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f; + const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f; + for (i = firstComp; i < lastComp; i++) { - float botda = setup->vmid->data[slot][i] - setup->vmin->data[slot][i]; - float majda = setup->vmax->data[slot][i] - setup->vmin->data[slot][i]; - float a = setup->ebot.dy * majda - botda * setup->emaj.dy; - float b = setup->emaj.dx * botda - majda * setup->ebot.dx; + float botda = vmid_d[i] - vmin_d[i]; + float majda = vmax_d[i] - vmin_d[i]; + float a = setup.ebot.dy * majda - botda * setup.emaj.dy; + float b = setup.emaj.dx * botda - majda * setup.ebot.dx; ASSERT(slot < PIPE_MAX_SHADER_INPUTS); - setup->coef[slot].dadx[i] = a * setup->oneoverarea; - setup->coef[slot].dady[i] = b * setup->oneoverarea; + setup.coef[slot].dadx.f[i] = a * setup.oneoverarea; + 
setup.coef[slot].dady.f[i] = b * setup.oneoverarea; /* calculate a0 as the value which would be sampled for the * fragment at (0,0), taking into account that we want to sample at @@ -662,21 +644,52 @@ static void tri_linear_coeff( struct setup_stage *setup, * to define a0 as the sample at a pixel center somewhere near vmin * instead - i'll switch to this later. */ - setup->coef[slot].a0[i] = (setup->vmin->data[slot][i] - - (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + - setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f))); + setup.coef[slot].a0.f[i] = (vmin_d[i] - + (setup.coef[slot].dadx.f[i] * x + + setup.coef[slot].dady.f[i] * y)); } /* _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n", slot, "xyzw"[i], - setup->coef[slot].a0[i], - setup->coef[slot].dadx[i], - setup->coef[slot].dady[i]); + setup.coef[slot].a0[i], + setup.coef[slot].dadx.f[i], + setup.coef[slot].dady.f[i]); */ } +/** + * As above, but interp setup all four vector components. + */ +static INLINE void +tri_linear_coeff4(uint slot) +{ + const vector float vmin_d = setup.vmin->data[slot]; + const vector float vmid_d = setup.vmid->data[slot]; + const vector float vmax_d = setup.vmax->data[slot]; + const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f); + const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f); + + vector float botda = vmid_d - vmin_d; + vector float majda = vmax_d - vmin_d; + + vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda), + spu_mul(botda, spu_splats(setup.emaj.dy))); + vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), + spu_mul(majda, spu_splats(setup.ebot.dx))); + + setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea)); + setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea)); + + vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx); + vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy); + + setup.coef[slot].a0.v = 
spu_sub(vmin_d, spu_add(tempx, tempy)); +} + + + #if 0 /** * Compute a0, dadx and dady for a perspective-corrected interpolant, @@ -686,46 +699,45 @@ static void tri_linear_coeff( struct setup_stage *setup, * Later, when we compute the value at a particular fragment position we'll * divide the interpolated value by the interpolated W at that fragment. */ -static void tri_persp_coeff( struct setup_stage *setup, - unsigned slot, +static void tri_persp_coeff( unsigned slot, unsigned i ) { /* premultiply by 1/w: */ - float mina = setup->vmin->data[slot][i] * setup->vmin->data[0][3]; - float mida = setup->vmid->data[slot][i] * setup->vmid->data[0][3]; - float maxa = setup->vmax->data[slot][i] * setup->vmax->data[0][3]; + float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3]; + float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3]; + float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3]; float botda = mida - mina; float majda = maxa - mina; - float a = setup->ebot.dy * majda - botda * setup->emaj.dy; - float b = setup->emaj.dx * botda - majda * setup->ebot.dx; + float a = setup.ebot.dy * majda - botda * setup.emaj.dy; + float b = setup.emaj.dx * botda - majda * setup.ebot.dx; /* printf("tri persp %d,%d: %f %f %f\n", slot, i, - setup->vmin->data[slot][i], - setup->vmid->data[slot][i], - setup->vmax->data[slot][i] + setup.vmin->data[slot][i], + setup.vmid->data[slot][i], + setup.vmax->data[slot][i] ); */ assert(slot < PIPE_MAX_SHADER_INPUTS); assert(i <= 3); - setup->coef[slot].dadx[i] = a * setup->oneoverarea; - setup->coef[slot].dady[i] = b * setup->oneoverarea; - setup->coef[slot].a0[i] = (mina - - (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + - setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f))); + setup.coef[slot].dadx.f[i] = a * setup.oneoverarea; + setup.coef[slot].dady.f[i] = b * setup.oneoverarea; + setup.coef[slot].a0.f[i] = (mina - + (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + + 
setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f))); } #endif /** - * Compute the setup->coef[] array dadx, dady, a0 values. - * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized. + * Compute the setup.coef[] array dadx, dady, a0 values. + * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized. */ -static void setup_tri_coefficients( struct setup_stage *setup ) +static void setup_tri_coefficients(void) { #if 1 uint i; @@ -735,15 +747,18 @@ static void setup_tri_coefficients( struct setup_stage *setup ) case INTERP_NONE: break; case INTERP_POS: - tri_linear_coeff(setup, i, 2, 3); /* slot 0, z */ + /*tri_linear_coeff(i, 2, 3);*/ /* XXX interp W if PERSPECTIVE... */ + tri_linear_coeff4(i); break; case INTERP_CONSTANT: - /* fall-through */ + const_coeff(i); + break; case INTERP_LINEAR: - tri_linear_coeff(setup, i, 0, 4); /* slot 1, color */ + tri_linear_coeff4(i); break; case INTERP_PERSPECTIVE: + tri_linear_coeff4(i); /* temporary */ break; default: ASSERT(0); @@ -753,35 +768,35 @@ static void setup_tri_coefficients( struct setup_stage *setup ) ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS); ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR || spu.vertex_info.interp_mode[1] == INTERP_CONSTANT); - tri_linear_coeff(setup, 0, 2, 3); /* slot 0, z */ - tri_linear_coeff(setup, 1, 0, 4); /* slot 1, color */ + tri_linear_coeff(0, 2, 3); /* slot 0, z */ + tri_linear_coeff(1, 0, 4); /* slot 1, color */ #endif } -static void setup_tri_edges( struct setup_stage *setup ) +static void setup_tri_edges(void) { - float vmin_x = setup->vmin->data[0][0] + 0.5f; - float vmid_x = setup->vmid->data[0][0] + 0.5f; - - float vmin_y = setup->vmin->data[0][1] - 0.5f; - float vmid_y = setup->vmid->data[0][1] - 0.5f; - float vmax_y = setup->vmax->data[0][1] - 0.5f; - - setup->emaj.sy = CEILF(vmin_y); - setup->emaj.lines = (int) CEILF(vmax_y - setup->emaj.sy); - setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy; - setup->emaj.sx = vmin_x + 
(setup->emaj.sy - vmin_y) * setup->emaj.dxdy; - - setup->etop.sy = CEILF(vmid_y); - setup->etop.lines = (int) CEILF(vmax_y - setup->etop.sy); - setup->etop.dxdy = setup->etop.dx / setup->etop.dy; - setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy; - - setup->ebot.sy = CEILF(vmin_y); - setup->ebot.lines = (int) CEILF(vmid_y - setup->ebot.sy); - setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy; - setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy; + float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f; + float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f; + + float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f; + float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f; + float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f; + + setup.emaj.sy = CEILF(vmin_y); + setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy); + setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy; + setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy; + + setup.etop.sy = CEILF(vmid_y); + setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy); + setup.etop.dxdy = setup.etop.dx / setup.etop.dy; + setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy; + + setup.ebot.sy = CEILF(vmin_y); + setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy); + setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy; + setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy; } @@ -789,15 +804,14 @@ static void setup_tri_edges( struct setup_stage *setup ) * Render the upper or lower half of a triangle. * Scissoring/cliprect is applied here too. 
*/ -static void subtriangle( struct setup_stage *setup, - struct edge *eleft, +static void subtriangle( struct edge *eleft, struct edge *eright, unsigned lines ) { - const int minx = setup->cliprect_minx; - const int maxx = setup->cliprect_maxx; - const int miny = setup->cliprect_miny; - const int maxy = setup->cliprect_maxy; + const int minx = setup.cliprect_minx; + const int maxx = setup.cliprect_maxx; + const int miny = setup.cliprect_miny; + const int maxy = setup.cliprect_maxy; int y, start_y, finish_y; int sy = (int)eleft->sy; @@ -839,14 +853,14 @@ static void subtriangle( struct setup_stage *setup, if (left < right) { int _y = sy + y; - if (block(_y) != setup->span.y) { - flush_spans(setup); - setup->span.y = block(_y); + if (block(_y) != setup.span.y) { + flush_spans(); + setup.span.y = block(_y); } - setup->span.left[_y&1] = left; - setup->span.right[_y&1] = right; - setup->span.y_flags |= 1<<(_y&1); + setup.span.left[_y&1] = left; + setup.span.right[_y&1] = right; + setup.span.y_flags |= 1<<(_y&1); } } @@ -861,70 +875,52 @@ static void subtriangle( struct setup_stage *setup, /** - * Do setup for triangle rasterization, then render the triangle. + * Draw triangle into tile at (tx, ty) (tile coords) + * The tile data should have already been fetched. 
*/ -static void -setup_tri(struct setup_stage *setup, struct prim_header *prim) +boolean +tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty) { - if (!setup_sort_vertices( setup, prim )) { - return; /* totally clipped */ - } + setup.tx = tx; + setup.ty = ty; - setup_tri_coefficients( setup ); - setup_tri_edges( setup ); + /* set clipping bounds to tile bounds */ + setup.cliprect_minx = tx * TILE_SIZE; + setup.cliprect_miny = ty * TILE_SIZE; + setup.cliprect_maxx = (tx + 1) * TILE_SIZE; + setup.cliprect_maxy = (ty + 1) * TILE_SIZE; -#if 0 - setup->quad.prim = PRIM_TRI; -#endif + if (!setup_sort_vertices((struct vertex_header *) v0, + (struct vertex_header *) v1, + (struct vertex_header *) v2)) { + return FALSE; /* totally clipped */ + } + + setup_tri_coefficients(); + setup_tri_edges(); - setup->span.y = 0; - setup->span.y_flags = 0; - setup->span.right[0] = 0; - setup->span.right[1] = 0; - /* setup->span.z_mode = tri_z_mode( setup->ctx ); */ + setup.span.y = 0; + setup.span.y_flags = 0; + setup.span.right[0] = 0; + setup.span.right[1] = 0; + /* setup.span.z_mode = tri_z_mode( setup.ctx ); */ /* init_constant_attribs( setup ); */ - if (setup->oneoverarea < 0.0) { + if (setup.oneoverarea < 0.0) { /* emaj on left: */ - subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines ); - subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines ); + subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines ); + subtriangle( &setup.emaj, &setup.etop, setup.etop.lines ); } else { /* emaj on right: */ - subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines ); - subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines ); + subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines ); + subtriangle( &setup.etop, &setup.emaj, setup.etop.lines ); } - flush_spans( setup ); -} - + flush_spans(); - -/** - * Draw triangle into tile at (tx, ty) (tile coords) - * The tile data should have already been fetched. 
- */ -void -tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty) -{ - struct prim_header tri; - struct setup_stage setup; - - tri.v[0] = (struct vertex_header *) v0; - tri.v[1] = (struct vertex_header *) v1; - tri.v[2] = (struct vertex_header *) v2; - - setup.tx = tx; - setup.ty = ty; - - /* set clipping bounds to tile bounds */ - setup.cliprect_minx = tx * TILE_SIZE; - setup.cliprect_miny = ty * TILE_SIZE; - setup.cliprect_maxx = (tx + 1) * TILE_SIZE; - setup.cliprect_maxy = (ty + 1) * TILE_SIZE; - - setup_tri(&setup, &tri); + return TRUE; } diff --git a/src/mesa/pipe/cell/spu/spu_tri.h b/src/mesa/pipe/cell/spu/spu_tri.h index 86c42b6339..aa694dd7c9 100644 --- a/src/mesa/pipe/cell/spu/spu_tri.h +++ b/src/mesa/pipe/cell/spu/spu_tri.h @@ -30,7 +30,7 @@ #define SPU_TRI_H -extern void +extern boolean tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty); diff --git a/src/mesa/pipe/cell/spu/spu_util.c b/src/mesa/pipe/cell/spu/spu_util.c new file mode 100644 index 0000000000..ac373240c1 --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_util.c @@ -0,0 +1,165 @@ +#include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/tgsi/util/tgsi_parse.h" +//#include "tgsi_build.h" +#include "pipe/tgsi/util/tgsi_util.h" + +unsigned +tgsi_util_get_src_register_swizzle( + const struct tgsi_src_register *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->SwizzleX; + case 1: + return reg->SwizzleY; + case 2: + return reg->SwizzleZ; + case 3: + return reg->SwizzleW; + default: + assert( 0 ); + } + return 0; +} + +unsigned +tgsi_util_get_src_register_extswizzle( + const struct tgsi_src_register_ext_swz *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->ExtSwizzleX; + case 1: + return reg->ExtSwizzleY; + case 2: + return reg->ExtSwizzleZ; + case 3: + return reg->ExtSwizzleW; + default: + assert( 0 ); + } + return 0; +} + +unsigned 
+tgsi_util_get_full_src_register_extswizzle( + const struct tgsi_full_src_register *reg, + unsigned component ) +{ + unsigned swizzle; + + /* + * First, calculate the extended swizzle for a given channel. This will give + * us either a channel index into the simple swizzle or a constant 1 or 0. + */ + swizzle = tgsi_util_get_src_register_extswizzle( + &reg->SrcRegisterExtSwz, + component ); + + assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X); + assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y); + assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z); + assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W); + assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W); + assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W); + + /* + * Second, calculate the simple swizzle for the unswizzled channel index. + * Leave the constants intact, they are not affected by the simple swizzle. + */ + if( swizzle <= TGSI_SWIZZLE_W ) { + swizzle = tgsi_util_get_src_register_swizzle( + &reg->SrcRegister, + component ); + } + + return swizzle; +} + +unsigned +tgsi_util_get_src_register_extnegate( + const struct tgsi_src_register_ext_swz *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->NegateX; + case 1: + return reg->NegateY; + case 2: + return reg->NegateZ; + case 3: + return reg->NegateW; + default: + assert( 0 ); + } + return 0; +} + +void +tgsi_util_set_src_register_extnegate( + struct tgsi_src_register_ext_swz *reg, + unsigned negate, + unsigned component ) +{ + switch( component ) { + case 0: + reg->NegateX = negate; + break; + case 1: + reg->NegateY = negate; + break; + case 2: + reg->NegateZ = negate; + break; + case 3: + reg->NegateW = negate; + break; + default: + assert( 0 ); + } +} + +unsigned +tgsi_util_get_full_src_register_sign_mode( + const struct tgsi_full_src_register *reg, + unsigned component ) +{ + unsigned sign_mode; + + if( reg->SrcRegisterExtMod.Absolute ) { + /* Consider only the post-abs negation. 
*/ + + if( reg->SrcRegisterExtMod.Negate ) { + sign_mode = TGSI_UTIL_SIGN_SET; + } + else { + sign_mode = TGSI_UTIL_SIGN_CLEAR; + } + } + else { + /* Accumulate the three negations. */ + + unsigned negate; + + negate = reg->SrcRegister.Negate; + if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) { + negate = !negate; + } + if( reg->SrcRegisterExtMod.Negate ) { + negate = !negate; + } + + if( negate ) { + sign_mode = TGSI_UTIL_SIGN_TOGGLE; + } + else { + sign_mode = TGSI_UTIL_SIGN_KEEP; + } + } + + return sign_mode; +} diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c new file mode 100644 index 0000000000..6e86a919ce --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c @@ -0,0 +1,393 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include <spu_mfcio.h> +#include <transpose_matrix4x4.h> + +#include "pipe/p_util.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "spu_exec.h" +#include "spu_vertex_shader.h" +#include "spu_main.h" + + +#define DRAW_DBG 0 + + +static const vec_float4 defaults = { 0.0, 0.0, 0.0, 1.0 }; + +/** + * Fetch a float[4] vertex attribute from memory, doing format/type + * conversion as needed. + * + * This is probably needed/dupliocated elsewhere, eg format + * conversion, texture sampling etc. + */ +#define FETCH_ATTRIB( NAME, SZ, CVT ) \ +static qword \ +fetch_##NAME(const void *ptr) \ +{ \ + vec_float4 attrib = defaults; \ + int i; \ + \ + for (i = 0; i < SZ; i++) { \ + attrib = spu_insert(CVT, attrib, i); \ + } \ + return (qword) attrib; \ +} + +#define CVT_64_FLOAT (float) ((double *) ptr)[i] +#define CVT_32_FLOAT ((float *) ptr)[i] + +#define CVT_8_USCALED (float) ((unsigned char *) ptr)[i] +#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i] +#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i] + +#define CVT_8_SSCALED (float) ((char *) ptr)[i] +#define CVT_16_SSCALED (float) ((short *) ptr)[i] +#define CVT_32_SSCALED (float) ((int *) ptr)[i] + +#define CVT_8_UNORM (float) ((unsigned char *) ptr)[i] / 255.0f +#define CVT_16_UNORM (float) ((unsigned short *) ptr)[i] / 65535.0f +#define CVT_32_UNORM (float) ((unsigned int *) ptr)[i] / 4294967295.0f + +#define CVT_8_SNORM (float) ((char *) ptr)[i] / 127.0f +#define CVT_16_SNORM (float) ((short *) ptr)[i] / 32767.0f +#define CVT_32_SNORM (float) ((int 
*) ptr)[i] / 2147483647.0f + +FETCH_ATTRIB( R64G64B64A64_FLOAT, 4, CVT_64_FLOAT ) +FETCH_ATTRIB( R64G64B64_FLOAT, 3, CVT_64_FLOAT ) +FETCH_ATTRIB( R64G64_FLOAT, 2, CVT_64_FLOAT ) +FETCH_ATTRIB( R64_FLOAT, 1, CVT_64_FLOAT ) + +FETCH_ATTRIB( R32G32B32A32_FLOAT, 4, CVT_32_FLOAT ) +FETCH_ATTRIB( R32G32B32_FLOAT, 3, CVT_32_FLOAT ) +FETCH_ATTRIB( R32G32_FLOAT, 2, CVT_32_FLOAT ) +FETCH_ATTRIB( R32_FLOAT, 1, CVT_32_FLOAT ) + +FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED ) +FETCH_ATTRIB( R32G32B32_USCALED, 3, CVT_32_USCALED ) +FETCH_ATTRIB( R32G32_USCALED, 2, CVT_32_USCALED ) +FETCH_ATTRIB( R32_USCALED, 1, CVT_32_USCALED ) + +FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED ) +FETCH_ATTRIB( R32G32B32_SSCALED, 3, CVT_32_SSCALED ) +FETCH_ATTRIB( R32G32_SSCALED, 2, CVT_32_SSCALED ) +FETCH_ATTRIB( R32_SSCALED, 1, CVT_32_SSCALED ) + +FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM ) +FETCH_ATTRIB( R32G32B32_UNORM, 3, CVT_32_UNORM ) +FETCH_ATTRIB( R32G32_UNORM, 2, CVT_32_UNORM ) +FETCH_ATTRIB( R32_UNORM, 1, CVT_32_UNORM ) + +FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM ) +FETCH_ATTRIB( R32G32B32_SNORM, 3, CVT_32_SNORM ) +FETCH_ATTRIB( R32G32_SNORM, 2, CVT_32_SNORM ) +FETCH_ATTRIB( R32_SNORM, 1, CVT_32_SNORM ) + +FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED ) +FETCH_ATTRIB( R16G16B16_USCALED, 3, CVT_16_USCALED ) +FETCH_ATTRIB( R16G16_USCALED, 2, CVT_16_USCALED ) +FETCH_ATTRIB( R16_USCALED, 1, CVT_16_USCALED ) + +FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED ) +FETCH_ATTRIB( R16G16B16_SSCALED, 3, CVT_16_SSCALED ) +FETCH_ATTRIB( R16G16_SSCALED, 2, CVT_16_SSCALED ) +FETCH_ATTRIB( R16_SSCALED, 1, CVT_16_SSCALED ) + +FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM ) +FETCH_ATTRIB( R16G16B16_UNORM, 3, CVT_16_UNORM ) +FETCH_ATTRIB( R16G16_UNORM, 2, CVT_16_UNORM ) +FETCH_ATTRIB( R16_UNORM, 1, CVT_16_UNORM ) + +FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM ) +FETCH_ATTRIB( R16G16B16_SNORM, 3, CVT_16_SNORM ) +FETCH_ATTRIB( R16G16_SNORM, 2, 
CVT_16_SNORM ) +FETCH_ATTRIB( R16_SNORM, 1, CVT_16_SNORM ) + +FETCH_ATTRIB( R8G8B8A8_USCALED, 4, CVT_8_USCALED ) +FETCH_ATTRIB( R8G8B8_USCALED, 3, CVT_8_USCALED ) +FETCH_ATTRIB( R8G8_USCALED, 2, CVT_8_USCALED ) +FETCH_ATTRIB( R8_USCALED, 1, CVT_8_USCALED ) + +FETCH_ATTRIB( R8G8B8A8_SSCALED, 4, CVT_8_SSCALED ) +FETCH_ATTRIB( R8G8B8_SSCALED, 3, CVT_8_SSCALED ) +FETCH_ATTRIB( R8G8_SSCALED, 2, CVT_8_SSCALED ) +FETCH_ATTRIB( R8_SSCALED, 1, CVT_8_SSCALED ) + +FETCH_ATTRIB( R8G8B8A8_UNORM, 4, CVT_8_UNORM ) +FETCH_ATTRIB( R8G8B8_UNORM, 3, CVT_8_UNORM ) +FETCH_ATTRIB( R8G8_UNORM, 2, CVT_8_UNORM ) +FETCH_ATTRIB( R8_UNORM, 1, CVT_8_UNORM ) + +FETCH_ATTRIB( R8G8B8A8_SNORM, 4, CVT_8_SNORM ) +FETCH_ATTRIB( R8G8B8_SNORM, 3, CVT_8_SNORM ) +FETCH_ATTRIB( R8G8_SNORM, 2, CVT_8_SNORM ) +FETCH_ATTRIB( R8_SNORM, 1, CVT_8_SNORM ) + +FETCH_ATTRIB( A8R8G8B8_UNORM, 4, CVT_8_UNORM ) +//FETCH_ATTRIB( R8G8B8A8_UNORM, 4, CVT_8_UNORM ) + + + +static spu_fetch_func get_fetch_func( enum pipe_format format ) +{ +#if 0 + { + char tmp[80]; + pf_sprint_name(tmp, format); + _mesa_printf("%s: %s\n", __FUNCTION__, tmp); + } +#endif + + switch (format) { + case PIPE_FORMAT_R64_FLOAT: + return fetch_R64_FLOAT; + case PIPE_FORMAT_R64G64_FLOAT: + return fetch_R64G64_FLOAT; + case PIPE_FORMAT_R64G64B64_FLOAT: + return fetch_R64G64B64_FLOAT; + case PIPE_FORMAT_R64G64B64A64_FLOAT: + return fetch_R64G64B64A64_FLOAT; + + case PIPE_FORMAT_R32_FLOAT: + return fetch_R32_FLOAT; + case PIPE_FORMAT_R32G32_FLOAT: + return fetch_R32G32_FLOAT; + case PIPE_FORMAT_R32G32B32_FLOAT: + return fetch_R32G32B32_FLOAT; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + return fetch_R32G32B32A32_FLOAT; + + case PIPE_FORMAT_R32_UNORM: + return fetch_R32_UNORM; + case PIPE_FORMAT_R32G32_UNORM: + return fetch_R32G32_UNORM; + case PIPE_FORMAT_R32G32B32_UNORM: + return fetch_R32G32B32_UNORM; + case PIPE_FORMAT_R32G32B32A32_UNORM: + return fetch_R32G32B32A32_UNORM; + + case PIPE_FORMAT_R32_USCALED: + return fetch_R32_USCALED; + case 
PIPE_FORMAT_R32G32_USCALED: + return fetch_R32G32_USCALED; + case PIPE_FORMAT_R32G32B32_USCALED: + return fetch_R32G32B32_USCALED; + case PIPE_FORMAT_R32G32B32A32_USCALED: + return fetch_R32G32B32A32_USCALED; + + case PIPE_FORMAT_R32_SNORM: + return fetch_R32_SNORM; + case PIPE_FORMAT_R32G32_SNORM: + return fetch_R32G32_SNORM; + case PIPE_FORMAT_R32G32B32_SNORM: + return fetch_R32G32B32_SNORM; + case PIPE_FORMAT_R32G32B32A32_SNORM: + return fetch_R32G32B32A32_SNORM; + + case PIPE_FORMAT_R32_SSCALED: + return fetch_R32_SSCALED; + case PIPE_FORMAT_R32G32_SSCALED: + return fetch_R32G32_SSCALED; + case PIPE_FORMAT_R32G32B32_SSCALED: + return fetch_R32G32B32_SSCALED; + case PIPE_FORMAT_R32G32B32A32_SSCALED: + return fetch_R32G32B32A32_SSCALED; + + case PIPE_FORMAT_R16_UNORM: + return fetch_R16_UNORM; + case PIPE_FORMAT_R16G16_UNORM: + return fetch_R16G16_UNORM; + case PIPE_FORMAT_R16G16B16_UNORM: + return fetch_R16G16B16_UNORM; + case PIPE_FORMAT_R16G16B16A16_UNORM: + return fetch_R16G16B16A16_UNORM; + + case PIPE_FORMAT_R16_USCALED: + return fetch_R16_USCALED; + case PIPE_FORMAT_R16G16_USCALED: + return fetch_R16G16_USCALED; + case PIPE_FORMAT_R16G16B16_USCALED: + return fetch_R16G16B16_USCALED; + case PIPE_FORMAT_R16G16B16A16_USCALED: + return fetch_R16G16B16A16_USCALED; + + case PIPE_FORMAT_R16_SNORM: + return fetch_R16_SNORM; + case PIPE_FORMAT_R16G16_SNORM: + return fetch_R16G16_SNORM; + case PIPE_FORMAT_R16G16B16_SNORM: + return fetch_R16G16B16_SNORM; + case PIPE_FORMAT_R16G16B16A16_SNORM: + return fetch_R16G16B16A16_SNORM; + + case PIPE_FORMAT_R16_SSCALED: + return fetch_R16_SSCALED; + case PIPE_FORMAT_R16G16_SSCALED: + return fetch_R16G16_SSCALED; + case PIPE_FORMAT_R16G16B16_SSCALED: + return fetch_R16G16B16_SSCALED; + case PIPE_FORMAT_R16G16B16A16_SSCALED: + return fetch_R16G16B16A16_SSCALED; + + case PIPE_FORMAT_R8_UNORM: + return fetch_R8_UNORM; + case PIPE_FORMAT_R8G8_UNORM: + return fetch_R8G8_UNORM; + case PIPE_FORMAT_R8G8B8_UNORM: + return 
fetch_R8G8B8_UNORM; + case PIPE_FORMAT_R8G8B8A8_UNORM: + return fetch_R8G8B8A8_UNORM; + + case PIPE_FORMAT_R8_USCALED: + return fetch_R8_USCALED; + case PIPE_FORMAT_R8G8_USCALED: + return fetch_R8G8_USCALED; + case PIPE_FORMAT_R8G8B8_USCALED: + return fetch_R8G8B8_USCALED; + case PIPE_FORMAT_R8G8B8A8_USCALED: + return fetch_R8G8B8A8_USCALED; + + case PIPE_FORMAT_R8_SNORM: + return fetch_R8_SNORM; + case PIPE_FORMAT_R8G8_SNORM: + return fetch_R8G8_SNORM; + case PIPE_FORMAT_R8G8B8_SNORM: + return fetch_R8G8B8_SNORM; + case PIPE_FORMAT_R8G8B8A8_SNORM: + return fetch_R8G8B8A8_SNORM; + + case PIPE_FORMAT_R8_SSCALED: + return fetch_R8_SSCALED; + case PIPE_FORMAT_R8G8_SSCALED: + return fetch_R8G8_SSCALED; + case PIPE_FORMAT_R8G8B8_SSCALED: + return fetch_R8G8B8_SSCALED; + case PIPE_FORMAT_R8G8B8A8_SSCALED: + return fetch_R8G8B8A8_SSCALED; + + case PIPE_FORMAT_A8R8G8B8_UNORM: + return fetch_A8R8G8B8_UNORM; + + case 0: + return NULL; /* not sure why this is needed */ + + default: + assert(0); + return NULL; + } +} + + +/** + * Fetch vertex attributes for 'count' vertices. + */ +static void generic_vertex_fetch(struct spu_vs_context *draw, + struct spu_exec_machine *machine, + const unsigned *elts, + unsigned count) +{ + unsigned nr_attrs = draw->vertex_fetch.nr_attrs; + unsigned attr; + + assert(count <= 4); + + wait_on_mask(1 << TAG_VERTEX_BUFFER); + +#if DRAW_DBG + printf("SPU: %s count = %u, nr_attrs = %u\n", + __FUNCTION__, count, nr_attrs); +#endif + + /* loop over vertex attributes (vertex shader inputs) + */ + for (attr = 0; attr < nr_attrs; attr++) { + const unsigned pitch = draw->vertex_fetch.pitch[attr]; + const uint64_t src = draw->vertex_fetch.src_ptr[attr]; + const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr]; + unsigned i; + qword p[4]; + + + /* Fetch four attributes for four vertices. + * + * Could fetch directly into AOS format, but this is meant to be + * a prototype for an sse implementation, which would have + * difficulties doing that. 
+ */ + for (i = 0; i < count; i++) { + uint8_t buffer[32] ALIGN16_ATTRIB; + const uint64_t addr = src + (elts[i] * pitch); + const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32; + +#if DRAW_DBG + printf("SPU: fetching = 0x%llx\n", addr); +#endif + mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0); + wait_on_mask(1 << TAG_VERTEX_BUFFER); + + p[i] = (*fetch)(buffer + (addr & 0x0f)); + } + + /* Be nice and zero out any missing vertices: + */ + for (/* empty */; i < 4; i++) + p[i] = si_xor(p[i], p[i]); + + /* Transpose/swizzle into vector-friendly format. Currently + * assuming that all vertex shader inputs are float[4], but this + * isn't true -- if the vertex shader only wants tex0.xy, we + * could optimize for that. + * + * To do so fully without codegen would probably require an + * excessive number of fetch functions, but we could at least + * minimize the transpose step: + */ + _transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p); + } +} + + +void spu_update_vertex_fetch( struct spu_vs_context *draw ) +{ + unsigned i; + + + for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { + draw->vertex_fetch.fetch[i] = + get_fetch_func(draw->vertex_fetch.format[i]); + } + + draw->vertex_fetch.fetch_func = generic_vertex_fetch; +} diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c new file mode 100644 index 0000000000..c1cbbb6d1e --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c @@ -0,0 +1,231 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + * Ian Romanick <idr@us.ibm.com> + */ + +#include <spu_mfcio.h> + +#include "pipe/p_util.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "spu_vertex_shader.h" +#include "spu_exec.h" +#include "pipe/draw/draw_private.h" +#include "pipe/draw/draw_context.h" +#include "pipe/cell/common.h" +#include "spu_main.h" + +static INLINE unsigned +compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr) +{ + unsigned mask = 0; + unsigned i; + + /* Do the hardwired planes first: + */ + if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT; + if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT; + if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT; + if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT; + if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT; + if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT; + + /* Followed by any remaining ones: + */ + for (i = 6; i < nr; i++) { + if (dot4(clip, plane[i]) < 0) + mask |= (1<<i); + } + + return mask; +} + + +/** + * Transform vertices with the current vertex program/shader + * Up to four vertices can be shaded at a time. 
+ * \param vbuffer the input vertex data + * \param elts indexes of four input vertices + * \param count number of vertices to shade [1..4] + * \param vOut array of pointers to four output vertices + */ +static void +run_vertex_program(struct spu_vs_context *draw, + unsigned elts[4], unsigned count, + const uint64_t *vOut) +{ + struct spu_exec_machine *machine = &draw->machine; + unsigned int j; + + ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_ATTRIB_MAX); + ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_ATTRIB_MAX); + const float *scale = draw->viewport.scale; + const float *trans = draw->viewport.translate; + + assert(count <= 4); + + machine->Processor = TGSI_PROCESSOR_VERTEX; + + ASSERT_ALIGN16(draw->constants); + machine->Consts = (float (*)[4]) draw->constants; + + machine->Inputs = ALIGN16_ASSIGN(inputs); + machine->Outputs = ALIGN16_ASSIGN(outputs); + + spu_vertex_fetch( draw, machine, elts, count ); + + /* run shader */ + spu_exec_machine_run( machine ); + + + /* store machine results */ + for (j = 0; j < count; j++) { + unsigned slot; + float x, y, z, w; + unsigned char buffer[sizeof(struct vertex_header) + + MAX_VERTEX_SIZE] ALIGN16_ATTRIB; + struct vertex_header *const tmpOut = + (struct vertex_header *) buffer; + const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header) + + (sizeof(float) * 4 + * draw->num_vs_outputs)); + + mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0); + wait_on_mask(1 << TAG_VERTEX_BUFFER); + + + /* Handle attr[0] (position) specially: + * + * XXX: Computing the clipmask should be done in the vertex + * program as a set of DP4 instructions appended to the + * user-provided code. 
+ */ + x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j]; + y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j]; + z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j]; + w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j]; + + tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane, + draw->nr_planes); + tmpOut->edgeflag = 1; + + /* divide by w */ + w = 1.0f / w; + x *= w; + y *= w; + z *= w; + + /* Viewport mapping */ + tmpOut->data[0][0] = x * scale[0] + trans[0]; + tmpOut->data[0][1] = y * scale[1] + trans[1]; + tmpOut->data[0][2] = z * scale[2] + trans[2]; + tmpOut->data[0][3] = w; + + /* Remaining attributes are packed into sequential post-transform + * vertex attrib slots. + */ + for (slot = 1; slot < draw->num_vs_outputs; slot++) { + tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; + tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; + tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; + tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; + } + + mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0); + } /* loop over vertices */ +} + + +static void +spu_bind_vertex_shader(struct spu_vs_context *draw, + void *uniforms, + void *planes, + unsigned nr_planes, + unsigned num_outputs + ) +{ + draw->constants = (float (*)[4]) uniforms; + + (void) memcpy(draw->plane, planes, sizeof(float) * 4 * nr_planes); + draw->nr_planes = nr_planes; + draw->num_vs_outputs = num_outputs; + + /* specify the shader to interpret/execute */ + spu_exec_machine_init(&draw->machine, + PIPE_MAX_SAMPLERS, + NULL /*samplers*/, + PIPE_SHADER_VERTEX); +} + + +unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32] + ALIGN16_ATTRIB; + +void +spu_execute_vertex_shader(struct spu_vs_context *draw, + const struct cell_command_vs *vs) +{ + unsigned i; + + const uint64_t immediate_addr = vs->shader.immediates; + const unsigned immediate_size = + ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates) + + 
(immediate_addr & 0x0f)); + + mfc_get(immediates, immediate_addr & ~0x0f, immediate_size, + TAG_VERTEX_BUFFER, 0, 0); + + draw->machine.Instructions = (struct tgsi_full_instruction *) + vs->shader.instructions; + draw->machine.NumInstructions = vs->shader.num_instructions; + + draw->machine.Declarations = (struct tgsi_full_declaration *) + vs->shader.declarations; + draw->machine.NumDeclarations = vs->shader.num_declarations; + + draw->vertex_fetch.nr_attrs = vs->nr_attrs; + + wait_on_mask(1 << TAG_VERTEX_BUFFER); + + (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f], + sizeof(float) * 4 * vs->shader.num_immediates); + + spu_bind_vertex_shader(draw, vs->shader.uniforms, + vs->plane, vs->nr_planes, + vs->shader.num_outputs); + + for (i = 0; i < vs->num_elts; i += 4) { + const unsigned batch_size = MIN2(vs->num_elts - i, 4); + + run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]); + } +} diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h new file mode 100644 index 0000000000..c96b93ff0a --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h @@ -0,0 +1,61 @@ +#ifndef SPU_VERTEX_SHADER_H +#define SPU_VERTEX_SHADER_H + +#include "pipe/p_format.h" +#include "spu_exec.h" + +struct spu_vs_context; + +typedef qword (*spu_fetch_func)(const void *ptr); +typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw, + struct spu_exec_machine *machine, + const unsigned *elts, + unsigned count ); + +struct spu_vs_context { + struct pipe_viewport_state viewport; + + struct { + uint64_t src_ptr[PIPE_ATTRIB_MAX]; + unsigned pitch[PIPE_ATTRIB_MAX]; + enum pipe_format format[PIPE_ATTRIB_MAX]; + unsigned nr_attrs; + boolean dirty; + + spu_fetch_func fetch[PIPE_ATTRIB_MAX]; + spu_full_fetch_func fetch_func; + } vertex_fetch; + + /* Clip derived state: + */ + float plane[12][4]; + unsigned nr_planes; + + struct spu_exec_machine machine; + const float (*constants)[4]; + + unsigned 
num_vs_outputs; +}; + +extern void spu_update_vertex_fetch(struct spu_vs_context *draw); + +static INLINE void spu_vertex_fetch(struct spu_vs_context *draw, + struct spu_exec_machine *machine, + const unsigned *elts, + unsigned count) +{ + if (draw->vertex_fetch.dirty) { + spu_update_vertex_fetch(draw); + draw->vertex_fetch.dirty = 0; + } + + (*draw->vertex_fetch.fetch_func)(draw, machine, elts, count); +} + +struct cell_command_vs; + +extern void +spu_execute_vertex_shader(struct spu_vs_context *draw, + const struct cell_command_vs *vs); + +#endif /* SPU_VERTEX_SHADER_H */ diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h new file mode 100644 index 0000000000..ce8ad00339 --- /dev/null +++ b/src/mesa/pipe/cell/spu/spu_ztest.h @@ -0,0 +1,135 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Zbuffer/depth test code. + */ + + +#ifndef SPU_ZTEST_H +#define SPU_ZTEST_H + + +#ifdef __SPU__ +#include <spu_intrinsics.h> +#endif + + + +/** + * Perform Z testing for a 16-bit/value Z buffer. + * + * \param zvals vector of four fragment zvalues as floats + * \param zbuf ptr to vector of ushort[8] zbuffer values. Note that this + * contains the Z values for 2 quads, 8 pixels. + * \param x x coordinate of quad (only lsbit is significant) + * \param inMask indicates which fragments in the quad are alive + * \return new mask indicating which fragments are alive after ztest + */ +static INLINE vector unsigned int +spu_z16_test_less(vector float zvals, vector unsigned short *zbuf, + uint x, vector unsigned int inMask) +{ +#define ZERO 0x80 + vector unsigned int zvals_ui4, zbuf_ui4, mask; + + /* convert floats to uints in [0, 65535] */ + zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */ + zvals_ui4 = spu_rlmask(zvals_ui4, -16); /* right shift 16 */ + + /* XXX this conditional could be removed with a bit of work */ + if (x & 1) { + /* convert zbuffer values from ushorts to uints */ + /* gather lower four ushorts */ + zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf, + (vector unsigned int) *zbuf, + ((vector unsigned char) { + ZERO, ZERO, 8, 9, ZERO, ZERO, 10, 11, + ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15})); + /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */ + mask = spu_cmpgt(zbuf_ui4, zvals_ui4); + /* mask &= inMask */ + mask = spu_and(mask, inMask); + /* zbuf = mask ? 
zval : zbuf */ + zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask); + /* convert zbuffer values from uints back to ushorts, preserve lower 4 */ + *zbuf = (vector unsigned short) + spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf, + ((vector unsigned char) { + 16, 17, 18, 19, 20, 21, 22, 23, + 2, 3, 6, 7, 10, 11, 14, 15})); + } + else { + /* convert zbuffer values from ushorts to uints */ + /* gather upper four ushorts */ + zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf, + (vector unsigned int) *zbuf, + ((vector unsigned char) { + ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3, + ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7})); + /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */ + mask = spu_cmpgt(zbuf_ui4, zvals_ui4); + /* mask &= inMask */ + mask = spu_and(mask, inMask); + /* zbuf = mask ? zval : zbuf */ + zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask); + /* convert zbuffer values from uints back to ushorts, preserve upper 4 */ + *zbuf = (vector unsigned short) + spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf, + ((vector unsigned char) { + 2, 3, 6, 7, 10, 11, 14, 15, + 24, 25, 26, 27, 28, 29, 30, 31})); + } + return mask; +#undef ZERO +} + + +/** + * As above, but Zbuffer values as 32-bit uints + */ +static INLINE vector unsigned int +spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr, + vector unsigned int inMask) +{ + vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr; + + /* convert floats to uints in [0, 0xffffffff] */ + zvals_ui4 = spu_convtu(zvals, 32); + /* mask = (zbuf < zvals_ui4) ? ~0 : 0 */ + mask = spu_cmpgt(zbuf, zvals_ui4); + /* mask &= inMask */ + mask = spu_and(mask, inMask); + /* zbuf = mask ? zval : zbuf */ + *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask); + + return mask; +} + + +#endif /* SPU_ZTEST_H */ |