summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/cell/ppu
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/cell/ppu')
-rw-r--r--src/gallium/drivers/cell/ppu/Makefile1
-rw-r--r--src/gallium/drivers/cell/ppu/cell_batch.c58
-rw-r--r--src/gallium/drivers/cell/ppu/cell_clear.c13
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.c38
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.h53
-rw-r--r--src/gallium/drivers/cell/ppu/cell_fence.c168
-rw-r--r--src/gallium/drivers/cell/ppu/cell_fence.h57
-rw-r--r--src/gallium/drivers/cell/ppu/cell_flush.c2
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fp.c1772
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fragment.c1541
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fragment.h2
-rw-r--r--src/gallium/drivers/cell/ppu/cell_pipe_state.c101
-rw-r--r--src/gallium/drivers/cell/ppu/cell_render.c1
-rw-r--r--src/gallium/drivers/cell/ppu/cell_screen.c18
-rw-r--r--src/gallium/drivers/cell/ppu/cell_spu.c46
-rw-r--r--src/gallium/drivers/cell/ppu/cell_spu.h13
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state.h5
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_emit.c240
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_shader.c7
-rw-r--r--src/gallium/drivers/cell/ppu/cell_surface.c1
-rw-r--r--src/gallium/drivers/cell/ppu/cell_texture.c463
-rw-r--r--src/gallium/drivers/cell/ppu/cell_texture.h18
-rw-r--r--src/gallium/drivers/cell/ppu/cell_vbuf.c6
-rw-r--r--src/gallium/drivers/cell/ppu/cell_vertex_fetch.c33
24 files changed, 4047 insertions, 610 deletions
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b28f4c5c31..9358a47284 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -24,6 +24,7 @@ SOURCES = \
cell_clear.c \
cell_context.c \
cell_draw_arrays.c \
+ cell_fence.c \
cell_flush.c \
cell_gen_fragment.c \
cell_gen_fp.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 16882c0129..962775cd33 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -28,6 +28,7 @@
#include "cell_context.h"
#include "cell_batch.h"
+#include "cell_fence.h"
#include "cell_spu.h"
@@ -42,7 +43,9 @@
uint
cell_get_empty_buffer(struct cell_context *cell)
{
- uint buf = 0, tries = 0;
+ static uint prev_buffer = 0;
+ uint buf = (prev_buffer + 1) % CELL_NUM_BUFFERS;
+ uint tries = 0;
/* Find a buffer that's marked as free by all SPUs */
while (1) {
@@ -58,8 +61,13 @@ cell_get_empty_buffer(struct cell_context *cell)
cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
}
/*
- printf("PPU: ALLOC BUFFER %u\n", buf);
+ printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries);
*/
+ prev_buffer = buf;
+
+ /* release tex buffer associated w/ prev use of this batch buf */
+ cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]);
+
return buf;
}
}
@@ -82,6 +90,37 @@ cell_get_empty_buffer(struct cell_context *cell)
/**
+ * Append a fence command to the current batch buffer.
+ * Note that we're sure there's always room for this because of the
+ * adjusted size check in cell_batch_free_space().
+ */
+static void
+emit_fence(struct cell_context *cell)
+{
+ const uint batch = cell->cur_batch;
+ const uint size = cell->buffer_size[batch];
+ struct cell_command_fence *fence_cmd;
+ struct cell_fence *fence = &cell->fenced_buffers[batch].fence;
+ uint i;
+
+ /* set fence status to emitted, not yet signalled */
+ for (i = 0; i < cell->num_spus; i++) {
+ fence->status[i][0] = CELL_FENCE_EMITTED;
+ }
+
+ ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
+
+ fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
+ fence_cmd->opcode = CELL_CMD_FENCE;
+ fence_cmd->fence = fence;
+
+ /* update batch buffer size */
+ cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
+ assert(sizeof(struct cell_command_fence) % 8 == 0);
+}
+
+
+/**
* Flush the current batch buffer to the SPUs.
* An empty buffer will be found and set as the new current batch buffer
* for subsequent commands/data.
@@ -91,7 +130,7 @@ cell_batch_flush(struct cell_context *cell)
{
static boolean flushing = FALSE;
uint batch = cell->cur_batch;
- const uint size = cell->buffer_size[batch];
+ uint size = cell->buffer_size[batch];
uint spu, cmd_word;
assert(!flushing);
@@ -99,6 +138,14 @@ cell_batch_flush(struct cell_context *cell)
if (size == 0)
return;
+ /* If any texture buffers are fenced against this batch buffer, append a
+ * fence command so they can be released once the batch has executed.
+ */
+ if (cell->fenced_buffers[batch].head) {
+ emit_fence(cell);
+ size = cell->buffer_size[batch];
+ }
+
flushing = TRUE;
assert(batch < CELL_NUM_BUFFERS);
@@ -139,6 +186,7 @@ uint
cell_batch_free_space(const struct cell_context *cell)
{
uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
+ free -= sizeof(struct cell_command_fence);
return free;
}
@@ -169,7 +217,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
size = cell->buffer_size[cell->cur_batch];
- if (size + bytes > CELL_BUFFER_SIZE) {
+ if (bytes > cell_batch_free_space(cell)) {
cell_batch_flush(cell);
size = 0;
}
@@ -223,7 +271,7 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
padbytes = (alignment - (size % alignment)) % alignment;
- if (padbytes + size + bytes > CELL_BUFFER_SIZE) {
+ if (padbytes + bytes > cell_batch_free_space(cell)) {
cell_batch_flush(cell);
size = 0;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index c9c0c721bb..037635e466 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
clr->surface = surfIndex;
clr->value = clearValue;
}
+
+ /* Technically, the surface's contents are now known and cleared,
+ * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But
+ * it turns out it's quite painful to recognize when any particular
+ * surface goes from PIPE_SURFACE_STATUS_CLEAR to
+ * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because
+ * the drawing commands could be operating on numerous draw buffers,
+ * which we'd have to iterate through to set all their statuses...
+ * For now, we cheat a bit and set the surface's status to DEFINED
+ * right here. Later we should revisit this and set the status to
+ * CLEAR here, and find a better place to set the status to DEFINED.
+ */
+ ps->status = PIPE_SURFACE_STATUS_DEFINED;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 71f1a3049d..22d552d8e3 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -47,6 +47,7 @@
#include "cell_clear.h"
#include "cell_context.h"
#include "cell_draw_arrays.h"
+#include "cell_fence.h"
#include "cell_flush.h"
#include "cell_state.h"
#include "cell_surface.h"
@@ -62,6 +63,8 @@ cell_destroy_context( struct pipe_context *pipe )
{
struct cell_context *cell = cell_context(pipe);
+ util_delete_keymap(cell->fragment_ops_cache, NULL);
+
cell_spu_exit(cell);
align_free(cell);
@@ -85,13 +88,16 @@ cell_draw_create(struct cell_context *cell)
}
-#ifdef DEBUG
static const struct debug_named_value cell_debug_flags[] = {
{"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+ {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */
{"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */
+ {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
+ {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
+ {"cmd", CELL_DEBUG_CMD}, /**< SPUs dump command buffer info */
+ {"cache", CELL_DEBUG_CACHE}, /**< report texture cache stats on exit */
{NULL, 0}
};
-#endif
struct pipe_context *
@@ -99,6 +105,7 @@ cell_create_context(struct pipe_screen *screen,
struct cell_winsys *cws)
{
struct cell_context *cell;
+ uint i;
/* some fields need to be 16-byte aligned, so align the whole object */
cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
@@ -125,11 +132,14 @@ cell_create_context(struct pipe_screen *screen,
cell_init_state_functions(cell);
cell_init_shader_functions(cell);
cell_init_surface_functions(cell);
- cell_init_texture_functions(cell);
cell_init_vertex_functions(cell);
cell->draw = cell_draw_create(cell);
+ /* Create cache of fragment ops generated code */
+ cell->fragment_ops_cache =
+ util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL);
+
cell_init_vbuf(cell);
draw_set_rasterize_stage(cell->draw, cell->vbuf);
@@ -143,17 +153,31 @@ cell_create_context(struct pipe_screen *screen,
cell_debug_flags,
0 );
+ for (i = 0; i < CELL_NUM_BUFFERS; i++)
+ cell_fence_init(&cell->fenced_buffers[i].fence);
+
+
/*
* SPU stuff
*/
- cell->num_spus = 6;
- /* XXX is this in SDK 3.0 only?
- cell->num_spus = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);
- */
+ /* This call only works with SDK 3.0. Anyone still using 2.1??? */
+ cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1);
+ cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0);
+ if (cell->debug_flags) {
+ printf("Cell: found %d Cell(s) with %u SPUs\n",
+ cell->num_cells, cell->num_spus);
+ }
+ if (getenv("CELL_NUM_SPUS")) {
+ cell->num_spus = atoi(getenv("CELL_NUM_SPUS"));
+ assert(cell->num_spus > 0);
+ }
cell_start_spus(cell);
cell_init_batch_buffers(cell);
+ /* make sure SPU initializations are done before proceeding */
+ cell_flush_int(cell, CELL_FLUSH_WAIT);
+
return &cell->pipe;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 14914b9c6f..eb1397bb3f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -38,6 +38,7 @@
#include "cell/common.h"
#include "rtasm/rtasm_ppc_spe.h"
#include "tgsi/tgsi_scan.h"
+#include "util/u_keymap.h"
struct cell_vbuf_render;
@@ -67,31 +68,29 @@ struct cell_fragment_shader_state
/**
- * Cell blend state atom, subclass of pipe_blend_state.
+ * Key for mapping per-fragment state to cached SPU machine code.
+ * keymap(cell_fragment_ops_key) => cell_command_fragment_ops
*/
-struct cell_blend_state
+struct cell_fragment_ops_key
{
- struct pipe_blend_state base;
-
- /**
- * Generated code to perform alpha blending
- */
- struct spe_function code;
+ struct pipe_blend_state blend;
+ struct pipe_blend_color blend_color;
+ struct pipe_depth_stencil_alpha_state dsa;
+ enum pipe_format color_format;
+ enum pipe_format zs_format;
};
+struct cell_buffer_node;
+
/**
- * Cell depth/stencil/alpha state atom, subclass of
- * pipe_depth_stencil_alpha_state.
+ * Fenced buffer list. List of buffers which can be unreferenced after
+ * the fence has been executed/signalled.
*/
-struct cell_depth_stencil_alpha_state
+struct cell_buffer_list
{
- struct pipe_depth_stencil_alpha_state base;
-
- /**
- * Generated code to perform alpha, stencil, and depth testing on the SPE
- */
- struct spe_function code;
+ struct cell_fence fence ALIGN16_ATTRIB;
+ struct cell_buffer_node *head;
};
@@ -104,10 +103,10 @@ struct cell_context
struct cell_winsys *winsys;
- const struct cell_blend_state *blend;
+ const struct pipe_blend_state *blend;
const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
uint num_samplers;
- const struct cell_depth_stencil_alpha_state *depth_stencil;
+ const struct pipe_depth_stencil_alpha_state *depth_stencil;
const struct pipe_rasterizer_state *rasterizer;
const struct cell_vertex_shader_state *vs;
const struct cell_fragment_shader_state *fs;
@@ -135,6 +134,11 @@ struct cell_context
uint *tex_map;
uint dirty;
+ uint dirty_textures; /* bitmask of texture units */
+ uint dirty_samplers; /* bitmask of sampler units */
+
+ /** Cache of code generated for per-fragment ops */
+ struct keymap *fragment_ops_cache;
/** The primitive drawing context */
struct draw_context *draw;
@@ -149,8 +153,9 @@ struct cell_context
/** Mapped constant buffers */
void *mapped_constants[PIPE_SHADER_TYPES];
+ struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
- uint num_spus;
+ uint num_cells, num_spus;
/** Buffers for command batches, vertex/index data */
uint buffer_size[CELL_NUM_BUFFERS];
@@ -162,6 +167,14 @@ struct cell_context
uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
+ /** Associated with each command/batch buffer is a list of pipe_buffers
+ * that are fenced. When the last command in a buffer is executed, the
+ * fence will be signalled, indicating that any pipe_buffers preceding
+ * that fence can be unreferenced (and probably freed).
+ */
+ struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS];
+
+
struct spe_function attrib_fetch;
unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
new file mode 100644
index 0000000000..867b5dcaa0
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -0,0 +1,168 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <unistd.h>
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_fence.h"
+#include "cell_texture.h"
+
+
+void
+cell_fence_init(struct cell_fence *fence)
+{
+ uint i;
+ ASSERT_ALIGN16(fence->status);
+ for (i = 0; i < CELL_MAX_SPUS; i++) {
+ fence->status[i][0] = CELL_FENCE_IDLE;
+ }
+}
+
+
+boolean
+cell_fence_signalled(const struct cell_context *cell,
+ const struct cell_fence *fence)
+{
+ uint i;
+ for (i = 0; i < cell->num_spus; i++) {
+ if (fence->status[i][0] != CELL_FENCE_SIGNALLED)
+ return FALSE;
+ /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/
+ }
+ return TRUE;
+}
+
+
+void
+cell_fence_finish(const struct cell_context *cell,
+ const struct cell_fence *fence)
+{
+ while (!cell_fence_signalled(cell, fence)) {
+ usleep(10);
+ }
+
+#ifdef DEBUG
+ {
+ uint i;
+ for (i = 0; i < cell->num_spus; i++) {
+ assert(fence->status[i][0] == CELL_FENCE_SIGNALLED);
+ }
+ }
+#endif
+}
+
+
+
+
+struct cell_buffer_node
+{
+ struct pipe_buffer *buffer;
+ struct cell_buffer_node *next;
+};
+
+
+static void
+cell_add_buffer_to_list(struct cell_context *cell,
+ struct cell_buffer_list *list,
+ struct pipe_buffer *buffer)
+{
+ struct pipe_screen *ps = cell->pipe.screen;
+ struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node);
+ /* create new list node which references the buffer, insert at head */
+ if (node) {
+ pipe_buffer_reference(ps, &node->buffer, buffer);
+ node->next = list->head;
+ list->head = node;
+ }
+}
+
+
+/**
+ * Wait for completion of the given fence, then unreference any buffers
+ * on the list.
+ * This typically unrefs/frees texture buffers after any rendering which uses
+ * them has completed.
+ */
+void
+cell_free_fenced_buffers(struct cell_context *cell,
+ struct cell_buffer_list *list)
+{
+ if (list->head) {
+ struct pipe_screen *ps = cell->pipe.screen;
+ struct cell_buffer_node *node;
+
+ cell_fence_finish(cell, &list->fence);
+
+ /* traverse the list, unreferencing buffers, freeing nodes */
+ node = list->head;
+ while (node) {
+ struct cell_buffer_node *next = node->next;
+ assert(node->buffer);
+ pipe_buffer_unmap(ps, node->buffer);
+#if 0
+ printf("Unref buffer %p\n", node->buffer);
+ if (node->buffer->refcount == 1)
+ printf(" Delete!\n");
+#endif
+ pipe_buffer_reference(ps, &node->buffer, NULL);
+ FREE(node);
+ node = next;
+ }
+ list->head = NULL;
+ }
+}
+
+
+/**
+ * This should be called for each render command.
+ * Any texture buffers that are currently bound will be added to a fenced
+ * list to be freed later when the fence is executed/signalled.
+ */
+void
+cell_add_fenced_textures(struct cell_context *cell)
+{
+ struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch];
+ uint i;
+
+ for (i = 0; i < cell->num_textures; i++) {
+ struct cell_texture *ct = cell->texture[i];
+ if (ct) {
+ uint level;
+ for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+ if (ct->tiled_buffer[level]) {
+#if 0
+ printf("Adding texture %p buffer %p to list\n",
+ ct, ct->tiled_buffer[level]);
+#endif
+ cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]);
+ }
+ }
+ }
+ }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h
new file mode 100644
index 0000000000..536b4ba411
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef CELL_FENCE_H
+#define CELL_FENCE_H
+
+
+extern void
+cell_fence_init(struct cell_fence *fence);
+
+
+extern boolean
+cell_fence_signalled(const struct cell_context *cell,
+ const struct cell_fence *fence);
+
+
+extern void
+cell_fence_finish(const struct cell_context *cell,
+ const struct cell_fence *fence);
+
+
+
+extern void
+cell_free_fenced_buffers(struct cell_context *cell,
+ struct cell_buffer_list *list);
+
+
+extern void
+cell_add_fenced_textures(struct cell_context *cell);
+
+
+#endif /* CELL_FENCE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 6596b72010..a64967b4b9 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
flags |= CELL_FLUSH_WAIT;
}
- if (flags & PIPE_FLUSH_SWAPBUFFERS)
+ if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE))
flags |= CELL_FLUSH_WAIT;
draw_flush( cell->draw );
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 6ffe94eb14..96a1743fc1 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -37,7 +37,7 @@
* \author Brian Paul
*/
-
+#include <math.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_shader_tokens.h"
@@ -51,25 +51,43 @@
#include "cell_gen_fp.h"
-/** Set to 1 to enable debug/disassembly printfs */
-#define DISASSEM 01
+#define MAX_TEMPS 16
+#define MAX_IMMED 8
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
/**
* Context needed during code generation.
*/
struct codegen
{
+ struct cell_context *cell;
int inputs_reg; /**< 1st function parameter */
int outputs_reg; /**< 2nd function parameter */
int constants_reg; /**< 3rd function parameter */
- int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */
+ int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
+ int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */
+
+ int num_imm; /**< number of immediates */
int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */
/** Per-instruction temps / intermediate temps */
int num_itemps;
- int itemps[3];
+ int itemps[12];
+
+ /** Current IF/ELSE/ENDIF nesting level */
+ int if_nesting;
+ /** Index of execution mask register */
+ int exec_mask_reg;
+
+ /** KIL mask: indicates which fragments have been killed */
+ int kill_mask_reg;
+
+ int frame_size; /**< Stack frame size, in words */
struct spe_function *f;
boolean error;
@@ -112,19 +130,78 @@ get_const_one_reg(struct codegen *gen)
{
if (gen->one_reg <= 0) {
gen->one_reg = spe_allocate_available_register(gen->f);
- }
- /* one = {1.0, 1.0, 1.0, 1.0} */
- spe_load_float(gen->f, gen->one_reg, 1.0f);
-#if DISASSEM
- printf("il\tr%d, 1.0f\n", gen->one_reg);
-#endif
+ spe_indent(gen->f, 4);
+ spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
+
+ /* one = {1.0, 1.0, 1.0, 1.0} */
+ spe_load_float(gen->f, gen->one_reg, 1.0f);
+
+ spe_indent(gen->f, -4);
+ }
return gen->one_reg;
}
/**
+ * Return index of the pixel execution mask.
+ * The register is allocated and initialized upon the first call.
+ *
+ * The pixel execution mask controls which pixels in a quad are
+ * modified, according to surrounding conditionals, loops, etc.
+ */
+static int
+get_exec_mask_reg(struct codegen *gen)
+{
+ if (gen->exec_mask_reg <= 0) {
+ gen->exec_mask_reg = spe_allocate_available_register(gen->f);
+
+ spe_indent(gen->f, 4);
+ spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
+
+ /* exec_mask = {~0, ~0, ~0, ~0} */
+ spe_load_int(gen->f, gen->exec_mask_reg, ~0);
+
+ spe_indent(gen->f, -4);
+ }
+
+ return gen->exec_mask_reg;
+}
+
+
+static boolean
+is_register_src(struct codegen *gen, int channel,
+ const struct tgsi_full_src_register *src)
+{
+ int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+ int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+
+ if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) {
+ return FALSE;
+ }
+ if (src->SrcRegister.File == TGSI_FILE_TEMPORARY ||
+ src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
+ return TRUE;
+ }
+ return FALSE;
+}
+
+
+static boolean
+is_memory_dst(struct codegen *gen, int channel,
+ const struct tgsi_full_dst_register *dst)
+{
+ if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
+ return TRUE;
+ }
+ else {
+ return FALSE;
+ }
+}
+
+
+/**
* Return the index of the SPU temporary containing the named TGSI
* source register. If the TGSI register is a TGSI_FILE_TEMPORARY we
* just return the corresponding SPE register. If the TGIS register
@@ -136,35 +213,93 @@ get_src_reg(struct codegen *gen,
int channel,
const struct tgsi_full_src_register *src)
{
- int reg;
+ int reg = -1;
+ int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+ boolean reg_is_itemp = FALSE;
+ uint sign_op;
+
+ assert(swizzle >= TGSI_SWIZZLE_X);
+ assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
+
+ if (swizzle == TGSI_EXTSWIZZLE_ONE) {
+ /* Load const one float and early out */
+ reg = get_const_one_reg(gen);
+ }
+ else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
+ /* Load const zero float and early out */
+ reg = get_itemp(gen);
+ spe_xor(gen->f, reg, reg, reg);
+ }
+ else {
+ assert(swizzle < 4);
- /* XXX need to examine src swizzle info here.
- * That will involve changing the channel var...
+ switch (src->SrcRegister.File) {
+ case TGSI_FILE_TEMPORARY:
+ reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
+ break;
+ case TGSI_FILE_INPUT:
+ {
+ /* offset is measured in quadwords, not bytes */
+ int offset = src->SrcRegister.Index * 4 + swizzle;
+ reg = get_itemp(gen);
+ reg_is_itemp = TRUE;
+ /* Load: reg = memory[(machine_reg) + offset] */
+ spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
+ }
+ break;
+ case TGSI_FILE_IMMEDIATE:
+ reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
+ break;
+ case TGSI_FILE_CONSTANT:
+ {
+ /* offset is measured in quadwords, not bytes */
+ int offset = src->SrcRegister.Index * 4 + swizzle;
+ reg = get_itemp(gen);
+ reg_is_itemp = TRUE;
+ /* Load: reg = memory[(machine_reg) + offset] */
+ spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
+ }
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ /*
+ * Handle absolute value, negate or set-negative of src register.
*/
+ sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+ if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+ /*
+ * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
+ */
+ const int bit31mask_reg = get_itemp(gen);
+ int result_reg;
+
+ if (reg_is_itemp) {
+ /* re-use 'reg' for the result */
+ result_reg = reg;
+ }
+ else {
+ /* alloc a new reg for the result */
+ result_reg = get_itemp(gen);
+ }
+ /* mask with bit 31 set, the rest cleared */
+ spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
- switch (src->SrcRegister.File) {
- case TGSI_FILE_TEMPORARY:
- reg = gen->temp_regs[src->SrcRegister.Index][channel];
- break;
- case TGSI_FILE_INPUT:
- {
- /* offset is measured in quadwords, not bytes */
- int offset = src->SrcRegister.Index * 4 + channel;
- reg = get_itemp(gen);
- /* Load: reg = memory[(machine_reg) + offset] */
- spe_lqd(gen->f, reg, gen->inputs_reg, offset);
-#if DISASSEM
- printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
-#endif
+ if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
+ spe_andc(gen->f, result_reg, reg, bit31mask_reg);
}
- break;
- case TGSI_FILE_IMMEDIATE:
- /* xxx fall-through for now / fix */
- case TGSI_FILE_CONSTANT:
- /* xxx fall-through for now / fix */
- default:
- assert(0);
+ else if (sign_op == TGSI_UTIL_SIGN_SET) {
+ spe_and(gen->f, result_reg, reg, bit31mask_reg);
+ }
+ else {
+ assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
+ spe_xor(gen->f, result_reg, reg, bit31mask_reg);
+ }
+
+ reg = result_reg;
}
return reg;
@@ -183,11 +318,14 @@ get_dst_reg(struct codegen *gen,
int channel,
const struct tgsi_full_dst_register *dest)
{
- int reg;
+ int reg = -1;
switch (dest->DstRegister.File) {
case TGSI_FILE_TEMPORARY:
- reg = gen->temp_regs[dest->DstRegister.Index][channel];
+ if (gen->if_nesting > 0)
+ reg = get_itemp(gen);
+ else
+ reg = gen->temp_regs[dest->DstRegister.Index][channel];
break;
case TGSI_FILE_OUTPUT:
reg = get_itemp(gen);
@@ -211,19 +349,59 @@ store_dest_reg(struct codegen *gen,
int value_reg, int channel,
const struct tgsi_full_dst_register *dest)
{
+ /*
+ * XXX need to implement dst reg clamping/saturation
+ */
+#if 0
+ switch (inst->Instruction.Saturate) {
+ case TGSI_SAT_NONE:
+ break;
+ case TGSI_SAT_ZERO_ONE:
+ break;
+ case TGSI_SAT_MINUS_PLUS_ONE:
+ break;
+ default:
+ assert( 0 );
+ }
+#endif
+
switch (dest->DstRegister.File) {
case TGSI_FILE_TEMPORARY:
- /* no-op */
+ if (gen->if_nesting > 0) {
+ int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
+ int exec_reg = get_exec_mask_reg(gen);
+ /* Mix d with new value according to exec mask:
+ * d[i] = mask_reg[i] ? value_reg : d_reg
+ */
+ spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
+ }
+ else {
+ /* we're not inside a condition or loop: do nothing special */
+
+ }
break;
case TGSI_FILE_OUTPUT:
{
/* offset is measured in quadwords, not bytes */
int offset = dest->DstRegister.Index * 4 + channel;
- /* Store: memory[(machine_reg) + offset] = reg */
- spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
-#if DISASSEM
- printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset);
-#endif
+ if (gen->if_nesting > 0) {
+ int exec_reg = get_exec_mask_reg(gen);
+ int curval_reg = get_itemp(gen);
+ /* First read the current value from memory:
+ * Load: curval = memory[(machine_reg) + offset]
+ */
+ spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
+ /* Mix curval with newvalue according to exec mask:
+ * curval[i] = exec_mask[i] ? value_reg[i] : curval[i]
+ */
+ spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
+ /* Store: memory[(machine_reg) + offset] = curval */
+ spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
+ }
+ else {
+ /* Store: memory[(machine_reg) + offset] = reg */
+ spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
+ }
}
break;
default:
@@ -232,27 +410,114 @@ store_dest_reg(struct codegen *gen,
}
+
+static void
+emit_prologue(struct codegen *gen)
+{
+ gen->frame_size = 1024; /* XXX temporary, should be dynamic */
+
+ spe_comment(gen->f, -4, "Function prologue:");
+
+ /* save $lr on stack # stqd $lr,16($sp) */
+ spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+ if (gen->frame_size >= 512) {
+ /* offset is too large for ai instruction */
+ int offset_reg = spe_allocate_available_register(gen->f);
+ int sp_reg = spe_allocate_available_register(gen->f);
+ /* offset = -framesize */
+ spe_load_int(gen->f, offset_reg, -gen->frame_size);
+ /* sp = $sp */
+ spe_move(gen->f, sp_reg, SPE_REG_SP);
+ /* $sp = $sp + offset_reg */
+ spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+ /* save $sp in stack frame */
+ spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0);
+ /* clean up */
+ spe_release_register(gen->f, offset_reg);
+ spe_release_register(gen->f, sp_reg);
+ }
+ else {
+ /* save stack pointer # stqd $sp,-frameSize($sp) */
+ spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+
+ /* adjust stack pointer # ai $sp,$sp,-frameSize */
+ spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+ }
+}
+
+
+static void
+emit_epilogue(struct codegen *gen)
+{
+ const int return_reg = 3;
+
+ spe_comment(gen->f, -4, "Function epilogue:");
+
+ spe_comment(gen->f, 0, "return the killed mask");
+ if (gen->kill_mask_reg > 0) {
+ /* shader called KIL, return the "alive" mask */
+ spe_move(gen->f, return_reg, gen->kill_mask_reg);
+ }
+ else {
+ /* return {0,0,0,0} */
+ spe_load_uint(gen->f, return_reg, 0);
+ }
+
+ spe_comment(gen->f, 0, "restore stack and return");
+ if (gen->frame_size >= 512) {
+ /* offset is too large for ai instruction */
+ int offset_reg = spe_allocate_available_register(gen->f);
+ /* offset = framesize */
+ spe_load_int(gen->f, offset_reg, gen->frame_size);
+ /* $sp = $sp + offset */
+ spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+ /* clean up */
+ spe_release_register(gen->f, offset_reg);
+ }
+ else {
+ /* restore stack pointer # ai $sp,$sp,frameSize */
+ spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+ }
+
+ /* restore $lr # lqd $lr,16($sp) */
+ spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+ /* return from function call */
+ spe_bi(gen->f, SPE_REG_RA, 0, 0);
+}
+
+
+/**
+ * Emit code for a MOV: copy each write-enabled source channel to the
+ * corresponding destination channel.
+ */
static boolean
emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, src_reg[4], dst_reg[4];
+
+ spe_comment(gen->f, -4, "MOV:");
+ /* first pass: look up the src and dst registers of every enabled channel */
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* XXX we don't always need to actually emit a mov instruction here */
- spe_move(gen->f, dst_reg, src_reg);
-#if DISASSEM
- printf("mov\tr%d, r%d\n", dst_reg, src_reg);
-#endif
- store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
+ src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+
+ /* second pass: emit the move and/or store for every enabled channel */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) &&
+ is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) {
+ /* special-case: register to memory store; the src register is
+ * stored directly, so no mov into dst_reg is emitted */
+ store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]);
+ }
+ else {
+ spe_move(gen->f, dst_reg[ch], src_reg[ch]);
+ store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
+ }
free_itemps(gen);
}
}
return true;
}
-
/**
* Emit addition instructions. Recall that a single TGSI_OPCODE_ADD
* becomes (up to) four SPU "fa" instructions because we're doing SOA
@@ -261,24 +526,25 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
- /* Loop over Red/Green/Blue/Alpha channels */
+ int ch, s1_reg[4], s2_reg[4], d_reg[4];
+
+ spe_comment(gen->f, -4, "ADD:");
+ /* Loop over Red/Green/Blue/Alpha channels, look up the two src
+ * registers and the dest register of each channel */
for (ch = 0; ch < 4; ch++) {
/* If the dest R, G, B or A writemask is enabled... */
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- /* get indexes of the two src, one dest SPE registers */
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
/* Emit actual SPE instruction: d = s1 + s2 */
- spe_fa(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
- printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
-
+ spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
/* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
/* Free any intermediate temps we allocated */
free_itemps(gen);
}
@@ -286,6 +552,99 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
return true;
}
+/**
+ * Emit subtraction: dst = src0 - src1, one SPU "fs" instruction per
+ * write-enabled channel.  See emit_ADD for comments.
+ */
+static boolean
+emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int a_reg[4], b_reg[4], out_reg[4];
+   int chan;
+
+   spe_comment(gen->f, -4, "SUB:");
+
+   /* pass 1: look up operand and result registers */
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      a_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      b_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[1]);
+      out_reg[chan] = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+   }
+
+   /* pass 2: out = a - b, then store */
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      spe_fs(gen->f, out_reg[chan], a_reg[chan], b_reg[chan]);
+      store_dest_reg(gen, out_reg[chan], chan, &inst->FullDstRegisters[0]);
+      free_itemps(gen);
+   }
+
+   return true;
+}
+
+/**
+ * Emit multiply-add: dst = src0 * src1 + src2, one SPU "fma"
+ * instruction per write-enabled channel.  See emit_ADD for comments.
+ */
+static boolean
+emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int a_reg[4], b_reg[4], c_reg[4], out_reg[4];
+   int chan;
+
+   spe_comment(gen->f, -4, "MAD:");
+
+   /* pass 1: look up operand and result registers */
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      a_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      b_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[1]);
+      c_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[2]);
+      out_reg[chan] = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+   }
+
+   /* pass 2: out = a * b + c, then store */
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      spe_fma(gen->f, out_reg[chan], a_reg[chan], b_reg[chan], c_reg[chan]);
+      store_dest_reg(gen, out_reg[chan], chan, &inst->FullDstRegisters[0]);
+      free_itemps(gen);
+   }
+
+   return true;
+}
+
+
+/**
+ * Emit linear interpolate: dst = src2 + src0 * (src1 - src2).
+ * See emit_ADD for comments.
+ */
+static boolean
+emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int weight_reg[4], a_reg[4], b_reg[4], out_reg[4], diff_reg[4];
+   int chan;
+
+   spe_comment(gen->f, -4, "LERP:");
+
+   /* look up src/dst registers and grab one scratch register per channel */
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      weight_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      a_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[1]);
+      b_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[2]);
+      out_reg[chan] = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+      diff_reg[chan] = get_itemp(gen);
+   }
+
+   /* The three stages are emitted channel-by-channel (all subtracts,
+    * then all fma, then all stores) to better pipeline.
+    */
+
+   /* diff = a - b */
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      spe_fs(gen->f, diff_reg[chan], a_reg[chan], b_reg[chan]);
+   }
+
+   /* out = diff * weight + b */
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      spe_fma(gen->f, out_reg[chan], diff_reg[chan], weight_reg[chan],
+              b_reg[chan]);
+   }
+
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      store_dest_reg(gen, out_reg[chan], chan, &inst->FullDstRegisters[0]);
+   }
+
+   free_itemps(gen);
+   return true;
+}
/**
* Emit multiply. See emit_ADD for comments.
@@ -293,17 +652,63 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int a_reg[4], b_reg[4], out_reg[4];
+   int chan;
+
+   spe_comment(gen->f, -4, "MUL:");
+
+   /* pass 1: look up operand and result registers */
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      a_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      b_reg[chan] = get_src_reg(gen, chan, &inst->FullSrcRegisters[1]);
+      out_reg[chan] = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+   }
+
+   /* pass 2: out = a * b, then store */
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      spe_fm(gen->f, out_reg[chan], a_reg[chan], b_reg[chan]);
+      store_dest_reg(gen, out_reg[chan], chan, &inst->FullDstRegisters[0]);
+      free_itemps(gen);
+   }
+
+   return true;
+}
+
+/**
+ * Emit reciprocal: d = 1 / s1 for each write-enabled channel.
+ * See emit_ADD for comments.
+ */
+static boolean
+emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
int ch;
+ spe_comment(gen->f, -4, "RCP:");
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* d = s1 * s2 */
- spe_fm(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
- printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
+ /* d = 1/s1: frest writes an estimate to d, then fi combines that
+ * estimate with s1.
+ * NOTE(review): frest writes d before fi reads s1, so this looks
+ * wrong if d_reg and s1_reg can be the same register - confirm
+ * against get_src_reg/get_dst_reg. */
+ spe_frest(gen->f, d_reg, s1_reg);
+ spe_fi(gen->f, d_reg, s1_reg, d_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
+
+/**
+ * Emit reciprocal sqrt: d = 1 / sqrt(s1) for each write-enabled channel.
+ * See emit_ADD for comments.
+ */
+static boolean
+emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "RSQ:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ /* d = 1/sqrt(s1): frsqest writes an estimate to d, then fi combines
+ * that estimate with s1.
+ * NOTE(review): frsqest writes d before fi reads s1, so this looks
+ * wrong if d_reg and s1_reg can be the same register - confirm
+ * against get_src_reg/get_dst_reg. */
+ spe_frsqest(gen->f, d_reg, s1_reg);
+ spe_fi(gen->f, d_reg, s1_reg, d_reg);
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
@@ -311,6 +716,270 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
return true;
}
+/**
+ * Emit absolute value: d = s1 with its sign bit (bit 31) cleared, for
+ * each write-enabled channel.  See emit_ADD for comments.
+ */
+static boolean
+emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "ABS:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         const int bit31mask_reg = get_itemp(gen);
+
+         /* mask with bit 31 set, the rest cleared.  The constant must be
+          * unsigned: shifting a signed 1 into the sign bit is undefined.
+          */
+         spe_load_uint(gen->f, bit31mask_reg, (1u << 31));
+
+         /* d = s1 & ~mask, i.e. sign bit cleared in s1 */
+         spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         /* releases the mask register too; it is re-created per channel */
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit 3-component dot product: x0*x1 + y0*y1 + z0*z1, replicated to
+ * every write-enabled dst channel.  See emit_ADD for comments.
+ */
+static boolean
+emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int acc_reg = get_itemp(gen), part_reg = get_itemp(gen);
+   int ax_reg, ay_reg, az_reg;
+   int bx_reg, by_reg, bz_reg;
+   int chan;
+
+   spe_comment(gen->f, -4, "DP3:");
+
+   ax_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   bx_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   ay_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   by_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   az_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   bz_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+
+   /* acc = ax * bx */
+   spe_fm(gen->f, acc_reg, ax_reg, bx_reg);
+   /* part = ay * by */
+   spe_fm(gen->f, part_reg, ay_reg, by_reg);
+   /* acc = az * bz + acc */
+   spe_fma(gen->f, acc_reg, az_reg, bz_reg, acc_reg);
+   /* acc = acc + part */
+   spe_fa(gen->f, acc_reg, acc_reg, part_reg);
+
+   /* copy the scalar result into each enabled dst channel */
+   for (chan = 0; chan < 4; chan++) {
+      int out_reg;
+
+      if (!(writemask & (1 << chan)))
+         continue;
+
+      out_reg = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+      spe_move(gen->f, out_reg, acc_reg);
+      store_dest_reg(gen, out_reg, chan, &inst->FullDstRegisters[0]);
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit 4-component dot product: x0*x1 + y0*y1 + z0*z1 + w0*w1,
+ * replicated to every write-enabled dst channel.
+ * See emit_ADD for comments.
+ */
+static boolean
+emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int even_reg = get_itemp(gen), odd_reg = get_itemp(gen);
+   int ax_reg, ay_reg, az_reg, aw_reg;
+   int bx_reg, by_reg, bz_reg, bw_reg;
+   int chan;
+
+   spe_comment(gen->f, -4, "DP4:");
+
+   ax_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   bx_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   ay_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   by_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   az_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   bz_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   aw_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+   bw_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+
+   /* even = ax * bx */
+   spe_fm(gen->f, even_reg, ax_reg, bx_reg);
+   /* odd = ay * by */
+   spe_fm(gen->f, odd_reg, ay_reg, by_reg);
+   /* even = az * bz + even */
+   spe_fma(gen->f, even_reg, az_reg, bz_reg, even_reg);
+   /* odd = aw * bw + odd */
+   spe_fma(gen->f, odd_reg, aw_reg, bw_reg, odd_reg);
+   /* even = even + odd */
+   spe_fa(gen->f, even_reg, even_reg, odd_reg);
+
+   /* copy the scalar result into each enabled dst channel */
+   for (chan = 0; chan < 4; chan++) {
+      int out_reg;
+
+      if (!(writemask & (1 << chan)))
+         continue;
+
+      out_reg = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+      spe_move(gen->f, out_reg, even_reg);
+      store_dest_reg(gen, out_reg, chan, &inst->FullDstRegisters[0]);
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit homogeneous dot product: x0*x1 + y0*y1 + z0*z1 + w1, replicated
+ * to every write-enabled dst channel.  See emit_ADD for comments.
+ */
+static boolean
+emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   /* XXX rewrite this function to look more like DP3/DP4 */
+   int ch;
+   int s1_reg, s2_reg, tmp_reg;
+
+   spe_comment(gen->f, -4, "DPH:");
+
+   s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   tmp_reg = get_itemp(gen);
+
+   /* t = x0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* t = y0 * y1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = z0 * z1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+   /* t = w1 + t */
+   spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
+         /* store the dest register itself, as DP3/DP4 do */
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit 3-component vector normalize: each enabled X/Y/Z dst channel gets
+ * src * (1 / sqrt(x*x + y*y + z*z)).  The W channel is never written.
+ */
+static boolean
+emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int len2_reg = get_itemp(gen), scale_reg = get_itemp(gen);
+   int comp_reg[3];
+   int chan;
+
+   spe_comment(gen->f, -4, "NRM3:");
+
+   comp_reg[0] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   comp_reg[1] = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   comp_reg[2] = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+
+   /* len2 = x * x */
+   spe_fm(gen->f, len2_reg, comp_reg[0], comp_reg[0]);
+   /* scale = y * y (used as scratch for now) */
+   spe_fm(gen->f, scale_reg, comp_reg[1], comp_reg[1]);
+   /* len2 = z * z + len2 */
+   spe_fma(gen->f, len2_reg, comp_reg[2], comp_reg[2], len2_reg);
+   /* len2 = len2 + scale */
+   spe_fa(gen->f, len2_reg, len2_reg, scale_reg);
+
+   /* scale = 1.0 / sqrt(len2) */
+   spe_frsqest(gen->f, scale_reg, len2_reg);
+   spe_fi(gen->f, scale_reg, len2_reg, scale_reg);
+
+   /* NOTE: only X, Y and Z are scaled and written; W is omitted */
+   for (chan = 0; chan < 3; chan++) {
+      int out_reg;
+
+      if (!(writemask & (1 << chan)))
+         continue;
+
+      out_reg = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+      /* out = src[chan] * scale */
+      spe_fm(gen->f, out_reg, comp_reg[chan], scale_reg);
+      store_dest_reg(gen, out_reg, chan, &inst->FullDstRegisters[0]);
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+
+/**
+ * Emit cross product:
+ *   dst.x = y0 * z1 - z0 * y1
+ *   dst.y = z0 * x1 - x0 * z1
+ *   dst.z = x0 * y1 - y0 * x1
+ * The W channel is never written.
+ *
+ * All source components are fetched and all three results are computed
+ * into scratch registers before any dest channel is written, so the
+ * result is still correct when the dest register is also a source.
+ * Each result is then moved into the register returned by get_dst_reg()
+ * before store_dest_reg() is called, as the other emitters do.
+ */
+static boolean
+emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int chans[3] = { CHAN_X, CHAN_Y, CHAN_Z };
+   int x0_reg, y0_reg, z0_reg;
+   int x1_reg, y1_reg, z1_reg;
+   int res_reg[3];
+   int i;
+
+   spe_comment(gen->f, -4, "XPD:");
+
+   x0_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   y0_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   z0_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   x1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   y1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   z1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+
+   for (i = 0; i < 3; i++) {
+      res_reg[i] = get_itemp(gen);
+   }
+
+   /* res.x = y0 * z1 - z0 * y1 */
+   spe_fm(gen->f, res_reg[0], z0_reg, y1_reg);
+   spe_fms(gen->f, res_reg[0], y0_reg, z1_reg, res_reg[0]);
+
+   /* res.y = z0 * x1 - x0 * z1 */
+   spe_fm(gen->f, res_reg[1], x0_reg, z1_reg);
+   spe_fms(gen->f, res_reg[1], z0_reg, x1_reg, res_reg[1]);
+
+   /* res.z = x0 * y1 - y0 * x1 */
+   spe_fm(gen->f, res_reg[2], y0_reg, x1_reg);
+   spe_fms(gen->f, res_reg[2], x0_reg, y1_reg, res_reg[2]);
+
+   /* write the enabled X/Y/Z channels */
+   for (i = 0; i < 3; i++) {
+      const int ch = chans[i];
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, res_reg[i]);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
/**
* Emit set-if-greater-than.
@@ -323,6 +992,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
+ spe_comment(gen->f, -4, "SGT:");
+
+ /* for each write-enabled channel: d = (s1 > s2) ? 1.0 : 0.0 */
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -331,26 +1002,776 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* d = (s1 > s2) */
spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
- printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
/* convert d from 0x0/0xffffffff to 0.0/1.0 */
/* d = d & one_reg */
spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-#if DISASSEM
- printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if-less-than: each enabled dst channel becomes 1.0 where
+ * src0 < src1 and 0.0 elsewhere.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int chan;
+
+   spe_comment(gen->f, -4, "SLT:");
+
+   for (chan = 0; chan < 4; chan++) {
+      int lhs_reg, rhs_reg, out_reg;
+
+      if (!(writemask & (1 << chan)))
+         continue;
+
+      lhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      rhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[1]);
+      out_reg = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+
+      /* out = (rhs > lhs), which is (lhs < rhs) */
+      spe_fcgt(gen->f, out_reg, rhs_reg, lhs_reg);
+
+      /* out = out & one_reg: turns 0x0/0xffffffff into 0.0/1.0 */
+      spe_and(gen->f, out_reg, out_reg, get_const_one_reg(gen));
+
+      store_dest_reg(gen, out_reg, chan, &inst->FullDstRegisters[0]);
+      free_itemps(gen);
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if-greater-than-or-equal: each enabled dst channel becomes
+ * 1.0 where src0 >= src1 and 0.0 elsewhere.  See emit_SGT for comments.
+ */
+static boolean
+emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int chan;
+
+   spe_comment(gen->f, -4, "SGE:");
+
+   for (chan = 0; chan < 4; chan++) {
+      int lhs_reg, rhs_reg, out_reg;
+
+      if (!(writemask & (1 << chan)))
+         continue;
+
+      lhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      rhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[1]);
+      out_reg = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+
+      /* out = (rhs > lhs); its complement is (lhs >= rhs) */
+      spe_fcgt(gen->f, out_reg, rhs_reg, lhs_reg);
+
+      /* out = one_reg & ~out: complements the mask and turns it into
+       * 0.0/1.0 in one instruction
+       */
+      spe_andc(gen->f, out_reg, get_const_one_reg(gen), out_reg);
+
+      store_dest_reg(gen, out_reg, chan, &inst->FullDstRegisters[0]);
+      free_itemps(gen);
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if-less-than-or-equal: each enabled dst channel becomes 1.0
+ * where src0 <= src1 and 0.0 elsewhere.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int chan;
+
+   spe_comment(gen->f, -4, "SLE:");
+
+   for (chan = 0; chan < 4; chan++) {
+      int lhs_reg, rhs_reg, out_reg;
+
+      if (!(writemask & (1 << chan)))
+         continue;
+
+      lhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      rhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[1]);
+      out_reg = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+
+      /* out = (lhs > rhs); its complement is (lhs <= rhs) */
+      spe_fcgt(gen->f, out_reg, lhs_reg, rhs_reg);
+
+      /* out = one_reg & ~out: complements the mask and turns it into
+       * 0.0/1.0 in one instruction
+       */
+      spe_andc(gen->f, out_reg, get_const_one_reg(gen), out_reg);
+
+      store_dest_reg(gen, out_reg, chan, &inst->FullDstRegisters[0]);
+      free_itemps(gen);
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if-equal: each enabled dst channel becomes 1.0 where
+ * src0 == src1 and 0.0 elsewhere.  See emit_SGT for comments.
+ */
+static boolean
+emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int chan;
+
+   spe_comment(gen->f, -4, "SEQ:");
+
+   for (chan = 0; chan < 4; chan++) {
+      int lhs_reg, rhs_reg, out_reg;
+
+      if (!(writemask & (1 << chan)))
+         continue;
+
+      lhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      rhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[1]);
+      out_reg = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+
+      /* out = (lhs == rhs) */
+      spe_fceq(gen->f, out_reg, lhs_reg, rhs_reg);
+
+      /* out = out & one_reg: turns 0x0/0xffffffff into 0.0/1.0 */
+      spe_and(gen->f, out_reg, out_reg, get_const_one_reg(gen));
+
+      store_dest_reg(gen, out_reg, chan, &inst->FullDstRegisters[0]);
+      free_itemps(gen);
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if-not-equal: each enabled dst channel becomes 1.0 where
+ * src0 != src1 and 0.0 elsewhere.  See emit_SGT for comments.
+ */
+static boolean
+emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int chan;
+
+   spe_comment(gen->f, -4, "SNE:");
+
+   for (chan = 0; chan < 4; chan++) {
+      int lhs_reg, rhs_reg, out_reg;
+
+      if (!(writemask & (1 << chan)))
+         continue;
+
+      lhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      rhs_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[1]);
+      out_reg = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+
+      /* out = (lhs == rhs), then complemented (nor of a value with
+       * itself) to get (lhs != rhs)
+       */
+      spe_fceq(gen->f, out_reg, lhs_reg, rhs_reg);
+      spe_nor(gen->f, out_reg, out_reg, out_reg);
+
+      /* out = out & one_reg: turns 0x0/0xffffffff into 0.0/1.0 */
+      spe_and(gen->f, out_reg, out_reg, get_const_one_reg(gen));
+
+      store_dest_reg(gen, out_reg, chan, &inst->FullDstRegisters[0]);
+      free_itemps(gen);
+   }
+
+   return true;
+}
+
+/**
+ * Emit compare: d = (s1 < 0) ? s2 : s3 for each write-enabled channel.
+ * See emit_SGT for comments.
+ *
+ * The comparison mask is kept in a scratch register rather than in the
+ * dest register (as emit_MAX does), so s2/s3 are still intact when the
+ * select reads them even if the dest register is also one of them.
+ */
+static boolean
+emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "CMP:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int mask_reg = get_itemp(gen);
+
+         /* mask = {0,0,0,0} */
+         spe_xor(gen->f, mask_reg, mask_reg, mask_reg);
+
+         /* mask = (0 > s1) ? ~0 : 0 */
+         spe_fcgt(gen->f, mask_reg, mask_reg, s1_reg);
+
+         /* d = mask ? s2 : s3 */
+         spe_selb(gen->f, d_reg, s3_reg, s2_reg, mask_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit trunc.  Implemented as a round trip through a signed integer:
+ *   float -> signed int -> float
+ */
+static boolean
+emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int chan;
+
+   spe_comment(gen->f, -4, "TRUNC:");
+
+   for (chan = 0; chan < 4; chan++) {
+      int in_reg, out_reg;
+
+      if (!(writemask & (1 << chan)))
+         continue;
+
+      in_reg = get_src_reg(gen, chan, &inst->FullSrcRegisters[0]);
+      out_reg = get_dst_reg(gen, chan, &inst->FullDstRegisters[0]);
+
+      /* out = (int) in */
+      spe_cflts(gen->f, out_reg, in_reg, 0);
+      /* out = (float) out */
+      spe_csflt(gen->f, out_reg, out_reg, 0);
+
+      store_dest_reg(gen, out_reg, chan, &inst->FullDstRegisters[0]);
+      free_itemps(gen);
+   }
+
+   return true;
+}
+
+/**
+ * Emit floor.
+ *   t = trunc(s)               (float -> signed int -> float)
+ *   d = t - ((t > s) ? 1.0 : 0.0)
+ *
+ * Truncation is already the floor unless it moved the value upwards
+ * (negative, non-integral input), so 1.0 is subtracted only in that
+ * case; negative integral inputs such as -2.0 are left unchanged.
+ * All scratch registers are allocated per channel because
+ * free_itemps() releases every intermediate temp at the end of each
+ * channel.
+ */
+static boolean
+emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FLR:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int trunc_reg = get_itemp(gen);
+         int adj_reg = get_itemp(gen);
+
+         /* trunc = (float) (int) s1 */
+         spe_cflts(gen->f, trunc_reg, s1_reg, 0);
+         spe_csflt(gen->f, trunc_reg, trunc_reg, 0);
+
+         /* adj = (trunc > s1) ? 1.0 : 0.0 */
+         spe_fcgt(gen->f, adj_reg, trunc_reg, s1_reg);
+         spe_and(gen->f, adj_reg, adj_reg, get_const_one_reg(gen));
+
+         /* d = trunc - adj */
+         spe_fs(gen->f, d_reg, trunc_reg, adj_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Compute frac = Input - FLR(Input), where
+ *   t = trunc(s)               (float -> signed int -> float)
+ *   FLR(s) = t - ((t > s) ? 1.0 : 0.0)
+ *
+ * 1.0 is subtracted only when truncation moved the value upwards, so
+ * negative integral inputs such as -2.0 yield a fraction of 0.0.
+ * All scratch registers are allocated per channel because
+ * free_itemps() releases every intermediate temp at the end of each
+ * channel.
+ */
+static boolean
+emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FRC:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int floor_reg = get_itemp(gen);
+         int adj_reg = get_itemp(gen);
+
+         /* floor = (float) (int) s1, i.e. trunc(s1) for now */
+         spe_cflts(gen->f, floor_reg, s1_reg, 0);
+         spe_csflt(gen->f, floor_reg, floor_reg, 0);
+
+         /* adj = (trunc > s1) ? 1.0 : 0.0 */
+         spe_fcgt(gen->f, adj_reg, floor_reg, s1_reg);
+         spe_and(gen->f, adj_reg, adj_reg, get_const_one_reg(gen));
+
+         /* floor = trunc - adj */
+         spe_fs(gen->f, floor_reg, floor_reg, adj_reg);
+
+         /* d = s1 - FLR(s1) */
+         spe_fs(gen->f, d_reg, s1_reg, floor_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+#if 0
+/* Debug helper (compiled out): print the name and address of every
+ * entry in the context's SPU function table.
+ */
+static void
+print_functions(struct cell_context *cell)
+{
+   struct cell_spu_function_info *table = &cell->spu_functions;
+   uint n = 0;
+
+   while (n < table->num) {
+      printf("SPU func %u: %s at %u\n",
+             n, table->names[n], table->addrs[n]);
+      n++;
+   }
+}
#endif
+
+/**
+ * Search the context's SPU function table for 'funcname' and return its
+ * address divided by four.  Asserts when no entry matches (or when the
+ * matching address is zero).
+ */
+static uint
+lookup_function(struct cell_context *cell, const char *funcname)
+{
+   const struct cell_spu_function_info *table = &cell->spu_functions;
+   uint found = 0;
+   uint n = 0;
+
+   /* the whole table is scanned, so the last matching entry wins */
+   while (n < table->num) {
+      if (!strcmp(table->names[n], funcname))
+         found = table->addrs[n];
+      n++;
+   }
+
+   assert(found && "spu function not found");
+   return found / 4; /* discard 2 least significant bits */
+}
+
+
+/**
+ * Emit code to call a SPU function.
+ * Used to implement instructions like SIN/COS/POW/TEX/etc.
+ * If scalar, only the X components of the src regs are used, and the
+ * result is replicated across the dest register's XYZW components.
+ *
+ * \param gen  code generation state
+ * \param inst  the TGSI instruction being translated
+ * \param funcname  name of the SPU function, resolved with lookup_function()
+ * \param num_args  number of source operands to pass (at most 3)
+ * \param scalar  if true, call once with the X components and replicate
+ *                the result; if false, call once per write-enabled channel
+ * \return  always true
+ */
+static boolean
+emit_function_call(struct codegen *gen,
+ const struct tgsi_full_instruction *inst,
+ char *funcname, uint num_args, boolean scalar)
+{
+ const uint addr = lookup_function(gen->cell, funcname);
+ char comment[100];
+ int s_regs[3];
+ int func_called = FALSE;
+ uint a, ch;
+ int retval_reg = -1;
+
+ assert(num_args <= 3);
+
+ snprintf(comment, sizeof(comment), "CALL %s:", funcname);
+ spe_comment(gen->f, -4, comment);
+
+ if (scalar) {
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]);
+ }
+ /* we'll call the function, put the return value in this register,
+ * then replicate it across all write-enabled components in d_reg.
+ */
+ retval_reg = spe_allocate_available_register(gen->f);
+ }
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int d_reg;
+ ubyte usedRegs[SPE_NUM_REGS];
+ uint i, numUsed;
+
+ if (!scalar) {
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+ }
+ }
+
+ d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ if (!scalar || !func_called) {
+ /* for a scalar function, we'll really only call the function once */
+
+ numUsed = spe_get_registers_used(gen->f, usedRegs);
+ assert(numUsed < gen->frame_size / 16 - 2);
+
+ /* save registers to stack: register i of the used list goes to
+ * the 16-byte slot 2 + i of the frame (slots 0 and 1 are skipped) */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ int offset = 2 + i;
+ spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
+
+ /* setup function arguments: argument a is passed in register 3 + a */
+ for (a = 0; a < num_args; a++) {
+ spe_move(gen->f, 3 + a, s_regs[a]);
+ }
+
+ /* branch to function, save return addr */
+ spe_brasl(gen->f, SPE_REG_RA, addr);
+
+ /* save function's return value (it comes back in register 3) */
+ if (scalar)
+ spe_move(gen->f, retval_reg, 3);
+ else
+ spe_move(gen->f, d_reg, 3);
+
+ /* restore registers from stack, except the ones that now hold
+ * the function's result */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ if (reg != d_reg && reg != retval_reg) {
+ int offset = 2 + i;
+ spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
+ }
+
+ func_called = TRUE;
+ }
+
+ if (scalar) {
+ spe_move(gen->f, d_reg, retval_reg);
+ }
+
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
}
+ if (scalar) {
+ spe_release_register(gen->f, retval_reg);
+ }
+
return true;
}
+/**
+ * Emit code for a TEX instruction by calling one of the SPU texture
+ * sampling functions (chosen by texture target).
+ * The four coordinate channels are passed in registers 3..6 and the
+ * sampler unit in register 7; the four result channels are read back
+ * from registers 3..6.
+ * Returns FALSE for an unsupported texture target, TRUE otherwise.
+ */
+static boolean
+emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ const uint target = inst->InstructionExtTexture.Texture;
+ const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+ uint addr;
+ int ch;
+ int coord_regs[4], d_regs[4];
+
+ /* 1D textures share the 2D sampling function */
+ switch (target) {
+ case TGSI_TEXTURE_1D:
+ case TGSI_TEXTURE_2D:
+ addr = lookup_function(gen->cell, "spu_tex_2d");
+ break;
+ case TGSI_TEXTURE_3D:
+ addr = lookup_function(gen->cell, "spu_tex_3d");
+ break;
+ case TGSI_TEXTURE_CUBE:
+ addr = lookup_function(gen->cell, "spu_tex_cube");
+ break;
+ default:
+ ASSERT(0 && "unsupported texture target");
+ return FALSE;
+ }
+
+ assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
+
+ spe_comment(gen->f, -4, "CALL tex:");
+
+ /* get src/dst reg info (for all four channels, regardless of writemask) */
+ for (ch = 0; ch < 4; ch++) {
+ coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+
+ {
+ ubyte usedRegs[SPE_NUM_REGS];
+ uint i, numUsed;
+
+ numUsed = spe_get_registers_used(gen->f, usedRegs);
+ assert(numUsed < gen->frame_size / 16 - 2);
+
+ /* save registers to stack: register i of the used list goes to
+ * the 16-byte slot 2 + i of the frame (slots 0 and 1 are skipped) */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ int offset = 2 + i;
+ spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
+
+ /* setup function arguments (XXX depends on target) */
+ for (i = 0; i < 4; i++) {
+ spe_move(gen->f, 3 + i, coord_regs[i]);
+ }
+ spe_load_uint(gen->f, 7, unit); /* sampler unit */
+
+ /* branch to function, save return addr */
+ spe_brasl(gen->f, SPE_REG_RA, addr);
+
+ /* save function's return values (four pixel's colors) */
+ for (i = 0; i < 4; i++) {
+ spe_move(gen->f, d_regs[i], 3 + i);
+ }
+
+ /* restore registers from stack, except the dest registers that
+ * now hold the function's results */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ if (reg != d_regs[0] &&
+ reg != d_regs[1] &&
+ reg != d_regs[2] &&
+ reg != d_regs[3]) {
+ int offset = 2 + i;
+ spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
+ }
+ }
+
+ /* only the write-enabled channels are stored */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return TRUE;
+}
+
+
+/**
+ * KILL if any of src reg values are less than zero.
+ * The per-instruction result is OR'ed into gen->kill_mask_reg, which is
+ * allocated on first use and returned by the function epilogue.
+ * NOTE(review): the channels tested are selected by the writemask of
+ * FullDstRegisters[0]; if no channel is enabled, kil_reg is still -1
+ * when it is used below - confirm that cannot happen.
+ */
+static boolean
+emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ int s_regs[4], kil_reg = -1, cmp_reg, zero_reg;
+
+ spe_comment(gen->f, -4, "CALL kil:");
+
+ /* zero = {0,0,0,0} */
+ zero_reg = get_itemp(gen);
+ spe_load_uint(gen->f, zero_reg, 0);
+
+ cmp_reg = get_itemp(gen);
+
+ /* get src regs */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ }
+ }
+
+ /* test if any src regs are < 0 */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ if (kil_reg >= 0) {
+ /* cmp = (0 > src) ? ~0 : 0 */
+ spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]);
+ /* kil = kil | cmp */
+ spe_or(gen->f, kil_reg, kil_reg, cmp_reg);
+ }
+ else {
+ /* first enabled channel: allocate kil and compare directly into it */
+ kil_reg = get_itemp(gen);
+ /* kil = (0 > src) ? ~0 : 0 */
+ spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]);
+ }
+ }
+ }
+
+ if (gen->if_nesting) {
+ /* may have been a conditional kil: restrict it with the execution mask */
+ spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg);
+ }
+
+ /* allocate the kill mask reg if needed */
+ if (gen->kill_mask_reg <= 0) {
+ gen->kill_mask_reg = spe_allocate_available_register(gen->f);
+ spe_move(gen->f, gen->kill_mask_reg, kil_reg);
+ }
+ else {
+ /* accumulate with the mask from earlier KILs */
+ spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg);
+ }
+
+ free_itemps(gen);
+
+ return TRUE;
+}
+
+
+
+/**
+ * Emit code for TGSI MAX.
+ *
+ * For every destination channel enabled in the write mask this emits
+ *    dst = (src0 > src1) ? src0 : src1
+ * using an fcgt compare followed by a selb select.  All compares are
+ * emitted first, then all selects, then all destination stores.
+ * See emit_SGT for comments.
+ */
+static boolean
+emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int a_reg[4], b_reg[4], out_reg[4], gt_reg[4];
+   int c;
+
+   spe_comment(gen->f, -4, "MAX:");
+
+   /* fetch operands and grab one scratch register per enabled channel */
+   for (c = 0; c < 4; c++) {
+      if (!(writemask & (1 << c)))
+         continue;
+      a_reg[c] = get_src_reg(gen, c, &inst->FullSrcRegisters[0]);
+      b_reg[c] = get_src_reg(gen, c, &inst->FullSrcRegisters[1]);
+      out_reg[c] = get_dst_reg(gen, c, &inst->FullDstRegisters[0]);
+      gt_reg[c] = get_itemp(gen);
+   }
+
+   /* gt = (a > b) */
+   for (c = 0; c < 4; c++) {
+      if (!(writemask & (1 << c)))
+         continue;
+      spe_fcgt(gen->f, gt_reg[c], a_reg[c], b_reg[c]);
+   }
+
+   /* out = gt ? a : b */
+   for (c = 0; c < 4; c++) {
+      if (!(writemask & (1 << c)))
+         continue;
+      spe_selb(gen->f, out_reg[c], b_reg[c], a_reg[c], gt_reg[c]);
+   }
+
+   /* write the results back to the TGSI destination */
+   for (c = 0; c < 4; c++) {
+      if (!(writemask & (1 << c)))
+         continue;
+      store_dest_reg(gen, out_reg[c], c, &inst->FullDstRegisters[0]);
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit code for TGSI MIN.
+ *
+ * For every destination channel enabled in the write mask this emits
+ *    dst = (src1 > src0) ? src0 : src1
+ * using an fcgt compare followed by a selb select.  All compares are
+ * emitted first, then all selects, then all destination stores.
+ * See emit_SGT for comments.
+ */
+static boolean
+emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const unsigned writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+   int a_reg[4], b_reg[4], out_reg[4], lt_reg[4];
+   int c;
+
+   spe_comment(gen->f, -4, "MIN:");
+
+   /* fetch operands and grab one scratch register per enabled channel */
+   for (c = 0; c < 4; c++) {
+      if (!(writemask & (1 << c)))
+         continue;
+      a_reg[c] = get_src_reg(gen, c, &inst->FullSrcRegisters[0]);
+      b_reg[c] = get_src_reg(gen, c, &inst->FullSrcRegisters[1]);
+      out_reg[c] = get_dst_reg(gen, c, &inst->FullDstRegisters[0]);
+      lt_reg[c] = get_itemp(gen);
+   }
+
+   /* lt = (b > a) */
+   for (c = 0; c < 4; c++) {
+      if (!(writemask & (1 << c)))
+         continue;
+      spe_fcgt(gen->f, lt_reg[c], b_reg[c], a_reg[c]);
+   }
+
+   /* out = lt ? a : b */
+   for (c = 0; c < 4; c++) {
+      if (!(writemask & (1 << c)))
+         continue;
+      spe_selb(gen->f, out_reg[c], b_reg[c], a_reg[c], lt_reg[c]);
+   }
+
+   /* write the results back to the TGSI destination */
+   for (c = 0; c < 4; c++) {
+      if (!(writemask & (1 << c)))
+         continue;
+      store_dest_reg(gen, out_reg[c], c, &inst->FullDstRegisters[0]);
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit code for TGSI IF.
+ *
+ * Only channel 0 of the first source register is used as the predicate.
+ * The execution mask is ANDed with the complement of (predicate == 0),
+ * and the IF nesting counter is incremented.
+ */
+static boolean
+emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int mask_reg = get_exec_mask_reg(gen);
+   int taken_reg, pred_reg;
+
+   spe_comment(gen->f, -4, "IF:");
+
+   /* scratch register first, then the predicate (channel 0 of src 0) */
+   taken_reg = get_itemp(gen);
+   pred_reg = get_src_reg(gen, 0, &inst->FullSrcRegisters[0]);
+
+   /* taken = (pred == 0) */
+   spe_ceqi(gen->f, taken_reg, pred_reg, 0);
+   /* taken = ~taken */
+   spe_complement(gen->f, taken_reg, taken_reg);
+   /* exec_mask &= taken */
+   spe_and(gen->f, mask_reg, mask_reg, taken_reg);
+
+   gen->if_nesting++;
+
+   free_itemps(gen);
+
+   return true;
+}
+
+
+/**
+ * Emit code for TGSI ELSE: the execution mask is complemented in place.
+ *
+ * NOTE(review): the whole mask is inverted, so with nested IFs any lanes
+ * that an outer IF had cleared would become set here - confirm whether
+ * nested conditionals are expected to work.
+ */
+static boolean
+emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int mask_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "ELSE:");
+
+   /* exec_mask = ~exec_mask */
+   spe_complement(gen->f, mask_reg, mask_reg);
+
+   return true;
+}
+
+
+/**
+ * Emit code for TGSI ENDIF: the execution mask is reloaded with all ones
+ * and the IF nesting counter is decremented.
+ */
+static boolean
+emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int mask_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "ENDIF:");
+
+   /* XXX todo: pop execution mask (for now it is simply reset to ~0) */
+   spe_load_int(gen->f, mask_reg, ~0);
+
+   gen->if_nesting--;
+
+   return true;
+}
+
+
+/**
+ * Emit code for TGSI DDX (ddx == true) or DDY (ddx == false).
+ *
+ * For every destination channel enabled in the write mask, word 0 of the
+ * source register (upper-left pixel) is subtracted from word 1
+ * (upper-right pixel, DDX) or word 2 (lower-left pixel, DDY), each having
+ * been splatted with spe_splat_word first.
+ *
+ * The result is passed to store_dest_reg(), as the other instruction
+ * emitters in this file do with the register obtained from get_dst_reg().
+ */
+static boolean
+emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
+             boolean ddx)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         int t1_reg = get_itemp(gen);
+         int t2_reg = get_itemp(gen);
+
+         spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
+         if (ddx) {
+            spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
+         }
+         else {
+            spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
+         }
+         /* d = t2 - t1 */
+         spe_fs(gen->f, d_reg, t2_reg, t1_reg);
+
+         /* commit the result to the TGSI destination before the
+          * temporaries are recycled
+          */
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+
+
/**
* Emit END instruction.
* We just return from the shader function at this point.
@@ -361,11 +1782,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_END(struct codegen *gen)
{
- /* return from function call */
- spe_bi(gen->f, SPE_REG_RA, 0, 0);
-#if DISASSEM
- printf("bi\trRA\n");
-#endif
+ spe_comment(gen->f, -4, "END:");
+ emit_epilogue(gen);
return true;
}
@@ -379,19 +1797,101 @@ emit_instruction(struct codegen *gen,
{
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_SWZ:
return emit_MOV(gen, inst);
case TGSI_OPCODE_MUL:
return emit_MUL(gen, inst);
case TGSI_OPCODE_ADD:
return emit_ADD(gen, inst);
+ case TGSI_OPCODE_SUB:
+ return emit_SUB(gen, inst);
+ case TGSI_OPCODE_MAD:
+ return emit_MAD(gen, inst);
+ case TGSI_OPCODE_LERP:
+ return emit_LERP(gen, inst);
+ case TGSI_OPCODE_DP3:
+ return emit_DP3(gen, inst);
+ case TGSI_OPCODE_DP4:
+ return emit_DP4(gen, inst);
+ case TGSI_OPCODE_DPH:
+ return emit_DPH(gen, inst);
+ case TGSI_OPCODE_NRM:
+ return emit_NRM3(gen, inst);
+ case TGSI_OPCODE_XPD:
+ return emit_XPD(gen, inst);
+ case TGSI_OPCODE_RCP:
+ return emit_RCP(gen, inst);
+ case TGSI_OPCODE_RSQ:
+ return emit_RSQ(gen, inst);
+ case TGSI_OPCODE_ABS:
+ return emit_ABS(gen, inst);
case TGSI_OPCODE_SGT:
return emit_SGT(gen, inst);
+ case TGSI_OPCODE_SLT:
+ return emit_SLT(gen, inst);
+ case TGSI_OPCODE_SGE:
+ return emit_SGE(gen, inst);
+ case TGSI_OPCODE_SLE:
+ return emit_SLE(gen, inst);
+ case TGSI_OPCODE_SEQ:
+ return emit_SEQ(gen, inst);
+ case TGSI_OPCODE_SNE:
+ return emit_SNE(gen, inst);
+ case TGSI_OPCODE_CMP:
+ return emit_CMP(gen, inst);
+ case TGSI_OPCODE_MAX:
+ return emit_MAX(gen, inst);
+ case TGSI_OPCODE_MIN:
+ return emit_MIN(gen, inst);
+ case TGSI_OPCODE_TRUNC:
+ return emit_TRUNC(gen, inst);
+ case TGSI_OPCODE_FLR:
+ return emit_FLR(gen, inst);
+ case TGSI_OPCODE_FRC:
+ return emit_FRC(gen, inst);
case TGSI_OPCODE_END:
return emit_END(gen);
+ case TGSI_OPCODE_COS:
+ return emit_function_call(gen, inst, "spu_cos", 1, TRUE);
+ case TGSI_OPCODE_SIN:
+ return emit_function_call(gen, inst, "spu_sin", 1, TRUE);
+ case TGSI_OPCODE_POW:
+ return emit_function_call(gen, inst, "spu_pow", 2, TRUE);
+ case TGSI_OPCODE_EXPBASE2:
+ return emit_function_call(gen, inst, "spu_exp2", 1, TRUE);
+ case TGSI_OPCODE_LOGBASE2:
+ return emit_function_call(gen, inst, "spu_log2", 1, TRUE);
+ case TGSI_OPCODE_TEX:
+ /* fall-through for now */
+ case TGSI_OPCODE_TXD:
+ /* fall-through for now */
+ case TGSI_OPCODE_TXB:
+ /* fall-through for now */
+ case TGSI_OPCODE_TXL:
+ /* fall-through for now */
+ case TGSI_OPCODE_TXP:
+ return emit_TEX(gen, inst);
+ case TGSI_OPCODE_KIL:
+ return emit_KIL(gen, inst);
+
+ case TGSI_OPCODE_IF:
+ return emit_IF(gen, inst);
+ case TGSI_OPCODE_ELSE:
+ return emit_ELSE(gen, inst);
+ case TGSI_OPCODE_ENDIF:
+ return emit_ENDIF(gen, inst);
+
+ case TGSI_OPCODE_DDX:
+ return emit_DDX_DDY(gen, inst, true);
+ case TGSI_OPCODE_DDY:
+ return emit_DDX_DDY(gen, inst, false);
+
/* XXX lots more cases to do... */
default:
+ fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
+ inst->Instruction.Opcode);
return false;
}
@@ -401,48 +1901,93 @@ emit_instruction(struct codegen *gen,
/**
+ * Emit code for a TGSI immediate value (vector of four floats).
+ * This involves register allocation and initialization.
+ * XXX the initialization should be done by a "prepare" stage, not
+ * per quad execution!
+ */
+static boolean
+emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
+{
+   int ch;
+
+   assert(gen->num_imm < MAX_TEMPS);
+
+   spe_comment(gen->f, -4, "IMMEDIATE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      const float val = immed->u.ImmediateFloat32[ch].Float;
+      int reg;
+
+      if (ch == 0 || val != immed->u.ImmediateFloat32[ch - 1].Float) {
+         /* value differs from the previous channel (or this is the first
+          * channel): it needs a register of its own
+          */
+         reg = spe_allocate_available_register(gen->f);
+         if (reg < 0)
+            return false;   /* out of regs */
+
+         /* emit initializer instruction */
+         spe_load_float(gen->f, reg, val);
+      }
+      else {
+         /* same value as the previous channel: share its register */
+         reg = gen->imm_regs[gen->num_imm][ch - 1];
+      }
+
+      /* update immediate map */
+      gen->imm_regs[gen->num_imm][ch] = reg;
+   }
+
+   gen->num_imm++;
+
+   return true;
+}
+
+
+
+/**
* Emit "code" for a TGSI declaration.
* We only care about TGSI TEMPORARY register declarations at this time.
* For each TGSI TEMPORARY we allocate four SPE registers.
*/
-static void
-emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
+static boolean
+emit_declaration(struct cell_context *cell,
+ struct codegen *gen, const struct tgsi_full_declaration *decl)
{
int i, ch;
switch (decl->Declaration.File) {
case TGSI_FILE_TEMPORARY:
-#if DISASSEM
- printf("Declare temp reg %d .. %d\n",
- decl->DeclarationRange.First,
- decl->DeclarationRange.Last);
-#endif
for (i = decl->DeclarationRange.First;
i <= decl->DeclarationRange.Last;
i++) {
+ assert(i < MAX_TEMPS);
for (ch = 0; ch < 4; ch++) {
gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+ if (gen->temp_regs[i][ch] < 0)
+ return false; /* out of regs */
}
/* XXX if we run out of SPE registers, we need to spill
* to SPU memory. someday...
*/
-#if DISASSEM
- printf(" SPE regs: %d %d %d %d\n",
- gen->temp_regs[i][0],
- gen->temp_regs[i][1],
- gen->temp_regs[i][2],
- gen->temp_regs[i][3]);
-#endif
+ {
+ char buf[100];
+ sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i,
+ gen->temp_regs[i][0], gen->temp_regs[i][1],
+ gen->temp_regs[i][2], gen->temp_regs[i][3]);
+ spe_comment(gen->f, -4, buf);
+ }
}
break;
default:
; /* ignore */
}
+
+ return true;
}
+
/**
* Translate TGSI shader code to SPE instructions. This is done when
* the state tracker gives us a new shader (via pipe->create_fs_state()).
@@ -460,6 +2005,7 @@ cell_gen_fragment_program(struct cell_context *cell,
struct codegen gen;
memset(&gen, 0, sizeof(gen));
+ gen.cell = cell;
gen.f = f;
/* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -472,50 +2018,50 @@ cell_gen_fragment_program(struct cell_context *cell,
spe_allocate_register(f, gen.outputs_reg);
spe_allocate_register(f, gen.constants_reg);
-#if DISASSEM
- printf("Begin %s\n", __FUNCTION__);
- tgsi_dump(tokens, 0);
-#endif
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ spe_print_code(f, true);
+ spe_indent(f, 8);
+ printf("Begin %s\n", __FUNCTION__);
+ tgsi_dump(tokens, 0);
+ }
tgsi_parse_init(&parse, tokens);
+ emit_prologue(&gen);
+
while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
tgsi_parse_token(&parse);
switch (parse.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE:
-#if 0
- if (!note_immediate(&gen, &parse.FullToken.FullImmediate ))
- goto fail;
-#endif
+ if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
+ gen.error = true;
break;
case TGSI_TOKEN_TYPE_DECLARATION:
- emit_declaration(&gen, &parse.FullToken.FullDeclaration);
+ if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
+ gen.error = true;
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
- if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) {
+ if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
gen.error = true;
- }
break;
default:
assert(0);
-
}
}
-
if (gen.error) {
/* terminate the SPE code */
return emit_END(&gen);
}
-#if DISASSEM
- printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
- printf("End %s\n", __FUNCTION__);
-#endif
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+ printf("End %s\n", __FUNCTION__);
+ }
tgsi_parse_free( &parse );
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 06219d4e98..2c64eb1bcc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,12 +54,17 @@
* \param ifragZ_reg register containing integer fragment Z values (in)
* \param ifbZ_reg register containing integer frame buffer Z values (in/out)
* \param zmask_reg register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
*/
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
- struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+ const struct pipe_depth_stencil_alpha_state *dsa,
int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
{
+ /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
+ * quantities. This only makes a difference for 32-bit Z values though.
+ */
ASSERT(dsa->depth.enabled);
switch (dsa->depth.func) {
@@ -79,28 +84,28 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
case PIPE_FUNC_GREATER:
/* zmask = (ifragZ > ref) */
- spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+ spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
/* mask = (mask & zmask) */
spe_and(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_LESS:
/* zmask = (ref > ifragZ) */
- spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+ spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
/* mask = (mask & zmask) */
spe_and(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_LEQUAL:
/* zmask = (ifragZ > ref) */
- spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+ spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
/* mask = (mask & ~zmask) */
spe_andc(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_GEQUAL:
/* zmask = (ref > ifragZ) */
- spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+ spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
/* mask = (mask & ~zmask) */
spe_andc(f, mask_reg, mask_reg, zmask_reg);
break;
@@ -129,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
* framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
*/
spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+ return true;
}
+
+ return false;
}
@@ -229,7 +237,40 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
spe_release_register(f, amask_reg);
}
+/* This pair of functions is used inline to allocate and deallocate
+ * optional constant registers. Once a constant is discovered to be
+ * needed, we will likely need it again, so we don't want to deallocate
+ * it and have to allocate and load it again unnecessarily.
+ */
+/**
+ * Allocate an SPE register into *r unless *is_already_set says that has
+ * been done before; the flag is set once the register is allocated.
+ */
+static inline void
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
+{
+   if (!*is_already_set) {
+      *r = spe_allocate_available_register(f);
+      *is_already_set = true;
+   }
+}
+
+/**
+ * Release register r if *is_already_set indicates it was allocated, and
+ * clear the flag.  Does nothing when the flag is not set.
+ */
+static inline void
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+   if (*is_already_set) {
+      spe_release_register(f, r);
+      *is_already_set = false;
+   }
+}
+
+/**
+ * Make sure *r holds the float constant 'value': on first use the
+ * register is allocated and a load of the constant is emitted; later
+ * calls with the flag already set do nothing.
+ */
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (!*is_already_set) {
+      setup_optional_register(f, is_already_set, r);
+      spe_load_float(f, *r, value);
+   }
+}
+/**
+ * Release a constant register obtained with setup_const_register(), if
+ * it was ever set up, and clear its flag.
+ */
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+   if (*is_already_set) {
+      spe_release_register(f, r);
+      *is_already_set = false;
+   }
+}
/**
* Generate SPE code to implement the given blend mode for a quad of pixels.
@@ -242,6 +283,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
*/
static void
gen_blend(const struct pipe_blend_state *blend,
+ const struct pipe_blend_color *blend_color,
struct spe_function *f,
enum pipe_format color_format,
int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
@@ -262,10 +304,17 @@ gen_blend(const struct pipe_blend_state *blend,
int fbB_reg = spe_allocate_available_register(f);
int fbA_reg = spe_allocate_available_register(f);
- int one_reg = spe_allocate_available_register(f);
int tmp_reg = spe_allocate_available_register(f);
- boolean one_reg_set = false; /* avoid setting one_reg more than once */
+ /* Optional constant registers we might or might not end up using;
+ * if we do use them, make sure we only allocate them once by
+ * keeping a flag on each one.
+ */
+ boolean one_reg_set = false;
+ unsigned int one_reg;
+ boolean constR_reg_set = false, constG_reg_set = false,
+ constB_reg_set = false, constA_reg_set = false;
+ unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
ASSERT(blend->blend_enable);
@@ -346,127 +395,449 @@ gen_blend(const struct pipe_blend_state *blend,
spe_release_register(f, mask_reg);
}
-
/*
- * Compute Src RGB terms
+ * Compute Src RGB terms. We're actually looking for the value
+ * of (the appropriate RGB factors) * (the incoming source RGB color),
+ * because in some cases (like PIPE_BLENDFACTOR_ONE and
+ * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
*/
switch (blend->rgb_src_factor) {
case PIPE_BLENDFACTOR_ONE:
+ /* factors = (1,1,1), so term = (R,G,B) */
spe_move(f, term1R_reg, fragR_reg);
spe_move(f, term1G_reg, fragG_reg);
spe_move(f, term1B_reg, fragB_reg);
break;
case PIPE_BLENDFACTOR_ZERO:
- spe_zero(f, term1R_reg);
- spe_zero(f, term1G_reg);
- spe_zero(f, term1B_reg);
+ /* factors = (0,0,0), so term = (0,0,0) */
+ spe_load_float(f, term1R_reg, 0.0f);
+ spe_load_float(f, term1G_reg, 0.0f);
+ spe_load_float(f, term1B_reg, 0.0f);
break;
case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
break;
case PIPE_BLENDFACTOR_SRC_ALPHA:
+ /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
break;
- /* XXX more cases */
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
+ * or in other words term = (R-R*R, G-G*G, B-B*B)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
+ spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
+ spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
+ spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+ * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+ /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+ * or term = (R-R*A,G-G*A,B-B*A)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
+ spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
+ spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
+ spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
+ * or term = (R-R*Afb,G-G*Afb,B-B*Afb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+ spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+ spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+ spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ /* we'll need the optional constant alpha register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+ spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+ spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+ spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
+ * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
+ break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant alpha register: the factor below
+       * reads constA_reg, so that is the register that has to be
+       * allocated and loaded here (the constant color registers are
+       * not used by this case).
+       */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
+      break;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+ /* We'll need the optional {1,1,1,1} register */
+ setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
+ /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
+ * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+ * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+ * as long as a is positive), but then we'd have to do three
+ * spe_float_min() functions instead of one, so this is simpler.
+ */
+ /* tmp = 1 - Afb */
+ spe_fs(f, tmp_reg, one_reg, fbA_reg);
+ /* tmp = min(A,tmp) */
+ spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
+ /* term = R*tmp */
+ spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+ spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+ spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
default:
ASSERT(0);
}
/*
- * Compute Src Alpha term
+ * Compute Src Alpha term. Like the above, we're looking for
+ * the full term A*factor, not just the factor itself, because
+ * in many cases we can avoid doing unnecessary multiplies.
*/
switch (blend->alpha_src_factor) {
+ case PIPE_BLENDFACTOR_ZERO:
+ /* factor = 0, so term = 0 */
+ spe_load_float(f, term1A_reg, 0.0f);
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
case PIPE_BLENDFACTOR_ONE:
+ /* factor = 1, so term = A */
spe_move(f, term1A_reg, fragA_reg);
break;
+
case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factor = A, so term = A*A */
spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
break;
case PIPE_BLENDFACTOR_SRC_ALPHA:
spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
break;
- /* XXX more cases */
+
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factor = 1-A, so term = A*(1-A) = A-A*A */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factor = Afb, so term = A*Afb */
+ spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = Ac, so term = A*Ac */
+ spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
default:
ASSERT(0);
}
/*
- * Compute Dest RGB terms
+ * Compute Dest RGB term. Like the above, we're looking for
+ * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+ * in many cases we can avoid doing unnecessary multiplies.
*/
switch (blend->rgb_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
+ /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
spe_move(f, term2R_reg, fbR_reg);
spe_move(f, term2G_reg, fbG_reg);
spe_move(f, term2B_reg, fbB_reg);
break;
case PIPE_BLENDFACTOR_ZERO:
- spe_zero(f, term2R_reg);
- spe_zero(f, term2G_reg);
- spe_zero(f, term2B_reg);
+ /* factors = (0,0,0), so term = (0,0,0) */
+ spe_load_float(f, term2R_reg, 0.0f);
+ spe_load_float(f, term2G_reg, 0.0f);
+ spe_load_float(f, term2B_reg, 0.0f);
break;
case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
break;
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
+ * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+ break;
case PIPE_BLENDFACTOR_SRC_ALPHA:
+ /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
break;
case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- /* one = {1.0, 1.0, 1.0, 1.0} */
- if (!one_reg_set) {
- spe_load_float(f, one_reg, 1.0f);
- one_reg_set = true;
- }
- /* tmp = one - fragA */
- spe_fs(f, tmp_reg, one_reg, fragA_reg);
- /* term = fb * tmp */
- spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
- spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
- spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
- break;
- /* XXX more cases */
+ /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+ spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+ spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+ spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+ * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+ spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+ spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+ spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
+ * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+ spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+ spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+ spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ /* we'll need the optional constant alpha register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+ spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+ spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+ spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
+ * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+ break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant alpha register: the factor below
+       * reads constA_reg, so that is the register that has to be
+       * allocated and loaded here (the constant color registers are
+       * not used by this case).
+       */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+ ASSERT(0);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
default:
ASSERT(0);
}
/*
- * Compute Dest Alpha term
+ * Compute Dest Alpha term. Like the above, we're looking for
+ * the full term Afb*factor, not just the factor itself, because
+ * in many cases we can avoid doing unnecessary multiplies.
*/
switch (blend->alpha_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
+ /* factor = 1, so term = Afb */
spe_move(f, term2A_reg, fbA_reg);
break;
case PIPE_BLENDFACTOR_ZERO:
- spe_zero(f, term2A_reg);
+ /* factor = 0, so term = 0 */
+ spe_load_float(f, term2A_reg, 0.0f);
break;
- case PIPE_BLENDFACTOR_SRC_ALPHA:
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factor = A, so term = Afb*A */
spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
break;
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- /* one = {1.0, 1.0, 1.0, 1.0} */
- if (!one_reg_set) {
- spe_load_float(f, one_reg, 1.0f);
- one_reg_set = true;
- }
- /* tmp = one - fragA */
- spe_fs(f, tmp_reg, one_reg, fragA_reg);
- /* termA = fbA * tmp */
- spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factor = Afb, so term = Afb*Afb */
+ spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = Ac, so term = Afb*Ac */
+ spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+ ASSERT(0);
break;
- /* XXX more cases */
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
default:
ASSERT(0);
}
/*
- * Combine Src/Dest RGB terms
+ * Combine Src/Dest RGB terms as per the blend equation.
*/
switch (blend->rgb_func) {
case PIPE_BLEND_ADD:
@@ -479,7 +850,21 @@ gen_blend(const struct pipe_blend_state *blend,
spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
+ spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
+ spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
+ break;
+ case PIPE_BLEND_MIN:
+ spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+ spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+ spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
+ break;
+ case PIPE_BLEND_MAX:
+ spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+ spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+ spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
+ break;
default:
ASSERT(0);
}
@@ -494,7 +879,15 @@ gen_blend(const struct pipe_blend_state *blend,
case PIPE_BLEND_SUBTRACT:
spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
+ break;
+ case PIPE_BLEND_MIN:
+ spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
+ break;
+ case PIPE_BLEND_MAX:
+ spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
+ break;
default:
ASSERT(0);
}
@@ -514,8 +907,14 @@ gen_blend(const struct pipe_blend_state *blend,
spe_release_register(f, fbB_reg);
spe_release_register(f, fbA_reg);
- spe_release_register(f, one_reg);
spe_release_register(f, tmp_reg);
+
+ /* Free any optional registers that actually got used */
+ release_const_register(f, &one_reg_set, one_reg);
+ release_const_register(f, &constR_reg_set, constR_reg);
+ release_const_register(f, &constG_reg_set, constG_reg);
+ release_const_register(f, &constB_reg_set, constB_reg);
+ release_const_register(f, &constA_reg_set, constA_reg);
}
@@ -524,24 +923,74 @@ gen_logicop(const struct pipe_blend_state *blend,
struct spe_function *f,
int fragRGBA_reg, int fbRGBA_reg)
{
- /* XXX to-do */
- /* operate on 32-bit packed pixels, not float colors */
-}
-
-
-static void
-gen_colormask(uint colormask,
- struct spe_function *f,
- int fragRGBA_reg, int fbRGBA_reg)
-{
- /* XXX to-do */
- /* operate on 32-bit packed pixels, not float colors */
+ /* We've got four 32-bit RGBA packed pixels in each of
+ * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+ * reds, greens, blues, and alphas.
+ * */
+ ASSERT(blend->logicop_enable);
+
+ switch(blend->logicop_func) {
+ case PIPE_LOGICOP_CLEAR: /* 0 */
+ spe_zero(f, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_NOR: /* ~(s | d) */
+ spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+ /* andc R, A, B computes R = A & ~B */
+ spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+ spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+ /* andc R, A, B computes R = A & ~B */
+ spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_INVERT: /* ~d */
+ /* Note that (A nor A) == ~(A|A) == ~A */
+ spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_XOR: /* s ^ d */
+ spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_NAND: /* ~(s & d) */
+ spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_AND: /* s & d */
+ spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
+ spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_NOOP: /* d */
+ spe_move(f, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+ /* orc R, A, B computes R = A | ~B */
+ spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_COPY: /* s */
+ break;
+ case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+ /* orc R, A, B computes R = A | ~B */
+ spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_OR: /* s | d */
+ spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_SET: /* 1 */
+ spe_load_int(f, fragRGBA_reg, 0xffffffff);
+ break;
+ default:
+ ASSERT(0);
+ }
}
-
/**
- * Generate code to pack a quad of float colors into a four 32-bit integers.
+ * Generate code to pack a quad of float colors into four 32-bit integers.
*
* \param f SPE function to append instruction onto.
* \param color_format the dest color packing format
@@ -557,13 +1006,16 @@ gen_pack_colors(struct spe_function *f,
int r_reg, int g_reg, int b_reg, int a_reg,
int rgba_reg)
{
+ int rg_reg = spe_allocate_available_register(f);
+ int ba_reg = spe_allocate_available_register(f);
+
/* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
spe_cfltu(f, r_reg, r_reg, 32);
spe_cfltu(f, g_reg, g_reg, 32);
spe_cfltu(f, b_reg, b_reg, 32);
spe_cfltu(f, a_reg, a_reg, 32);
- /* Shift the most significant bytes to least the significant positions.
+ /* Shift the most significant bytes to the least significant positions.
* I.e.: reg = reg >> 24
*/
spe_rotmi(f, r_reg, r_reg, -24);
@@ -595,12 +1047,713 @@ gen_pack_colors(struct spe_function *f,
* OR-ing all those together gives us four packed colors:
* RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
*/
- spe_or(f, rgba_reg, r_reg, g_reg);
- spe_or(f, rgba_reg, rgba_reg, b_reg);
- spe_or(f, rgba_reg, rgba_reg, a_reg);
+ spe_or(f, rg_reg, r_reg, g_reg);
+ spe_or(f, ba_reg, a_reg, b_reg);
+ spe_or(f, rgba_reg, rg_reg, ba_reg);
+
+ spe_release_register(f, rg_reg);
+ spe_release_register(f, ba_reg);
+}
+
+/**
+ * Generate code to apply the color writemask to a quad of packed pixels.
+ *
+ * \param f SPE function to append instructions onto
+ * \param colormask set of PIPE_MASK_R/G/B/A bits naming the writable channels
+ * \param color_format packing format of the pixels in both registers
+ * \param fragRGBA_reg register holding the packed fragment colors (in/out)
+ * \param fbRGBA_reg register holding the packed framebuffer colors (in)
+ */
+static void
+gen_colormask(struct spe_function *f,
+ uint colormask,
+ enum pipe_format color_format,
+ int fragRGBA_reg, int fbRGBA_reg)
+{
+ /* We've got four 32-bit RGBA packed pixels in each of
+ * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+ * reds, greens, blues, and alphas. Further, the pixels
+ * are packed according to the given color format, not
+ * necessarily RGBA...
+ */
+
+ /* Start with every channel masked out, so that an unsupported
+ * color format can never leave these values uninitialized should
+ * the ASSERT() below not stop us. With an all-zero mask the
+ * select at the end keeps the frame buffer colors.
+ */
+ unsigned int r_mask = 0;
+ unsigned int g_mask = 0;
+ unsigned int b_mask = 0;
+ unsigned int a_mask = 0;
+
+ /* Calculate exactly where the bits for any particular color
+ * end up, so we can mask them correctly.
+ */
+ switch(color_format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ /* ARGB */
+ a_mask = 0xff000000;
+ r_mask = 0x00ff0000;
+ g_mask = 0x0000ff00;
+ b_mask = 0x000000ff;
+ break;
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ /* BGRA */
+ b_mask = 0xff000000;
+ g_mask = 0x00ff0000;
+ r_mask = 0x0000ff00;
+ a_mask = 0x000000ff;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ /* For each R, G, B, and A component we're supposed to mask out,
+ * clear its bits. Then our mask operation later will work
+ * as expected.
+ */
+ if (!(colormask & PIPE_MASK_R)) {
+ r_mask = 0;
+ }
+ if (!(colormask & PIPE_MASK_G)) {
+ g_mask = 0;
+ }
+ if (!(colormask & PIPE_MASK_B)) {
+ b_mask = 0;
+ }
+ if (!(colormask & PIPE_MASK_A)) {
+ a_mask = 0;
+ }
+
+ /* Get a temporary register to hold the mask that will be applied to the fragment */
+ int colormask_reg = spe_allocate_available_register(f);
+
+ /* The actual mask we're going to use is an OR of the remaining R, G, B, and A
+ * masks. Load the result value into our temporary register.
+ */
+ spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
+
+ /* Use the mask register to select between the fragment color
+ * values and the frame buffer color values. Wherever the
+ * mask has a 0 bit, the current frame buffer color should override
+ * the fragment color. Wherever the mask has a 1 bit, the
+ * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
+ * instruction will select bits from its first operand rA wherever
+ * the mask bits rM are 0, and from its second operand rB wherever the
+ * mask bits rM are 1. That means that the frame buffer color is the
+ * first operand, and the fragment color the second.
+ */
+ spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
+
+ /* Release the temporary register and we're done */
+ spe_release_register(f, colormask_reg);
}
+/* This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value. As such, we have
+ * access to the Compare Immediate instructions where we don't in
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * There's some added complexity if there's a non-trivial state->value_mask
+ * value; then stencil and reference both must be masked.
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test. The bitmask of valid
+ * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg).
+ *
+ * \param f SPE function to append instructions onto
+ * \param state stencil state supplying func, ref_value and value_mask
+ * \param stencil_max_value a value_mask equal to this is treated as
+ * "no masking needed"
+ * \param fragment_mask_reg register holding the mask of active fragments (in)
+ * \param fbS_reg register holding the current stencil values (in)
+ * \param stencil_pass_reg register receiving the pass mask (out)
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
+ unsigned int stencil_max_value,
+ unsigned int fragment_mask_reg, unsigned int fbS_reg,
+ unsigned int stencil_pass_reg)
+{
+ /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+ * register, taking into account whether each fragment was active to begin with.
+ */
+ switch (state->func) {
+ case PIPE_FUNC_EQUAL:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_NOTEQUAL:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & ~(s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_LESS:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference < s) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_GREATER:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference > s) */
+ /* There's no convenient Compare Less Than Immediate instruction, so
+ * we'll have to do this one the harder way, by loading a register and
+ * comparing directly. Compare Logical Greater Than Word (clgt)
+ * treats its operands as unsigned - no sign extension.
+ */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_GEQUAL:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference >= s)
+ * = fragment_mask & ~(s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_LEQUAL:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference <= s)
+ * = fragment_mask & ~(reference > s) */
+ /* As above, we have to do this by loading a register */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_NEVER:
+ /* stencil_pass = fragment_mask & 0 = 0 */
+ spe_load_uint(f, stencil_pass_reg, 0);
+ break;
+
+ case PIPE_FUNC_ALWAYS:
+ /* stencil_pass = fragment_mask & ~0 = fragment_mask */
+ spe_move(f, stencil_pass_reg, fragment_mask_reg);
+ break;
+
+ default:
+ /* Unknown comparison function: no code is generated, so
+ * stencil_pass_reg would be left unset. Flag it, as the other
+ * switches in this file do.
+ */
+ ASSERT(0);
+ }
+
+ /* The fragments that passed the stencil test are now in stencil_pass_reg.
+ * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
+ */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply. It does not
+ * apply any tests. It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ *
+ * The current stencil values are read from fbS_reg and the new values
+ * are written to newS_reg; the two must be different registers.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+ unsigned int stencil_ref_value, unsigned int stencil_max_value,
+ unsigned int fbS_reg, unsigned int newS_reg)
+{
+ /* The code below assumes that newS_reg and fbS_reg are not the same
+ * register; if they can be, the calculations below will have to use
+ * an additional temporary register. For now, mark the assumption
+ * with an assertion that will fail if they are the same.
+ */
+ ASSERT(fbS_reg != newS_reg);
+
+ /* The code also assumes that stencil_max_value is of the form
+ * 2^n-1 and can therefore be used as a mask for the valid bits in
+ * addition to a maximum. Make sure this is the case as well.
+ * The clever math below exploits the fact that incrementing a
+ * binary number serves to flip all the bits of a number starting at
+ * the LSB and continuing to (and including) the first zero bit
+ * found. That means that a number and its increment will always
+ * have at least one bit in common (the high order bit, if nothing
+ * else) *unless* the number is zero, *or* the number is of a form
+ * consisting of some number of 1s in the low-order bits followed
+ * by nothing but 0s in the high-order bits. The latter case
+ * implies it's of the form 2^n-1.
+ */
+ ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+ switch(stencil_op) {
+ case PIPE_STENCIL_OP_KEEP:
+ /* newS = S */
+ spe_move(f, newS_reg, fbS_reg);
+ break;
+
+ case PIPE_STENCIL_OP_ZERO:
+ /* newS = 0 */
+ spe_zero(f, newS_reg);
+ break;
+
+ case PIPE_STENCIL_OP_REPLACE:
+ /* newS = stencil reference value */
+ spe_load_uint(f, newS_reg, stencil_ref_value);
+ break;
+
+ case PIPE_STENCIL_OP_INCR: {
+ /* newS = (s == max ? max : s + 1) */
+ unsigned int equals_reg = spe_allocate_available_register(f);
+
+ spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+ /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+ spe_ai(f, newS_reg, fbS_reg, 1);
+ /* Select from the current value or the new value based on the equality test:
+ * where s == max the old value (fbS_reg) is kept, elsewhere s + 1 is used.
+ */
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+ spe_release_register(f, equals_reg);
+ break;
+ }
+ case PIPE_STENCIL_OP_DECR: {
+ /* newS = (s == 0 ? 0 : s - 1) */
+ unsigned int equals_reg = spe_allocate_available_register(f);
+
+ spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+ /* Add Word Immediate with a (-1) value works */
+ spe_ai(f, newS_reg, fbS_reg, -1);
+ /* Select from the current value or the new value based on the equality test:
+ * where s == 0 the old value (fbS_reg) is kept, elsewhere s - 1 is used.
+ */
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+ spe_release_register(f, equals_reg);
+ break;
+ }
+ case PIPE_STENCIL_OP_INCR_WRAP:
+ /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+ * do a normal add and mask off the correct bits
+ */
+ spe_ai(f, newS_reg, fbS_reg, 1);
+ spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+ break;
+
+ case PIPE_STENCIL_OP_DECR_WRAP:
+ /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+ spe_ai(f, newS_reg, fbS_reg, -1);
+ spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+ break;
+
+ case PIPE_STENCIL_OP_INVERT:
+ /* newS = ~s. We take advantage of the mask/max value to invert only
+ * the valid bits for the field so we don't have to do an extra "and".
+ */
+ spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+ break;
+
+ default:
+ ASSERT(0);
+ }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values. For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ *
+ * A caller can tell that an output needs no merging by comparing the
+ * returned register number against fbS_reg.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *stencil,
+ const unsigned int depth_enabled,
+ unsigned int fbS_reg,
+ unsigned int *fail_reg, unsigned int *zfail_reg,
+ unsigned int *zpass_reg)
+{
+ unsigned zfail_op;
+
+ /* Stenciling had better be enabled here */
+ ASSERT(stencil->enabled);
+
+ /* If the depth test is not enabled, it is treated as though it always
+ * passes, which means that the zfail_op is not considered - a
+ * failing stencil test triggers the fail_op, and a passing one
+ * triggers the zpass_op
+ *
+ * As an optimization, override calculation of the zfail_op values
+ * if they aren't going to be used. By setting the value of
+ * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
+ * to match the incoming stencil values, and no calculation will
+ * be done.
+ */
+ if (depth_enabled) {
+ zfail_op = stencil->zfail_op;
+ }
+ else {
+ zfail_op = PIPE_STENCIL_OP_KEEP;
+ }
+
+ /* Values for fragments that fail the stencil test. The stencil
+ * maximum is passed as 0xff here and below.
+ */
+ if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
+ *fail_reg = fbS_reg;
+ }
+ else {
+ *fail_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, stencil->fail_op, stencil->ref_value,
+ 0xff, fbS_reg, *fail_reg);
+ }
+
+ /* Check the possibly overridden value, not the structure value.
+ * In the final branch zfail_op is not KEEP, which only happens when
+ * depth is enabled, so there zfail_op and stencil->zfail_op agree.
+ */
+ if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+ *zfail_reg = fbS_reg;
+ }
+ else if (zfail_op == stencil->fail_op) {
+ *zfail_reg = *fail_reg;
+ }
+ else {
+ *zfail_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, stencil->zfail_op, stencil->ref_value,
+ 0xff, fbS_reg, *zfail_reg);
+ }
+
+ if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
+ *zpass_reg = fbS_reg;
+ }
+ else if (stencil->zpass_op == stencil->fail_op) {
+ *zpass_reg = *fail_reg;
+ }
+ else if (stencil->zpass_op == zfail_op) {
+ *zpass_reg = *zfail_reg;
+ }
+ else {
+ *zpass_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, stencil->zpass_op, stencil->ref_value,
+ 0xff, fbS_reg, *zpass_reg);
+ }
+}
+
+/* Generate code for the combined stencil test, optional depth test, and
+ * stencil value update for one quad.
+ *
+ * Note that fbZ_reg may *not* be set on entry, if in fact
+ * the depth test is not enabled. This function must not use
+ * the register if depth is not enabled.
+ *
+ * \param f SPE function to append instructions onto
+ * \param dsa depth/stencil/alpha state to generate code for
+ * \param facing CELL_FACING_FRONT or CELL_FACING_BACK
+ * \param mask_reg register holding the mask of live fragments (in/out)
+ * \param fragZ_reg register holding the fragment Z values
+ * \param fbZ_reg register holding the framebuffer Z values
+ * \param fbS_reg register holding the framebuffer stencil values (in/out)
+ * \return true if the generated code could have modified the stencil
+ * and/or depth values, so that a writeback may be required
+ */
+static boolean
+gen_stencil_depth_test(struct spe_function *f,
+ const struct pipe_depth_stencil_alpha_state *dsa,
+ const uint facing,
+ const int mask_reg, const int fragZ_reg,
+ const int fbZ_reg, const int fbS_reg)
+{
+ /* True if we've generated code that could require writeback to the
+ * depth and/or stencil buffers
+ */
+ boolean modified_buffers = false;
+
+ boolean need_to_calculate_stencil_values;
+ boolean need_to_writemask_stencil_values;
+
+ /* Points into the const dsa state, so it must itself be const */
+ const struct pipe_stencil_state *stencil;
+
+ /* Registers. We may or may not actually allocate these, depending
+ * on whether the state values indicate that we need them.
+ */
+ unsigned int stencil_pass_reg, stencil_fail_reg;
+ unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+ unsigned int stencil_writemask_reg;
+ unsigned int zmask_reg;
+ unsigned int newS_reg;
+
+ /* Stenciling is quite complex: up to six different configurable stencil
+ * operations/calculations can be required (three each for front-facing
+ * and back-facing fragments). Many of those operations will likely
+ * be identical, so there's good reason to try to avoid calculating
+ * the same values more than once (which unfortunately makes the code less
+ * straightforward).
+ *
+ * To make register management easier, we start a new
+ * register set; we can release all the registers in the set at
+ * once, and avoid having to keep track of exactly which registers
+ * we allocate. We can still allocate and free registers as
+ * desired (if we know we no longer need a register), but we don't
+ * have to spend the complexity to track the more difficult variant
+ * register usage scenarios.
+ */
+ spe_comment(f, 0, "Allocating stencil register set");
+ spe_allocate_register_set(f);
+
+ /* The facing we're given is the fragment facing; it doesn't
+ * exactly match the stencil facing. If stencil is enabled,
+ * but two-sided stencil is *not* enabled, we use the same
+ * stencil settings for both front- and back-facing fragments.
+ * We only use the "back-facing" stencil for backfacing fragments
+ * if two-sided stenciling is enabled.
+ */
+ if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
+ stencil = &dsa->stencil[1];
+ }
+ else {
+ stencil = &dsa->stencil[0];
+ }
+
+ /* Calculate the writemask. If the writemask is trivial (either
+ * all 0s, meaning that we don't need to calculate any stencil values
+ * because they're not going to change the stencil anyway, or all 1s,
+ * meaning that we have to calculate the stencil values but do not
+ * need to mask them), we can avoid generating code. Everything
+ * here is decided from the stencil state selected above, which
+ * already accounts for backfacing stencil, if enabled.
+ */
+ if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
+ stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
+ stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
+ need_to_calculate_stencil_values = false;
+ need_to_writemask_stencil_values = false;
+ }
+ else if (stencil->write_mask == 0x0) {
+ /* All changes are writemasked out, so no need to calculate
+ * what those changes might be, and no need to write anything back.
+ */
+ need_to_calculate_stencil_values = false;
+ need_to_writemask_stencil_values = false;
+ }
+ else if (stencil->write_mask == 0xff) {
+ /* Still trivial, but a little less so. We need to write the stencil
+ * values, but we don't need to mask them.
+ */
+ need_to_calculate_stencil_values = true;
+ need_to_writemask_stencil_values = false;
+ }
+ else {
+ /* The general case: calculate, mask, and write */
+ need_to_calculate_stencil_values = true;
+ need_to_writemask_stencil_values = true;
+
+ /* While we're here, generate code that loads the writemask.
+ * The mask comes from the same stencil state that was tested
+ * just above (and that drives the stencil test and operations
+ * below), so the decision and the loaded value always agree.
+ */
+ spe_comment(f, 0, "Computing stencil writemask");
+ stencil_writemask_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, stencil_writemask_reg, stencil->write_mask);
+ }
+
+ /* At least one-sided stenciling must be on. Generate code that
+ * runs the stencil test on the selected stencil state, leaving
+ * the mask of passing stencil bits in stencil_pass_reg. This mask will
+ * be used both to mask the set of active pixels, and also to
+ * determine how the stencil buffer changes.
+ *
+ * The test itself does *not* change the value in mask_reg; that
+ * is done explicitly just below.
+ */
+ spe_comment(f, 0, "Running basic stencil test");
+ stencil_pass_reg = spe_allocate_available_register(f);
+ gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
+
+ /* Generate code that, given the mask of valid fragments and the
+ * mask of valid fragments that passed the stencil test, computes
+ * the mask of valid fragments that failed the stencil test. We
+ * have to do this before we run a depth test (because the
+ * depth test should not be performed on fragments that failed the
+ * stencil test, and because the depth test will update the
+ * mask of valid fragments based on the results of the depth test).
+ */
+ spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
+ stencil_fail_reg = spe_allocate_available_register(f);
+ spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+ /* Now remove the stenciled-out pixels from the valid fragment mask,
+ * so we can later use the valid fragment mask in the depth test.
+ */
+ spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+ /* We may not need to calculate stencil values, if the writemask is off */
+ if (need_to_calculate_stencil_values) {
+ /* Generate code that calculates exactly which stencil values we need,
+ * without calculating the same value twice (say, if two different
+ * stencil ops have the same value). This also takes into
+ * account whether the depth test is enabled (if the depth test is off,
+ * we don't need any of the zfail results, because the depth test always
+ * is considered to pass if it is disabled). Any register value that
+ * does not need to be calculated will come back with the same value
+ * that's in fbS_reg.
+ *
+ * This function will allocate a variable number of registers that
+ * will be released as part of the register set.
+ */
+ spe_comment(f, 0, facing == CELL_FACING_FRONT ? "Computing front-facing stencil values" : "Computing back-facing stencil values");
+ gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg,
+ &stencil_fail_values, &stencil_pass_depth_fail_values,
+ &stencil_pass_depth_pass_values);
+ }
+
+ /* We now have all the stencil values we need. We also need
+ * the results of the depth test to figure out which
+ * stencil values will become the new stencil values. (Even if
+ * we aren't actually calculating stencil values, we need to apply
+ * the depth test if it's enabled.)
+ *
+ * The code generated by gen_depth_test() returns the results of the
+ * test in the given register, but also alters the mask_reg based
+ * on the results of the test.
+ */
+ if (dsa->depth.enabled) {
+ spe_comment(f, 0, "Running stencil depth test");
+ zmask_reg = spe_allocate_available_register(f);
+ modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+ }
+
+ if (need_to_calculate_stencil_values) {
+
+ /* If we need to writemask the stencil values before going into
+ * the stencil buffer, we'll have to use a new register to
+ * hold the new values. If not, we can just keep using the
+ * current register.
+ */
+ if (need_to_writemask_stencil_values) {
+ newS_reg = spe_allocate_available_register(f);
+ spe_comment(f, 0, "Saving current stencil values for writemasking");
+ spe_move(f, newS_reg, fbS_reg);
+ }
+ else {
+ newS_reg = fbS_reg;
+ }
+
+ /* Merge in the selected stencil fail values */
+ if (stencil_fail_values != fbS_reg) {
+ spe_comment(f, 0, "Loading stencil fail values");
+ spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+ modified_buffers = true;
+ }
+
+ /* Same for the stencil pass/depth fail values. If this calculation
+ * is not needed (say, if depth test is off), then the
+ * stencil_pass_depth_fail_values register will be equal to fbS_reg
+ * and we'll skip the calculation.
+ */
+ if (stencil_pass_depth_fail_values != fbS_reg) {
+ /* We don't actually have a stencil pass/depth fail mask yet.
+ * Calculate it here from the stencil passing mask and the
+ * depth passing mask. Note that zmask_reg *must* have been
+ * set above if we're here.
+ */
+ unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+ spe_comment(f, 0, "Loading stencil pass/depth fail values");
+ spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+ spe_release_register(f, stencil_pass_depth_fail_mask);
+ modified_buffers = true;
+ }
+
+ /* Same for the stencil pass/depth pass mask. Note that we
+ * *can* get here with zmask_reg being unset (if the depth
+ * test is off but the stencil test is on). In this case,
+ * we assume the depth test passes, and don't need to mask
+ * the stencil pass mask with the Z mask.
+ */
+ if (stencil_pass_depth_pass_values != fbS_reg) {
+ if (dsa->depth.enabled) {
+ unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+ /* We'll need a separate register */
+ spe_comment(f, 0, "Loading stencil pass/depth pass values");
+ spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+ spe_release_register(f, stencil_pass_depth_pass_mask);
+ }
+ else {
+ /* We can use the same stencil-pass register */
+ spe_comment(f, 0, "Loading stencil pass values");
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+ }
+ modified_buffers = true;
+ }
+
+ /* Almost done. If we need to writemask, do it now, leaving the
+ * results in the fbS_reg register passed in. If we don't need
+ * to writemask, then the results are *already* in the fbS_reg,
+ * so there's nothing more to do.
+ */
+
+ if (need_to_writemask_stencil_values && modified_buffers) {
+ /* The Select Bits instruction makes a fine writemask. Where
+ * the mask is 0, the first (original) values are retained,
+ * effectively masking out changes. Where the mask is 1, the
+ * second (new) values are retained, incorporating changes.
+ */
+ spe_comment(f, 0, "Writemasking new stencil values");
+ spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+ }
+
+ } /* done calculating stencil values */
+
+ /* The stencil and/or depth values have been applied, and the
+ * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+ * We're all done, except that we've allocated a fair number
+ * of registers that we didn't bother tracking. Release all
+ * those registers as part of the register set, and go home.
+ */
+ spe_comment(f, 0, "Releasing stencil register set");
+ spe_release_register_set(f);
+
+ /* Return true if we could have modified the stencil and/or
+ * depth buffers.
+ */
+ return modified_buffers;
+}
/**
@@ -621,14 +1774,19 @@ gen_pack_colors(struct spe_function *f,
* should be much faster.
*
* \param cell the rendering context (in)
- * \param f the generated function (out)
+ * \param facing whether the generated code is for front-facing or
+ * back-facing fragments
+ * \param f the generated function (in/out); on input, the function
+ * must already have been initialized. On exit, the
+ * fragment ops have been appended to whatever
+ * instructions the function already contained.
*/
void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f)
{
- const struct pipe_depth_stencil_alpha_state *dsa =
- &cell->depth_stencil->base;
- const struct pipe_blend_state *blend = &cell->blend->base;
+ const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
+ const struct pipe_blend_state *blend = cell->blend;
+ const struct pipe_blend_color *blend_color = &cell->blend_color;
const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
/* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -643,6 +1801,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
const int fragA_reg = 11; /* vector float */
const int mask_reg = 12; /* vector uint */
+ ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
+
/* offset of quad from start of tile
* XXX assuming 4-byte pixels for color AND Z/stencil!!!!
*/
@@ -651,7 +1811,12 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */
int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */
- spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ spe_print_code(f, true);
+ spe_indent(f, 8);
+ spe_comment(f, -4, facing == CELL_FACING_FRONT ? "Begin front-facing per-fragment ops": "Begin back-facing per-fragment ops");
+ }
+
spe_allocate_register(f, x_reg);
spe_allocate_register(f, y_reg);
spe_allocate_register(f, color_tile_reg);
@@ -674,8 +1839,9 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
ASSERT(TILE_SIZE == 32);
- spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
+ spe_comment(f, 0, "Compute quad offset within tile");
spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
+ spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */
spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */
@@ -684,139 +1850,196 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_release_register(f, y2_reg);
}
-
+ /* Generate the alpha test, if needed. */
if (dsa->alpha.enabled) {
gen_alpha_test(dsa, f, mask_reg, fragA_reg);
}
+ /* If we need the stencil buffers (because one- or two-sided stencil is
+ * enabled) or the depth buffer (because the depth test is enabled),
+ * go grab them. Note that if either one- or two-sided stencil is
+ * enabled, dsa->stencil[0].enabled will be true.
+ */
if (dsa->depth.enabled || dsa->stencil[0].enabled) {
const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
boolean write_depth_stencil;
- int fbZ_reg = spe_allocate_available_register(f); /* Z values */
- int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+ /* We may or may not need to allocate a register for Z or stencil values.
+ * Both register numbers start at 0 so that neither is ever read while
+ * indeterminate (fbS_reg is passed by value to
+ * release_optional_register() even for formats that never set it).
+ */
+ boolean fbS_reg_set = false, fbZ_reg_set = false;
+ unsigned int fbS_reg = 0, fbZ_reg = 0;
+
+ spe_comment(f, 0, "Fetching Z/stencil quad from tile");
/* fetch quad of depth/stencil values from tile at (x,y) */
/* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+ /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
- if (dsa->depth.enabled) {
- /* Extract Z bits from fbZS_reg into fbZ_reg */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- int mask_reg = spe_allocate_available_register(f);
- spe_fsmbi(f, mask_reg, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */
- spe_and(f, fbZ_reg, fbZS_reg, mask_reg); /* fbZ = fbZS & mask */
- spe_release_register(f, mask_reg);
- /* OK, fbZ_reg has four 24-bit Z values now */
- }
- else {
- /* XXX handle other z/stencil formats */
- ASSERT(0);
- }
+ /* From the Z/stencil buffer format, pull out the bits we need for
+ * Z and/or stencil. We'll also convert the incoming fragment Z
+ * value in fragZ_reg from a floating point value in [0.0..1.0] to
+ * an unsigned integer value with the appropriate resolution.
+ * Note that even if depth or stencil is *not* enabled, if it's
+ * present in the buffer, we pull it out and put it back later;
+ * otherwise, we can inadvertently destroy the contents of
+ * buffers we're not supposed to touch (e.g., if the user is
+ * clearing the depth buffer but not the stencil buffer, a
+ * quad of constant depth is drawn over the surface; the stencil
+ * buffer must be maintained).
+ */
+ switch(zs_format) {
- /* Convert fragZ values from float[4] to uint[4] */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM ||
- zs_format == PIPE_FORMAT_Z24S8_UNORM ||
- zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- /* 24-bit Z values */
- int scale_reg = spe_allocate_available_register(f);
+ case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+ case PIPE_FORMAT_X8Z24_UNORM:
+ /* Pull out both Z and stencil */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
- /* scale_reg[0,1,2,3] = float(2^24-1) */
- spe_load_float(f, scale_reg, (float) 0xffffff);
+ /* four 24-bit Z values in the low-order bits */
+ spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
- /* XXX these two instructions might be combined */
- spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 0); /* fragZ = (int) fragZ */
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- spe_release_register(f, scale_reg);
- }
- else {
- /* XXX handle 16-bit Z format */
- ASSERT(0);
- }
- }
+ /* four 8-bit stencil values in the high-order bits */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+ break;
- if (dsa->stencil[0].enabled) {
- /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- /* XXX extract with a shift */
- ASSERT(0);
- }
- else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
- zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- /* XXX extract with a mask */
- ASSERT(0);
- }
- }
+ case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+ case PIPE_FORMAT_Z24X8_UNORM:
+ /* Pull out both Z and stencil */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+ /* four 24-bit Z values in the high-order bits: shift right by 8
+ * so they end up in the low 24 bits of fbZ_reg (the write-back
+ * code shifts fbZ_reg left by 8 again before merging)
+ */
+ spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+ /* four 8-bit stencil values in the low-order bits - keep only those */
+ spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+ break;
+
+ case PIPE_FORMAT_Z32_UNORM:
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 32-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ /* No stencil, so can't do anything there */
+ break;
+ case PIPE_FORMAT_Z16_UNORM:
+ /* XXX Not sure this is correct, but it was here before, so we're
+ * going with it for now
+ */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 16-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+ /* No stencil */
+ break; /* must not fall into the invalid-format assertion below */
+
+ default:
+ ASSERT(0); /* invalid format */
+ }
+ /* If stencil is enabled, use the stencil-specific code
+ * generator to generate both the stencil and depth (if needed)
+ * tests. Otherwise, if only depth is enabled, generate
+ * a quick depth test. The test generators themselves will
+ * report back whether the depth/stencil buffer has to be
+ * written back.
+ */
if (dsa->stencil[0].enabled) {
- /* XXX this may involve depth testing too */
- // gen_stencil_test(dsa, f, ... );
- ASSERT(0);
+ /* This will perform the stencil and depth tests, and update
+ * the mask_reg, fbZ_reg, and fbS_reg as required by the
+ * tests.
+ */
+ ASSERT(fbS_reg_set);
+ spe_comment(f, 0, "Perform stencil test");
+
+ /* Note that fbZ_reg may not be set on entry, if stenciling
+ * is enabled but there's no Z-buffer. The
+ * gen_stencil_depth_test() function must ignore the
+ * fbZ_reg register if depth is not enabled.
+ */
+ write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
}
else if (dsa->depth.enabled) {
int zmask_reg = spe_allocate_available_register(f);
- gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+ ASSERT(fbZ_reg_set);
+ spe_comment(f, 0, "Perform depth test");
+ write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
spe_release_register(f, zmask_reg);
}
-
- /* do we need to write Z and/or Stencil back into framebuffer? */
- write_depth_stencil = (dsa->depth.writemask |
- dsa->stencil[0].write_mask |
- dsa->stencil[1].write_mask);
+ else {
+ write_depth_stencil = false;
+ }
if (write_depth_stencil) {
/* Merge latest Z and Stencil values into fbZS_reg.
* fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
* fbS_reg has four 8-bit Z values in bits [7..0].
*/
+ spe_comment(f, 0, "Store quad's depth/stencil values in tile");
if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
zs_format == PIPE_FORMAT_X8Z24_UNORM) {
spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
}
- else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- /* XXX to do */
- ASSERT(0);
+ else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+ zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+ spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ }
+ else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- /* XXX to do */
- ASSERT(0);
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_S8_UNORM) {
- /* XXX to do */
- ASSERT(0);
+ ASSERT(0); /* XXX to do */
}
else {
- /* bad zs_format */
- ASSERT(0);
+ ASSERT(0); /* bad zs_format */
}
/* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
}
- spe_release_register(f, fbZ_reg);
- spe_release_register(f, fbS_reg);
+ /* Don't need these any more */
+ release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+ release_optional_register(f, &fbS_reg_set, fbS_reg);
}
-
/* Get framebuffer quad/colors. We'll need these for blending,
* color masking, and to obey the quad/pixel mask.
* Load: fbRGBA_reg = memory[color_tile + quad_offset]
* Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
* we could skip this load.
*/
+ spe_comment(f, 0, "Fetch quad colors from tile");
spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
-
if (blend->blend_enable) {
- gen_blend(blend, f, color_format,
+ spe_comment(f, 0, "Perform blending");
+ gen_blend(blend, blend_color, f, color_format,
fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
}
@@ -829,19 +2052,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
int rgba_reg = spe_allocate_available_register(f);
/* Pack four float colors as four 32-bit int colors */
+ spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
gen_pack_colors(f, color_format,
fragR_reg, fragG_reg, fragB_reg, fragA_reg,
rgba_reg);
if (blend->logicop_enable) {
+ spe_comment(f, 0, "Compute logic op");
gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
}
- if (blend->colormask != 0xf) {
- gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
+ if (blend->colormask != PIPE_MASK_RGBA) {
+ spe_comment(f, 0, "Compute color mask");
+ gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
}
-
/* Mix fragment colors with framebuffer colors using the quad/pixel mask:
* if (mask[i])
* rgba[i] = rgba[i];
@@ -853,6 +2078,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
/* Store updated quad in tile:
* memory[color_tile + quad_offset] = rgba_reg;
*/
+ spe_comment(f, 0, "Store quad colors into color tile");
spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
spe_release_register(f, rgba_reg);
@@ -862,9 +2088,14 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */
-
spe_release_register(f, fbRGBA_reg);
spe_release_register(f, fbZS_reg);
spe_release_register(f, quad_offset_reg);
-}
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ char buffer[1024];
+ sprintf(buffer, "End %s-facing per-fragment ops: %d instructions",
+ facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst);
+ spe_comment(f, -4, buffer);
+ }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
index b59de198dc..21b35d1faf 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -31,7 +31,7 @@
extern void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f);
#endif /* CELL_GEN_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 475c6ef0ce..81efd137c7 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -35,9 +35,9 @@
#include "draw/draw_context.h"
#include "cell_context.h"
#include "cell_flush.h"
+#include "cell_pipe_state.h"
#include "cell_state.h"
#include "cell_texture.h"
-#include "cell_state_per_fragment.h"
@@ -45,24 +45,18 @@ static void *
cell_create_blend_state(struct pipe_context *pipe,
const struct pipe_blend_state *blend)
{
- struct cell_blend_state *cb = MALLOC(sizeof(struct cell_blend_state));
-
- (void) memcpy(cb, blend, sizeof(*blend));
-#if 0
- cell_generate_alpha_blend(cb);
-#endif
- return cb;
+ return mem_dup(blend, sizeof(*blend));
}
static void
-cell_bind_blend_state(struct pipe_context *pipe, void *state)
+cell_bind_blend_state(struct pipe_context *pipe, void *blend)
{
struct cell_context *cell = cell_context(pipe);
draw_flush(cell->draw);
- cell->blend = (struct cell_blend_state *) state;
+ cell->blend = (struct pipe_blend_state *) blend;
cell->dirty |= CELL_NEW_BLEND;
}
@@ -70,10 +64,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *state)
static void
cell_delete_blend_state(struct pipe_context *pipe, void *blend)
{
- struct cell_blend_state *cb = (struct cell_blend_state *) blend;
-
- spe_release_func(& cb->code);
- FREE(cb);
+ FREE(blend);
}
@@ -95,41 +86,29 @@ cell_set_blend_color(struct pipe_context *pipe,
static void *
cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
- const struct pipe_depth_stencil_alpha_state *depth_stencil)
+ const struct pipe_depth_stencil_alpha_state *dsa)
{
- struct cell_depth_stencil_alpha_state *cdsa =
- MALLOC(sizeof(struct cell_depth_stencil_alpha_state));
-
- (void) memcpy(cdsa, depth_stencil, sizeof(*depth_stencil));
-#if 0
- cell_generate_depth_stencil_test(cdsa);
-#endif
- return cdsa;
+ return mem_dup(dsa, sizeof(*dsa));
}
static void
cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
- void *depth_stencil)
+ void *dsa)
{
struct cell_context *cell = cell_context(pipe);
draw_flush(cell->draw);
- cell->depth_stencil =
- (struct cell_depth_stencil_alpha_state *) depth_stencil;
+ cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa;
cell->dirty |= CELL_NEW_DEPTH_STENCIL;
}
static void
-cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
+cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa)
{
- struct cell_depth_stencil_alpha_state *cdsa =
- (struct cell_depth_stencil_alpha_state *) depth;
-
- spe_release_func(& cdsa->code);
- FREE(cdsa);
+ FREE(dsa);
}
@@ -191,24 +170,23 @@ cell_set_polygon_stipple( struct pipe_context *pipe,
static void *
cell_create_rasterizer_state(struct pipe_context *pipe,
- const struct pipe_rasterizer_state *setup)
+ const struct pipe_rasterizer_state *rasterizer)
{
- struct pipe_rasterizer_state *state
- = MALLOC(sizeof(struct pipe_rasterizer_state));
- memcpy(state, setup, sizeof(struct pipe_rasterizer_state));
- return state;
+ return mem_dup(rasterizer, sizeof(*rasterizer));
}
static void
-cell_bind_rasterizer_state(struct pipe_context *pipe, void *setup)
+cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast)
{
+ struct pipe_rasterizer_state *rasterizer =
+ (struct pipe_rasterizer_state *) rast;
struct cell_context *cell = cell_context(pipe);
/* pass-through to draw module */
- draw_set_rasterizer_state(cell->draw, setup);
+ draw_set_rasterizer_state(cell->draw, rasterizer);
- cell->rasterizer = (struct pipe_rasterizer_state *)setup;
+ cell->rasterizer = rasterizer;
cell->dirty |= CELL_NEW_RASTERIZER;
}
@@ -235,17 +213,24 @@ cell_bind_sampler_states(struct pipe_context *pipe,
unsigned num, void **samplers)
{
struct cell_context *cell = cell_context(pipe);
+ uint i, changed = 0x0;
assert(num <= CELL_MAX_SAMPLERS);
draw_flush(cell->draw);
- memcpy(cell->sampler, samplers, num * sizeof(void *));
- memset(&cell->sampler[num], 0, (CELL_MAX_SAMPLERS - num) *
- sizeof(void *));
- cell->num_samplers = num;
+ for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+ struct pipe_sampler_state *new_samp = i < num ? samplers[i] : NULL;
+ if (cell->sampler[i] != new_samp) {
+ cell->sampler[i] = new_samp;
+ changed |= (1 << i);
+ }
+ }
- cell->dirty |= CELL_NEW_SAMPLER;
+ if (changed) {
+ cell->dirty |= CELL_NEW_SAMPLER;
+ cell->dirty_samplers |= changed;
+ }
}
@@ -263,27 +248,25 @@ cell_set_sampler_textures(struct pipe_context *pipe,
unsigned num, struct pipe_texture **texture)
{
struct cell_context *cell = cell_context(pipe);
- uint i;
+ uint i, changed = 0x0;
assert(num <= CELL_MAX_SAMPLERS);
- /* Check for no-op */
- if (num == cell->num_textures &&
- !memcmp(cell->texture, texture, num * sizeof(struct pipe_texture *)))
- return;
-
- draw_flush(cell->draw);
-
for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
- struct pipe_texture *tex = i < num ? texture[i] : NULL;
-
- pipe_texture_reference((struct pipe_texture **) &cell->texture[i], tex);
+ struct pipe_texture *new_tex = i < num ? texture[i] : NULL;
+ if ((struct pipe_texture *) cell->texture[i] != new_tex) {
+ pipe_texture_reference((struct pipe_texture **) &cell->texture[i],
+ new_tex);
+ changed |= (1 << i);
+ }
}
- cell->num_textures = num;
- cell_update_texture_mapping(cell);
+ cell->num_textures = num;
- cell->dirty |= CELL_NEW_TEXTURE;
+ if (changed) {
+ cell->dirty |= CELL_NEW_TEXTURE;
+ cell->dirty_textures |= changed;
+ }
}
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e..79cb8df82f 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
struct cell_command_render *render = &cell_global.command[i].render;
render->prim_type = PIPE_PRIM_TRIANGLES;
render->num_verts = cell->prim_buffer.num_verts;
+ render->front_winding = cell->rasterizer->front_winding;
render->vertex_size = cell->vertex_info->size * 4;
render->xmin = cell->prim_buffer.xmin;
render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 139b3719b6..d223557950 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -58,9 +58,9 @@ cell_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
return CELL_MAX_SAMPLERS;
case PIPE_CAP_NPOT_TEXTURES:
- return 0;
+ return 1;
case PIPE_CAP_TWO_SIDED_STENCIL:
- return 0;
+ return 1;
case PIPE_CAP_GLSL:
return 1;
case PIPE_CAP_S3TC:
@@ -68,21 +68,21 @@ cell_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_ANISOTROPIC_FILTER:
return 0;
case PIPE_CAP_POINT_SPRITE:
- return 0;
+ return 1;
case PIPE_CAP_MAX_RENDER_TARGETS:
return 1;
case PIPE_CAP_OCCLUSION_QUERY:
- return 0;
+ return 1;
case PIPE_CAP_TEXTURE_SHADOW_MAP:
- return 0;
+ return 10;
case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
- return 12; /* max 2Kx2K */
+ return CELL_MAX_TEXTURE_LEVELS;
case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
return 8; /* max 128x128x128 */
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
- return 12; /* max 2Kx2K */
+ return CELL_MAX_TEXTURE_LEVELS;
default:
- return 0;
+ return 10;
}
}
@@ -108,7 +108,7 @@ cell_get_paramf(struct pipe_screen *screen, int param)
return 16.0; /* arbitrary */
default:
- return 0;
+ return 10;
}
}
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 9508227e29..28e5e6d706 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -36,6 +36,7 @@
#include "cell_spu.h"
#include "pipe/p_format.h"
#include "pipe/p_state.h"
+#include "util/u_memory.h"
#include "cell/common.h"
@@ -52,6 +53,35 @@ struct cell_global_info cell_global;
/**
+ * Scan /proc/cpuinfo to determine the timebase for the system.
+ * This is used by the SPUs to convert 'decrementer' ticks to seconds.
+ * There may be a better way to get this value...
+ *
+ * \return the number found after the colon on the first line that starts
+ *         with "timebase" and contains a colon, or 0 if /proc/cpuinfo
+ *         cannot be opened or has no such line.
+ */
+static unsigned
+get_timebase(void)
+{
+   FILE *f = fopen("/proc/cpuinfo", "r");
+   unsigned timebase = 0;   /* 0 == "not found"; never return garbage */
+   char line[80];
+
+   assert(f);
+   if (!f) {
+      /* assert() compiles away with NDEBUG; don't dereference NULL */
+      return timebase;
+   }
+
+   /* Loop on fgets() itself: testing feof() first would let a failed
+    * read fall through to the string tests with an unfilled buffer.
+    */
+   while (fgets(line, sizeof(line), f)) {
+      if (strncmp(line, "timebase", 8) == 0) {
+         const char *colon = strchr(line, ':');
+         if (colon) {
+            /* strtoul() skips leading blanks, so start right after the
+             * colon instead of assuming exactly one separator character.
+             */
+            timebase = (unsigned) strtoul(colon + 1, NULL, 10);
+            break;
+         }
+      }
+   }
+   fclose(f);
+
+   return timebase;
+}
+
+
+/**
* Write a 1-word message to the given SPE mailbox.
*/
void
@@ -114,6 +144,7 @@ cell_start_spus(struct cell_context *cell)
{
static boolean one_time_init = FALSE;
uint i, j;
+ uint timebase = get_timebase();
if (one_time_init) {
fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
@@ -123,24 +154,29 @@ cell_start_spus(struct cell_context *cell)
one_time_init = TRUE;
- assert(cell->num_spus <= MAX_SPUS);
-
- ASSERT_ALIGN16(&cell_global.command[0]);
- ASSERT_ALIGN16(&cell_global.command[1]);
+ assert(cell->num_spus <= CELL_MAX_SPUS);
ASSERT_ALIGN16(&cell_global.inits[0]);
ASSERT_ALIGN16(&cell_global.inits[1]);
+ /*
+ * Initialize the global 'inits' structure for each SPU.
+ * A pointer to the init struct will be passed to each SPU.
+ * The SPUs will then each grab their init info with mfc_get().
+ */
for (i = 0; i < cell->num_spus; i++) {
cell_global.inits[i].id = i;
cell_global.inits[i].num_spus = cell->num_spus;
cell_global.inits[i].debug_flags = cell->debug_flags;
- cell_global.inits[i].cmd = &cell_global.command[i];
+ cell_global.inits[i].inv_timebase = 1000.0f / timebase;
+
for (j = 0; j < CELL_NUM_BUFFERS; j++) {
cell_global.inits[i].buffers[j] = cell->buffer[j];
}
cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
+ cell_global.inits[i].spu_functions = &cell->spu_functions;
+
cell_global.spe_contexts[i] = spe_context_create(0, NULL);
if (!cell_global.spe_contexts[i]) {
fprintf(stderr, "spe_context_create() failed\n");
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 137f26612e..c93958a9ed 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -30,14 +30,12 @@
#include <libspe2.h>
-#include <libmisc.h>
+#include <pthread.h>
#include "cell/common.h"
#include "cell_context.h"
-#define MAX_SPUS 8
-
/**
* Global vars, for now anyway.
*/
@@ -46,14 +44,13 @@ struct cell_global_info
/**
* SPU/SPE handles, etc
*/
- spe_context_ptr_t spe_contexts[MAX_SPUS];
- pthread_t spe_threads[MAX_SPUS];
+ spe_context_ptr_t spe_contexts[CELL_MAX_SPUS];
+ pthread_t spe_threads[CELL_MAX_SPUS];
/**
- * Data sent to SPUs
+ * Data sent to SPUs at start-up
*/
- struct cell_init_info inits[MAX_SPUS];
- struct cell_command command[MAX_SPUS];
+ struct cell_init_info inits[CELL_MAX_SPUS];
};
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index a7771a55a3..b193170f9c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -44,8 +44,9 @@
#define CELL_NEW_TEXTURE 0x800
#define CELL_NEW_VERTEX 0x1000
#define CELL_NEW_VS 0x2000
-#define CELL_NEW_CONSTANTS 0x4000
-#define CELL_NEW_VERTEX_INFO 0x8000
+#define CELL_NEW_VS_CONSTANTS 0x4000
+#define CELL_NEW_FS_CONSTANTS 0x8000
+#define CELL_NEW_VERTEX_INFO 0x10000
extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 2da3097983..0a0af81f53 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,18 +25,155 @@
*
**************************************************************************/
+#include "pipe/p_inlines.h"
#include "util/u_memory.h"
#include "cell_context.h"
#include "cell_gen_fragment.h"
#include "cell_state.h"
#include "cell_state_emit.h"
-#include "cell_state_per_fragment.h"
#include "cell_batch.h"
#include "cell_texture.h"
#include "draw/draw_context.h"
#include "draw/draw_private.h"
+/**
+ * Return the cell_command_fragment_ops object that corresponds to the
+ * context's current blend, blend color and depth/stencil/alpha state plus
+ * the color and Z/stencil framebuffer formats.  A matching object is taken
+ * from cell->fragment_ops_cache when one exists; otherwise a new one is
+ * generated, stored in the cache and returned.
+ */
+static struct cell_command_fragment_ops *
+lookup_fragment_ops(struct cell_context *cell)
+{
+   struct cell_fragment_ops_key key;
+   struct cell_command_fragment_ops *ops;
+   struct spe_function front_func, back_func;
+   unsigned int front_bytes, back_bytes, code_bytes;
+   int same_code;
+
+   /* Describe the current state in a zero-filled key. */
+   memset(&key, 0, sizeof(key));
+   key.blend = *cell->blend;
+   key.blend_color = cell->blend_color;
+   key.dsa = *cell->depth_stencil;
+   key.color_format = cell->framebuffer.cbufs[0]
+      ? cell->framebuffer.cbufs[0]->format
+      : PIPE_FORMAT_NONE;
+   key.zs_format = cell->framebuffer.zsbuf
+      ? cell->framebuffer.zsbuf->format
+      : PIPE_FORMAT_NONE;
+
+   /* Cache hit: hand back the object built earlier for this state. */
+   ops = (struct cell_command_fragment_ops *)
+      util_keymap_lookup(cell->fragment_ops_cache, &key);
+   if (ops) {
+      if (0)
+         debug_printf("**** Re-use Fragment Ops\n");
+      return ops;
+   }
+
+   /* Cache miss: build a new fragment ops command from scratch. */
+   if (0)
+      debug_printf("**** Create New Fragment Ops\n");
+
+   /* Set up the two code buffers; a size of 0 asks the SPE code emitter
+    * to use its default size.
+    */
+   spe_init_func(&front_func, 0);
+   spe_init_func(&back_func, 0);
+
+   /* Code is always generated for both facings, even when the two
+    * results turn out to be the same.
+    */
+   cell_gen_fragment_function(cell, CELL_FACING_FRONT, &front_func);
+   cell_gen_fragment_function(cell, CELL_FACING_BACK, &back_func);
+
+   /* Pad each function with lnop instructions until its size is a
+    * multiple of 8 bytes.  That keeps the dual-pipe instruction alignment
+    * correct, and the SPU-side unpacking assumes 8-byte boundaries.
+    */
+   for (front_bytes = spe_code_size(&front_func);
+        front_bytes % 8 != 0;
+        front_bytes = spe_code_size(&front_func)) {
+      spe_lnop(&front_func);
+   }
+   for (back_bytes = spe_code_size(&back_func);
+        back_bytes % 8 != 0;
+        back_bytes = spe_code_size(&back_func)) {
+      spe_lnop(&back_func);
+   }
+
+   /* The code is facing-independent exactly when both functions came out
+    * byte-for-byte identical; then a single copy is enough.
+    */
+   same_code = front_bytes == back_bytes &&
+      memcmp(front_func.store, back_func.store, front_bytes) == 0;
+   code_bytes = same_code ? front_bytes : front_bytes + back_bytes;
+
+   /* The command is a variable-length record: the fixed part is followed
+    * by code_bytes bytes of generated code.
+    */
+   ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, code_bytes);
+   ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+   ops->total_code_size = code_bytes;
+   ops->front_code_index = 0;
+   memcpy(ops->code, front_func.store, front_bytes);
+   if (same_code) {
+      /* Front- and back-facing fragments share the one copy. */
+      ops->back_code_index = ops->front_code_index;
+   }
+   else {
+      /* Append the back-facing code after the front-facing code.  Sizes
+       * are in bytes but the index counts elements of the code store,
+       * hence the division.
+       */
+      ops->back_code_index = front_bytes / sizeof(front_func.store[0]);
+      memcpy(ops->code + ops->back_code_index, back_func.store, back_bytes);
+   }
+
+   /* State copies for the fallback path.  These fields (and the whole
+    * fallback case) are expected to go away eventually.
+    */
+   ops->dsa = *cell->depth_stencil;
+   ops->blend = *cell->blend;
+   ops->blend_color = cell->blend_color;
+
+   /* Remember the new object under its key. */
+   util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
+
+   /* The rtasm buffers are no longer needed once the code is copied. */
+   spe_release_func(&front_func);
+   spe_release_func(&back_func);
+
+   return ops;
+}
+
+
+
static void
emit_state_cmd(struct cell_context *cell, uint cmd,
const void *state, uint state_size)
@@ -73,6 +210,13 @@ cell_emit_state(struct cell_context *cell)
#endif
}
+ if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+ struct cell_command_rasterizer *rast =
+ cell_batch_alloc(cell, sizeof(*rast));
+ rast->opcode = CELL_CMD_STATE_RASTERIZER;
+ rast->rasterizer = *cell->rasterizer;
+ }
+
if (cell->dirty & (CELL_NEW_FS)) {
/* Send new fragment program to SPUs */
struct cell_command_fragment_program *fp
@@ -90,59 +234,81 @@ cell_emit_state(struct cell_context *cell)
}
}
+ if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+ const uint shader = PIPE_SHADER_FRAGMENT;
+ const uint num_const = cell->constants[shader].size / sizeof(float);
+ uint i, j;
+ float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+ uint64_t *ibuf = (uint64_t *) buf;
+ const float *constants = pipe_buffer_map(cell->pipe.screen,
+ cell->constants[shader].buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+ ibuf[1] = num_const;
+ j = 4;
+ for (i = 0; i < num_const; i++) {
+ buf[j++] = constants[i];
+ }
+ pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+ }
+
if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
CELL_NEW_DEPTH_STENCIL |
CELL_NEW_BLEND)) {
- /* XXX we don't want to always do codegen here. We should have
- * a hash/lookup table to cache previous results...
- */
- struct cell_command_fragment_ops *fops
- = cell_batch_alloc(cell, sizeof(*fops));
- struct spe_function spe_code;
-
- /* generate new code */
- cell_gen_fragment_function(cell, &spe_code);
- /* put the new code into the batch buffer */
- fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
- memcpy(&fops->code, spe_code.store,
- SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
- fops->dsa = cell->depth_stencil->base;
- fops->blend = cell->blend->base;
- /* free codegen buffer */
- spe_release_func(&spe_code);
+ struct cell_command_fragment_ops *fops, *fops_cmd;
+ /* Note that cell_command_fragment_ops is a variant-sized record */
+ fops = lookup_fragment_ops(cell);
+ fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+ memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
}
if (cell->dirty & CELL_NEW_SAMPLER) {
uint i;
for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
- if (cell->sampler[i]) {
- struct cell_command_sampler *sampler
- = cell_batch_alloc(cell, sizeof(*sampler));
- sampler->opcode = CELL_CMD_STATE_SAMPLER;
- sampler->unit = i;
- sampler->state = *cell->sampler[i];
+ if (cell->dirty_samplers & (1 << i)) {
+ if (cell->sampler[i]) {
+ struct cell_command_sampler *sampler
+ = cell_batch_alloc(cell, sizeof(*sampler));
+ sampler->opcode = CELL_CMD_STATE_SAMPLER;
+ sampler->unit = i;
+ sampler->state = *cell->sampler[i];
+ }
}
}
+ cell->dirty_samplers = 0x0;
}
if (cell->dirty & CELL_NEW_TEXTURE) {
uint i;
for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
- struct cell_command_texture *texture
- = cell_batch_alloc(cell, sizeof(*texture));
- texture->opcode = CELL_CMD_STATE_TEXTURE;
- texture->unit = i;
- if (cell->texture[i]) {
- texture->start = cell->texture[i]->tiled_data;
- texture->width = cell->texture[i]->base.width[0];
- texture->height = cell->texture[i]->base.height[0];
- }
- else {
- texture->start = NULL;
- texture->width = 1;
- texture->height = 1;
+ if (cell->dirty_textures & (1 << i)) {
+ struct cell_command_texture *texture
+ = cell_batch_alloc(cell, sizeof(*texture));
+ texture->opcode = CELL_CMD_STATE_TEXTURE;
+ texture->unit = i;
+ if (cell->texture[i]) {
+ uint level;
+ for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+ texture->start[level] = cell->texture[i]->tiled_mapped[level];
+ texture->width[level] = cell->texture[i]->base.width[level];
+ texture->height[level] = cell->texture[i]->base.height[level];
+ texture->depth[level] = cell->texture[i]->base.depth[level];
+ }
+ texture->target = cell->texture[i]->base.target;
+ }
+ else {
+ uint level;
+ for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+ texture->start[level] = NULL;
+ texture->width[level] = 0;
+ texture->height[level] = 0;
+ texture->depth[level] = 0;
+ }
+ texture->target = 0;
+ }
}
}
+ cell->dirty_textures = 0x0;
}
if (cell->dirty & CELL_NEW_VERTEX_INFO) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 3a0d066da2..cda39f8d59 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -191,13 +191,18 @@ cell_set_constant_buffer(struct pipe_context *pipe,
assert(shader < PIPE_SHADER_TYPES);
assert(index == 0);
+ draw_flush(cell->draw);
+
/* note: reference counting */
winsys_buffer_reference(ws,
&cell->constants[shader].buffer,
buf->buffer);
cell->constants[shader].size = buf->size;
- cell->dirty |= CELL_NEW_CONSTANTS;
+ if (shader == PIPE_SHADER_VERTEX)
+ cell->dirty |= CELL_NEW_VS_CONSTANTS;
+ else if (shader == PIPE_SHADER_FRAGMENT)
+ cell->dirty |= CELL_NEW_FS_CONSTANTS;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
index 732c64082e..c9203fee08 100644
--- a/src/gallium/drivers/cell/ppu/cell_surface.c
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -27,6 +27,7 @@
#include "util/u_rect.h"
#include "cell_context.h"
+#include "cell_surface.h"
void
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index b6590dfb86..9f83ab8fa4 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -28,6 +28,7 @@
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
* Michel Dänzer <michel@tungstengraphics.com>
+ * Brian Paul
*/
#include "pipe/p_context.h"
@@ -42,30 +43,31 @@
#include "cell_texture.h"
-/* Simple, maximally packed layout.
- */
-static unsigned minify( unsigned d )
+static unsigned
+minify(unsigned d)
{
return MAX2(1, d>>1); /* halve d, but never return less than 1 */
}
static void
-cell_texture_layout(struct cell_texture * spt)
+cell_texture_layout(struct cell_texture *ct)
{
- struct pipe_texture *pt = &spt->base;
+ struct pipe_texture *pt = &ct->base;
unsigned level;
unsigned width = pt->width[0];
unsigned height = pt->height[0];
unsigned depth = pt->depth[0];
- spt->buffer_size = 0;
+ ct->buffer_size = 0;
for ( level = 0 ; level <= pt->last_level ; level++ ) {
unsigned size;
unsigned w_tile, h_tile;
+ assert(level < CELL_MAX_TEXTURE_LEVELS);
+
/* width, height, rounded up to tile size */
w_tile = align(width, TILE_SIZE);
h_tile = align(height, TILE_SIZE);
@@ -76,9 +78,9 @@ cell_texture_layout(struct cell_texture * spt)
pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);
pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);
- spt->stride[level] = pt->nblocksx[level] * pt->block.size;
+ ct->stride[level] = pt->nblocksx[level] * pt->block.size;
- spt->level_offset[level] = spt->buffer_size;
+ ct->level_offset[level] = ct->buffer_size;
size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
if (pt->target == PIPE_TEXTURE_CUBE)
@@ -86,7 +88,7 @@ cell_texture_layout(struct cell_texture * spt)
else
size *= depth;
- spt->buffer_size += size;
+ ct->buffer_size += size;
width = minify(width);
height = minify(height);
@@ -100,26 +102,25 @@ cell_texture_create(struct pipe_screen *screen,
const struct pipe_texture *templat)
{
struct pipe_winsys *ws = screen->winsys;
- struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
- if (!spt)
+ struct cell_texture *ct = CALLOC_STRUCT(cell_texture);
+ if (!ct)
return NULL;
- spt->base = *templat;
- spt->base.refcount = 1;
- spt->base.screen = screen;
+ ct->base = *templat;
+ ct->base.refcount = 1;
+ ct->base.screen = screen;
- cell_texture_layout(spt);
+ cell_texture_layout(ct);
- spt->buffer = ws->buffer_create(ws, 32,
- PIPE_BUFFER_USAGE_PIXEL,
- spt->buffer_size);
+ ct->buffer = ws->buffer_create(ws, 32, PIPE_BUFFER_USAGE_PIXEL,
+ ct->buffer_size);
- if (!spt->buffer) {
- FREE(spt);
+ if (!ct->buffer) {
+ FREE(ct);
return NULL;
}
- return &spt->base;
+ return &ct->base;
}
@@ -135,177 +136,310 @@ cell_texture_release(struct pipe_screen *screen,
__FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
*/
if (--(*pt)->refcount <= 0) {
- struct cell_texture *spt = cell_texture(*pt);
+ /* Delete this texture now.
+ * But note that the underlying pipe_buffer may linger...
+ */
+ struct cell_texture *ct = cell_texture(*pt);
+ uint i;
/*
- DBG("%s deleting %p\n", __FUNCTION__, (void *) spt);
+ DBG("%s deleting %p\n", __FUNCTION__, (void *) ct);
*/
- pipe_buffer_reference(screen, &spt->buffer, NULL);
+ pipe_buffer_reference(screen, &ct->buffer, NULL);
- FREE(spt);
+ for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+ /* Unreference the tiled image buffer.
+ * It may not actually be deleted until a fence is hit.
+ */
+ if (ct->tiled_buffer[i]) {
+ ct->tiled_mapped[i] = NULL;
+ winsys_buffer_reference(screen->winsys, &ct->tiled_buffer[i], NULL);
+ }
+ }
+
+ FREE(ct);
}
*pt = NULL;
}
-#if 0
+
+/**
+ * Convert image from linear layout to tiled layout. 4-byte pixels.
+ *
+ * Parameters:
+ *   w, h        size of the image in pixels
+ *   tile_size   edge length of a square tile, in pixels
+ *   dst         destination; tiles are written one after another, ordered
+ *               by tile row then tile column, each tile taking
+ *               tile_size * tile_size uints stored row by row
+ *   src_stride  length of one source row in bytes
+ *   src         linear source image
+ *
+ * The number of tiles in each direction is rounded up, so tiles on the
+ * right/bottom edge may be only partly covered by the image.  Texels of
+ * such a tile that lie outside the image are not written.
+ */
static void
-cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
- uint face, uint levelsMask)
+twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+ uint src_stride, const uint *src)
{
- /* XXX TO DO: re-tile the texture data ... */
+ const uint tile_size2 = tile_size * tile_size; /* uints per tile */
+ const uint h_t = (h + tile_size - 1) / tile_size; /* tile rows, rounded up */
+ const uint w_t = (w + tile_size - 1) / tile_size; /* tile columns, rounded up */
-}
-#endif
+ uint it, jt; /* tile counters */
+ uint i, j; /* intra-tile counters */
+ src_stride /= 4; /* convert from bytes to pixels */
-static struct pipe_surface *
-cell_get_tex_surface(struct pipe_screen *screen,
- struct pipe_texture *pt,
- unsigned face, unsigned level, unsigned zslice,
- unsigned usage)
-{
- struct pipe_winsys *ws = screen->winsys;
- struct cell_texture *spt = cell_texture(pt);
- struct pipe_surface *ps;
+ /* loop over dest tiles */
+ for (it = 0; it < h_t; it++) {
+ for (jt = 0; jt < w_t; jt++) {
+ /* start of dest tile: */
+ uint *tdst = dst + (it * w_t + jt) * tile_size2;
- ps = ws->surface_alloc(ws);
- if (ps) {
- assert(ps->refcount);
- assert(ps->winsys);
- winsys_buffer_reference(ws, &ps->buffer, spt->buffer);
- ps->format = pt->format;
- ps->block = pt->block;
- ps->width = pt->width[level];
- ps->height = pt->height[level];
- ps->nblocksx = pt->nblocksx[level];
- ps->nblocksy = pt->nblocksy[level];
- ps->stride = spt->stride[level];
- ps->offset = spt->level_offset[level];
- ps->usage = usage;
+ /* compute size of this tile (may be smaller than tile_size) */
+ /* XXX note: a compiler bug was found here. That's why the code
+ * looks as it does.
+ */
+ uint tile_width = w - jt * tile_size; /* pixels left in this row of the image */
+ tile_width = MIN2(tile_width, tile_size);
+ uint tile_height = h - it * tile_size; /* pixel rows left in the image */
+ tile_height = MIN2(tile_height, tile_size);
- /* XXX may need to override usage flags (see sp_texture.c) */
+ /* loop over texels in the tile */
+ for (i = 0; i < tile_height; i++) {
+ for (j = 0; j < tile_width; j++) {
+ const uint srci = it * tile_size + i; /* source row */
+ const uint srcj = jt * tile_size + j; /* source column */
+ ASSERT(srci < h);
+ ASSERT(srcj < w);
+ tdst[i * tile_size + j] = src[srci * src_stride + srcj]; /* row pitch inside a tile is always tile_size */
+ }
+ }
+ }
+ }
+}
- pipe_texture_reference(&ps->texture, pt);
- ps->face = face;
- ps->level = level;
- ps->zslice = zslice;
- if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
- ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
- ps->nblocksy *
- ps->stride;
- }
- else {
- assert(face == 0);
- assert(zslice == 0);
+/**
+ * For Cell. Basically, rearrange the pixels/quads from this layout:
+ * +--+--+--+--+
+ * |p0|p1|p2|p3|....
+ * +--+--+--+--+
+ *
+ * to this layout:
+ * +--+--+
+ * |p0|p1|....
+ * +--+--+
+ * |p2|p3|
+ * +--+--+
+ *
+ * tileIn is read as a sequence of 2x2 quads, four consecutive uints per
+ * quad, with the quads ordered left to right and then top to bottom.
+ * tileOut is written as a TILE_SIZE x TILE_SIZE tile stored row by row.
+ * Both buffers are accessed for TILE_SIZE * TILE_SIZE uints.
+ */
+static void
+twiddle_tile(const uint *tileIn, uint *tileOut)
+{
+ int y, x;
+
+ for (y = 0; y < TILE_SIZE; y+=2) {
+ for (x = 0; x < TILE_SIZE; x+=2) {
+ int k = 4 * (y/2 * TILE_SIZE/2 + x/2); /* index in tileIn of this quad's first texel */
+ tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; /* upper-left */
+ tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; /* upper-right */
+ tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; /* lower-left */
+ tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; /* lower-right */
}
}
- return ps;
}
-
/**
- * Copy tile data from linear layout to tiled layout.
- * XXX this should be rolled into the future surface-creation code.
- * XXX also need "untile" code...
+ * Convert image from tiled layout to linear layout. 4-byte pixels.
+ *
+ * Parameters:
+ *   w, h        size of the image in pixels
+ *   tile_size   edge length of a square tile, in pixels
+ *   dst         linear destination image
+ *   dst_stride  length of one destination row in bytes
+ *   src         tiled source; tiles are read one after another, ordered
+ *               by tile row then tile column, tile_size * tile_size uints
+ *               each
+ *
+ * Each source tile is first passed through twiddle_tile() into a
+ * temporary buffer, and the part of it that lies inside the image is then
+ * copied row by row into dst.
*/
static void
-tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
+untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+ uint dst_stride, const uint *src)
{
const uint tile_size2 = tile_size * tile_size;
- const uint h_t = h / tile_size, w_t = w / tile_size;
-
+ const uint h_t = (h + tile_size - 1) / tile_size; /* tile rows, rounded up */
+ const uint w_t = (w + tile_size - 1) / tile_size; /* tile columns, rounded up */
+ uint *tile_buf; /* scratch space for one rearranged tile */
uint it, jt; /* tile counters */
uint i, j; /* intra-tile counters */
- /* loop over dest tiles */
+ dst_stride /= 4; /* convert from bytes to pixels */
+
+ tile_buf = align_malloc(tile_size * tile_size * 4, 16); /* NOTE(review): the result is used below without a NULL check */
+
+ /* loop over src tiles */
for (it = 0; it < h_t; it++) {
for (jt = 0; jt < w_t; jt++) {
- /* start of dest tile: */
- uint *tdst = dst + (it * w_t + jt) * tile_size2;
+ /* start of src tile: */
+ const uint *tsrc = src + (it * w_t + jt) * tile_size2;
+
+ twiddle_tile(tsrc, tile_buf); /* NOTE(review): twiddle_tile() works on TILE_SIZE x TILE_SIZE tiles, so this presumably needs tile_size == TILE_SIZE -- confirm */
+ tsrc = tile_buf; /* from here on read the rearranged copy */
+
+ /* compute size of this tile (may be smaller than tile_size) */
+ /* XXX note: a compiler bug was found here. That's why the code
+ * looks as it does.
+ */
+ uint tile_width = w - jt * tile_size;
+ tile_width = MIN2(tile_width, tile_size);
+ uint tile_height = h - it * tile_size;
+ tile_height = MIN2(tile_height, tile_size);
+
/* loop over texels in the tile */
- for (i = 0; i < tile_size; i++) {
- for (j = 0; j < tile_size; j++) {
- const uint srci = it * tile_size + i;
- const uint srcj = jt * tile_size + j;
- *tdst++ = src[srci * w + srcj];
+ for (i = 0; i < tile_height; i++) {
+ for (j = 0; j < tile_width; j++) {
+ uint dsti = it * tile_size + i; /* destination row */
+ uint dstj = jt * tile_size + j; /* destination column */
+ ASSERT(dsti < h);
+ ASSERT(dstj < w);
+ dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j];
}
}
}
}
-}
+ align_free(tile_buf);
+}
/**
* Convert linear texture image data to tiled format for SPU usage.
- * XXX recast this in terms of pipe_surfaces (aka texture views).
+ *
+ * The linear data is read from the surface's buffer at surface->offset.
+ * The tiled copy is written into ct->tiled_mapped[surface->level]; the
+ * backing ct->tiled_buffer[] entry for that level is created and mapped
+ * here the first time it is needed.
+ * Only the three 4-byte formats listed in the switch are converted; for
+ * any other format a message is printed and nothing is written.
*/
static void
-cell_tile_texture(struct cell_context *cell,
- struct cell_texture *texture)
+cell_twiddle_texture(struct pipe_screen *screen,
+ struct pipe_surface *surface)
{
- struct pipe_screen *screen = cell->pipe.screen;
- uint face = 0, level = 0, zslice = 0;
- struct pipe_surface *surf;
- const uint w = texture->base.width[0], h = texture->base.height[0];
- const uint *src;
-
- /* temporary restrictions: */
- assert(w >= TILE_SIZE);
- assert(h >= TILE_SIZE);
- assert(w % TILE_SIZE == 0);
- assert(h % TILE_SIZE == 0);
-
- surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
- PIPE_BUFFER_USAGE_CPU_WRITE);
- ASSERT(surf);
-
- src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
-
- if (texture->tiled_data) {
- align_free(texture->tiled_data);
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+ const uint texWidth = ct->base.width[level];
+ const uint texHeight = ct->base.height[level];
+ const uint bufWidth = align(texWidth, TILE_SIZE); /* level size rounded up to whole tiles */
+ const uint bufHeight = align(texHeight, TILE_SIZE);
+ const void *map = pipe_buffer_map(screen, surface->buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
+
+ switch (ct->base.format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ case PIPE_FORMAT_S8Z24_UNORM:
+ {
+ int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+ int offset = bufWidth * bufHeight * 4 * surface->face; /* byte offset of this face inside the level's tiled buffer */
+ uint *dst;
+
+ if (!ct->tiled_buffer[level]) {
+ /* allocate buffer for tiled data now */
+ struct pipe_winsys *ws = screen->winsys;
+ uint bytes = bufWidth * bufHeight * 4 * numFaces; /* room for every face of this level */
+ ct->tiled_buffer[level] = ws->buffer_create(ws, 16,
+ PIPE_BUFFER_USAGE_PIXEL,
+ bytes);
+ /* and map it */
+ ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level],
+ PIPE_BUFFER_USAGE_GPU_READ); /* the mapping is kept in ct for later use; it is not unmapped in this function */
+ }
+ dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset);
+
+ twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+ surface->stride, src);
+ }
+ break;
+ default:
+ printf("Cell: twiddle unsupported texture format %s\n", pf_name(ct->base.format));
+ ;
}
- texture->tiled_data = align_malloc(w * h * 4, 16);
- tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+ pipe_buffer_unmap(screen, surface->buffer); /* pairs with the pipe_buffer_map() at the top */
+}
+
+
+/**
+ * Convert SPU tiled texture image data to linear format for app usage.
+ *
+ * The result is stored in ct->untiled_data[surface->level], which is
+ * allocated here with align_malloc() if it does not exist yet.
+ * Only the three 4-byte formats listed in the switch are converted; for
+ * any other format untiled_data[level] is set to NULL and a message is
+ * printed.
+ */
+static void
+cell_untwiddle_texture(struct pipe_screen *screen,
+ struct pipe_surface *surface)
+{
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+ const uint texWidth = ct->base.width[level];
+ const uint texHeight = ct->base.height[level];
+ const void *map = pipe_buffer_map(screen, surface->buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ const uint *src = (const uint *) ((const ubyte *) map + surface->offset); /* NOTE(review): source is surface->buffer, not ct->tiled_buffer[level] -- confirm this buffer really holds tiled data */
+
+ switch (ct->base.format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ case PIPE_FORMAT_S8Z24_UNORM:
+ {
+ int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+ int offset = surface->stride * texHeight * 4 * surface->face; /* NOTE(review): untwiddle_image_uint() treats this same stride as bytes, so the extra "* 4" looks like over-sizing -- confirm */
+ uint *dst;
+
+ if (!ct->untiled_data[level]) {
+ ct->untiled_data[level] =
+ align_malloc(surface->stride * texHeight * 4 * numFaces, 16); /* result is not checked for NULL before use */
+ }
+
+ dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset);
- pipe_surface_unmap(surf);
+ untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+ surface->stride, src);
+ }
+ break;
+ default:
+ {
+ ct->untiled_data[level] = NULL;
+ printf("Cell: untwiddle unsupported texture format %s\n", pf_name(ct->base.format));
+ }
+ }
- pipe_surface_reference(&surf, NULL);
+ pipe_buffer_unmap(screen, surface->buffer); /* pairs with the pipe_buffer_map() at the top */
+}
-void
-cell_update_texture_mapping(struct cell_context *cell)
+static struct pipe_surface *
+cell_get_tex_surface(struct pipe_screen *screen,
+ struct pipe_texture *pt,
+ unsigned face, unsigned level, unsigned zslice,
+ unsigned usage) /* returns a surface for one face/level/zslice of pt, or the failed result of surface_alloc() */
{
-#if 0
- uint face = 0, level = 0, zslice = 0;
-#endif
- uint i;
-
- for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
- if (cell->texture[i])
- cell_tile_texture(cell, cell->texture[i]);
- }
+ struct pipe_winsys *ws = screen->winsys;
+ struct cell_texture *ct = cell_texture(pt);
+ struct pipe_surface *ps;
-#if 0
- if (cell->tex_surf && cell->tex_map) {
- pipe_surface_unmap(cell->tex_surf);
- cell->tex_map = NULL;
- }
+ ps = ws->surface_alloc(ws);
+ if (ps) {
+ assert(ps->refcount);
+ assert(ps->winsys);
+ winsys_buffer_reference(ws, &ps->buffer, ct->buffer); /* the surface references the texture's own buffer */
+ ps->format = pt->format;
+ ps->block = pt->block;
+ ps->width = pt->width[level];
+ ps->height = pt->height[level];
+ ps->nblocksx = pt->nblocksx[level];
+ ps->nblocksy = pt->nblocksy[level];
+ ps->stride = ct->stride[level];
+ ps->offset = ct->level_offset[level]; /* start of this mipmap level inside the buffer */
+ ps->usage = usage;
- /* XXX free old surface */
+ /* XXX may need to override usage flags (see sp_texture.c) */
- cell->tex_surf = cell_get_tex_surface(&cell->pipe,
- &cell->texture[0]->base,
- face, level, zslice);
+ pipe_texture_reference(&ps->texture, pt);
+ ps->face = face;
+ ps->level = level;
+ ps->zslice = zslice;
- cell->tex_map = pipe_surface_map(cell->tex_surf);
-#endif
+ if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
+ ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
+ ps->nblocksy *
+ ps->stride; /* advance by whole images to the requested cube face or 3D slice */
+ }
+ else {
+ assert(face == 0);
+ assert(zslice == 0);
+ }
+
+ if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+ /* convert from tiled to linear layout */
+ cell_untwiddle_texture(screen, ps);
+ }
+ }
+ return ps;
}
@@ -313,11 +447,17 @@ static void
cell_tex_surface_release(struct pipe_screen *screen,
struct pipe_surface **s)
{
- /* Effectively do the texture_update work here - if texture images
- * needed post-processing to put them into hardware layout, this is
- * where it would happen. For softpipe, nothing to do.
- */
- assert ((*s)->texture);
+ struct cell_texture *ct = cell_texture((*s)->texture);
+ const uint level = (*s)->level;
+
+ if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level]))
+ {
+ align_free(ct->untiled_data[level]);
+ ct->untiled_data[level] = NULL;
+ }
+
+ /* XXX if done rendering to teximage, re-tile */
+
pipe_texture_reference(&(*s)->texture, NULL);
screen->winsys->surface_release(screen->winsys, s);
@@ -325,11 +465,15 @@ cell_tex_surface_release(struct pipe_screen *screen,
static void *
-cell_surface_map( struct pipe_screen *screen,
- struct pipe_surface *surface,
- unsigned flags )
+cell_surface_map(struct pipe_screen *screen,
+ struct pipe_surface *surface,
+ unsigned flags)
{
ubyte *map;
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+
+ assert(ct);
if (flags & ~surface->usage) {
assert(0);
@@ -339,22 +483,15 @@ cell_surface_map( struct pipe_screen *screen,
map = pipe_buffer_map( screen, surface->buffer, flags );
if (map == NULL)
return NULL;
-
- /* May want to different things here depending on read/write nature
- * of the map:
- */
- if (surface->texture &&
- (flags & PIPE_BUFFER_USAGE_CPU_WRITE))
+ else
{
- /* Do something to notify sharing contexts of a texture change.
- * In softpipe, that would mean flushing the texture cache.
- */
-#if 0
- cell_screen(screen)->timestamp++;
-#endif
+ if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) {
+ return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset);
+ }
+ else {
+ return (void *) (map + surface->offset);
+ }
}
-
- return map + surface->offset;
}
@@ -362,17 +499,21 @@ static void
cell_surface_unmap(struct pipe_screen *screen,
struct pipe_surface *surface)
{
- pipe_buffer_unmap( screen, surface->buffer );
-}
+ struct cell_texture *ct = cell_texture(surface->texture);
+ assert(ct);
-void
-cell_init_texture_functions(struct cell_context *cell)
-{
- /*cell->pipe.texture_update = cell_texture_update;*/
+ if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) &&
+ (surface->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) {
+ /* convert from linear to tiled layout */
+ cell_twiddle_texture(screen, surface);
+ }
+
+ pipe_buffer_unmap( screen, surface->buffer );
}
+
void
cell_init_screen_texture_funcs(struct pipe_screen *screen)
{
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 6d37e95ebc..7018b0c9bf 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -40,15 +40,19 @@ struct cell_texture
{
struct pipe_texture base;
- unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
- unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
+ unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS];
+ unsigned long stride[CELL_MAX_TEXTURE_LEVELS];
/* The data is held here:
*/
struct pipe_buffer *buffer;
unsigned long buffer_size;
- void *tiled_data; /* XXX this may be temporary */ /*ALIGN16*/
+ /** Texture data in tiled layout is held here */
+ struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
+ /** Mapped, tiled texture data */
+ void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
+ void *untiled_data[CELL_MAX_TEXTURE_LEVELS];
};
@@ -62,14 +66,6 @@ cell_texture(struct pipe_texture *pt)
extern void
-cell_update_texture_mapping(struct cell_context *cell);
-
-
-extern void
-cell_init_texture_functions(struct cell_context *cell);
-
-
-extern void
cell_init_screen_texture_funcs(struct pipe_screen *screen);
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..65ba51b6bb 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -38,6 +38,7 @@
#include "cell_batch.h"
#include "cell_context.h"
+#include "cell_fence.h"
#include "cell_flush.h"
#include "cell_spu.h"
#include "cell_vbuf.h"
@@ -108,6 +109,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
__FUNCTION__, cvbr->vertex_buf, vertices_used);
*/
+ /* Make sure texture buffers aren't released until we're done rendering
+ * with them.
+ */
+ cell_add_fenced_textures(cell);
+
/* Tell SPUs they can release the vert buf */
if (cvbr->vertex_buf != ~0U) {
struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 566df7f59e..9cba537d9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -73,8 +73,8 @@ emit_matrix_transpose(struct spe_function *p,
int col3;
- spe_lqd(p, shuf_hi, shuf_ptr, 3);
- spe_lqd(p, shuf_lo, shuf_ptr, 4);
+ spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+ spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
spe_shufb(p, t1, row0, row2, shuf_hi);
spe_shufb(p, t2, row0, row2, shuf_lo);
@@ -122,13 +122,13 @@ emit_matrix_transpose(struct spe_function *p,
*/
switch (count) {
case 4:
- spe_stqd(p, col3, dest_ptr, 3);
+ spe_stqd(p, col3, dest_ptr, 3 * 16);
case 3:
- spe_stqd(p, col2, dest_ptr, 2);
+ spe_stqd(p, col2, dest_ptr, 2 * 16);
case 2:
- spe_stqd(p, col1, dest_ptr, 1);
+ spe_stqd(p, col1, dest_ptr, 1 * 16);
case 1:
- spe_stqd(p, col0, dest_ptr, 0);
+ spe_stqd(p, col0, dest_ptr, 0 * 16);
}
@@ -145,6 +145,8 @@ emit_matrix_transpose(struct spe_function *p,
}
+#if 0
+/* This appears to not be used currently */
static void
emit_fetch(struct spe_function *p,
unsigned in_ptr, unsigned *offset,
@@ -166,17 +168,17 @@ emit_fetch(struct spe_function *p,
float scale_signed = 0.0;
float scale_unsigned = 0.0;
- spe_lqd(p, v0, in_ptr, 0 + offset[0]);
- spe_lqd(p, v1, in_ptr, 1 + offset[0]);
- spe_lqd(p, v2, in_ptr, 2 + offset[0]);
- spe_lqd(p, v3, in_ptr, 3 + offset[0]);
+ spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+ spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+ spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+ spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
offset[0] += 4;
switch (bytes) {
case 1:
scale_signed = 1.0f / 127.0f;
scale_unsigned = 1.0f / 255.0f;
- spe_lqd(p, tmp, shuf_ptr, 1);
+ spe_lqd(p, tmp, shuf_ptr, 1 * 16);
spe_shufb(p, v0, v0, v0, tmp);
spe_shufb(p, v1, v1, v1, tmp);
spe_shufb(p, v2, v2, v2, tmp);
@@ -185,7 +187,7 @@ emit_fetch(struct spe_function *p,
case 2:
scale_signed = 1.0f / 32767.0f;
scale_unsigned = 1.0f / 65535.0f;
- spe_lqd(p, tmp, shuf_ptr, 2);
+ spe_lqd(p, tmp, shuf_ptr, 2 * 16);
spe_shufb(p, v0, v0, v0, tmp);
spe_shufb(p, v1, v1, v1, tmp);
spe_shufb(p, v2, v2, v2, tmp);
@@ -241,11 +243,11 @@ emit_fetch(struct spe_function *p,
switch (count) {
case 1:
- spe_stqd(p, float_zero, out_ptr, 1);
+ spe_stqd(p, float_zero, out_ptr, 1 * 16);
case 2:
- spe_stqd(p, float_zero, out_ptr, 2);
+ spe_stqd(p, float_zero, out_ptr, 2 * 16);
case 3:
- spe_stqd(p, float_one, out_ptr, 3);
+ spe_stqd(p, float_one, out_ptr, 3 * 16);
}
if (float_zero != -1) {
@@ -256,6 +258,7 @@ emit_fetch(struct spe_function *p,
spe_release_register(p, float_one);
}
}
+#endif
void cell_update_vertex_fetch(struct draw_context *draw)