summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/cell/spu
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/cell/spu')
-rw-r--r--src/gallium/drivers/cell/spu/.gitignore1
-rw-r--r--src/gallium/drivers/cell/spu/Makefile4
-rw-r--r--src/gallium/drivers/cell/spu/spu_colorpack.h49
-rw-r--r--src/gallium/drivers/cell/spu/spu_command.c815
-rw-r--r--src/gallium/drivers/cell/spu/spu_command.h35
-rw-r--r--src/gallium/drivers/cell/spu/spu_dcache.c4
-rw-r--r--src/gallium/drivers/cell/spu/spu_funcs.c173
-rw-r--r--src/gallium/drivers/cell/spu/spu_funcs.h35
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.c626
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.h87
-rw-r--r--src/gallium/drivers/cell/spu/spu_per_fragment_op.c286
-rw-r--r--src/gallium/drivers/cell/spu/spu_render.c38
-rw-r--r--src/gallium/drivers/cell/spu/spu_texture.c635
-rw-r--r--src/gallium/drivers/cell/spu/spu_texture.h28
-rw-r--r--src/gallium/drivers/cell/spu/spu_tile.c37
-rw-r--r--src/gallium/drivers/cell/spu/spu_tile.h6
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.c573
17 files changed, 2238 insertions, 1194 deletions
diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore
new file mode 100644
index 0000000000..2be9a2d324
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/.gitignore
@@ -0,0 +1 @@
+g3d_spu
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 1ae0dfb8c1..116453b79c 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -16,8 +16,10 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
SOURCES = \
- spu_main.c \
+ spu_command.c \
spu_dcache.c \
+ spu_funcs.c \
+ spu_main.c \
spu_per_fragment_op.c \
spu_render.c \
spu_texture.c \
diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index fd8dc6ded3..d7ce005524 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -31,6 +31,7 @@
#define SPU_COLORPACK_H
+#include <transpose_matrix4x4.h>
#include <spu_intrinsics.h>
@@ -84,10 +85,10 @@ spu_unpack_B8G8R8A8(uint color)
vector unsigned int color_u4 = spu_splats(color);
color_u4 = spu_shuffle(color_u4, color_u4,
((vector unsigned char) {
- 10, 10, 10, 10,
- 5, 5, 5, 5,
+ 2, 2, 2, 2,
+ 1, 1, 1, 1,
0, 0, 0, 0,
- 15, 15, 15, 15}) );
+ 3, 3, 3, 3}) );
return spu_convtf(color_u4, 32);
}
@@ -98,13 +99,47 @@ spu_unpack_A8R8G8B8(uint color)
vector unsigned int color_u4 = spu_splats(color);
color_u4 = spu_shuffle(color_u4, color_u4,
((vector unsigned char) {
- 5, 5, 5, 5,
- 10, 10, 10, 10,
- 15, 15, 15, 15,
+ 1, 1, 1, 1,
+ 2, 2, 2, 2,
+ 3, 3, 3, 3,
0, 0, 0, 0}) );
-
return spu_convtf(color_u4, 32);
}
+/**
+ * \param color_in - array of 32-bit packed ARGB colors
+ * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order
+ */
+static INLINE void
+spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4],
+ vector float color_out[4])
+{
+ vector unsigned int c0;
+
+ c0 = spu_shuffle(color_in[0], color_in[0],
+ ((vector unsigned char) {
+ 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) );
+ color_out[0] = spu_convtf(c0, 32);
+
+ c0 = spu_shuffle(color_in[1], color_in[1],
+ ((vector unsigned char) {
+ 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) );
+ color_out[1] = spu_convtf(c0, 32);
+
+ c0 = spu_shuffle(color_in[2], color_in[2],
+ ((vector unsigned char) {
+ 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) );
+ color_out[2] = spu_convtf(c0, 32);
+
+ c0 = spu_shuffle(color_in[3], color_in[3],
+ ((vector unsigned char) {
+ 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) );
+ color_out[3] = spu_convtf(c0, 32);
+
+ _transpose_matrix4x4(color_out, color_out);
+}
+
+
+
#endif /* SPU_COLORPACK_H */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
new file mode 100644
index 0000000000..8500d19754
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -0,0 +1,815 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * SPU command processing code
+ */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "cell/common.h"
+
+
+struct spu_vs_context draw;
+
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
+ ALIGN16_ATTRIB;
+
+
+
+static INLINE int
+align(int value, int alignment)
+{
+ return (value + alignment - 1) & ~(alignment - 1);
+}
+
+
+
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writting a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+ /* Evidently, using less than a 16-byte status doesn't work reliably */
+ static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE,
+ CELL_BUFFER_STATUS_FREE,
+ CELL_BUFFER_STATUS_FREE,
+ CELL_BUFFER_STATUS_FREE};
+ const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+ uint *dst = spu.init.buffer_status + index;
+
+ ASSERT(buffer < CELL_NUM_BUFFERS);
+
+ mfc_put((void *) &status, /* src in local memory */
+ (unsigned int) dst, /* dst in main memory */
+ sizeof(status), /* size */
+ TAG_MISC, /* tag is unimportant */
+ 0, /* tid */
+ 0 /* rid */);
+}
+
+
+/**
+ * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory.
+ * There's a qword of status per SPU.
+ */
+static void
+cmd_fence(struct cell_command_fence *fence_cmd)
+{
+ static const vector unsigned int status = {CELL_FENCE_SIGNALLED,
+ CELL_FENCE_SIGNALLED,
+ CELL_FENCE_SIGNALLED,
+ CELL_FENCE_SIGNALLED};
+ uint *dst = (uint *) fence_cmd->fence;
+ dst += 4 * spu.init.id; /* main store/memory address, not local store */
+ ASSERT_ALIGN16(dst);
+ mfc_put((void *) &status, /* src in local memory */
+ (unsigned int) dst, /* dst in main memory */
+ sizeof(status), /* size */
+ TAG_FENCE, /* tag */
+ 0, /* tid */
+ 0 /* rid */);
+}
+
+
+static void
+cmd_clear_surface(const struct cell_command_clear_surface *clear)
+{
+ D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+
+ if (clear->surface == 0) {
+ spu.fb.color_clear_value = clear->value;
+ if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+ uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+ (spu.init.id << 20) | (spu.init.id << 28);
+ spu.fb.color_clear_value ^= x;
+ }
+ }
+ else {
+ spu.fb.depth_clear_value = clear->value;
+ }
+
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+ /* Simply set all tiles' status to CLEAR.
+ * When we actually begin rendering into a tile, we'll initialize it to
+ * the clear value. If any tiles go untouched during the frame,
+ * really_clear_tiles() will set them to the clear value.
+ */
+ if (clear->surface == 0) {
+ memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
+ }
+ else {
+ memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
+ }
+
+#else
+
+ /*
+ * This path clears the whole framebuffer to the clear color right now.
+ */
+
+ /*
+ printf("SPU: %s num=%d w=%d h=%d\n",
+ __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
+ */
+
+ /* init a single tile to the clear value */
+ if (clear->surface == 0) {
+ clear_c_tile(&spu.ctile);
+ }
+ else {
+ clear_z_tile(&spu.ztile);
+ }
+
+ /* walk over my tiles, writing the 'clear' tile's data */
+ {
+ const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+ uint i;
+ for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+ uint tx = i % spu.fb.width_tiles;
+ uint ty = i / spu.fb.width_tiles;
+ if (clear->surface == 0)
+ put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+ else
+ put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+ }
+ }
+
+ if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+ wait_on_mask(1 << TAG_SURFACE_CLEAR);
+ }
+
+#endif /* CLEAR_OPT */
+
+ D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n");
+}
+
+
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+ D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf);
+ ASSERT(release->vertex_buf != ~0U);
+ release_buffer(release->vertex_buf);
+}
+
+
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+ D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
+
+ /* Copy state info (for fallback case only - this will eventually
+ * go away when the fallback case goes away)
+ */
+ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+ memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+ memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
+
+ /* Make sure the SPU knows which buffers it's expected to read when
+ * it's told to pull tiles.
+ */
+ spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
+
+ /* If we're forcing the fallback code to be used (for debug purposes),
+ * install that. Otherwise install the incoming SPU code.
+ */
+ if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) {
+ static unsigned int warned = 0;
+ if (!warned) {
+ fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+ warned = 1;
+ }
+ /* The following two lines aren't really necessary if you
+ * know the debug flags won't change during a run, and if you
+ * know that the function pointers are initialized correctly.
+ * We set them here to allow a person to change the debug
+ * flags during a run (from inside a debugger).
+ */
+ spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+ spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+ return;
+ }
+
+ /* Make sure the SPU code buffer is large enough to hold the incoming code.
+ * Note that we *don't* use align_malloc() and align_free(), because
+ * those utility functions are *not* available in SPU code.
+ * */
+ if (spu.fragment_ops_code_size < fops->total_code_size) {
+ if (spu.fragment_ops_code != NULL) {
+ free(spu.fragment_ops_code);
+ }
+ spu.fragment_ops_code_size = fops->total_code_size;
+ spu.fragment_ops_code = malloc(fops->total_code_size);
+ if (spu.fragment_ops_code == NULL) {
+ /* Whoops. */
+ fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size);
+ spu.fragment_ops_code = NULL;
+ spu.fragment_ops_code_size = 0;
+ spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+ spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+ return;
+ }
+ }
+
+ /* Copy the SPU code from the command buffer to the spu buffer */
+ memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size);
+
+ /* Set the pointers for the front-facing and back-facing fragments
+ * to the specified offsets within the code. Note that if the
+ * front-facing and back-facing code are the same, they'll have
+ * the same offset.
+ */
+ spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index];
+ spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index];
+}
+
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+ D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n");
+ /* Copy SPU code from batch buffer to spu buffer */
+ memcpy(spu.fragment_program_code, fp->code,
+ SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+ /* Point function pointer at new code */
+ spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+ const uint num_const = buffer[pos + 1];
+ const float *constants = (const float *) &buffer[pos + 2];
+ uint i;
+
+ D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+
+ /* Expand each float to float[4] for SOA execution */
+ for (i = 0; i < num_const; i++) {
+ D_PRINTF(CELL_DEBUG_CMD, " const[%u] = %f\n", i, constants[i]);
+ spu.constants[i] = spu_splats(constants[i]);
+ }
+
+ /* return new buffer pos (in 8-byte words) */
+ return pos + 2 + num_const / 2;
+}
+
+
+static void
+cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
+{
+ D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n",
+ cmd->width,
+ cmd->height,
+ cmd->color_start,
+ cmd->color_format,
+ cmd->depth_format);
+
+ ASSERT_ALIGN16(cmd->color_start);
+ ASSERT_ALIGN16(cmd->depth_start);
+
+ spu.fb.color_start = cmd->color_start;
+ spu.fb.depth_start = cmd->depth_start;
+ spu.fb.color_format = cmd->color_format;
+ spu.fb.depth_format = cmd->depth_format;
+ spu.fb.width = cmd->width;
+ spu.fb.height = cmd->height;
+ spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
+ spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
+
+ switch (spu.fb.depth_format) {
+ case PIPE_FORMAT_Z32_UNORM:
+ spu.fb.zsize = 4;
+ spu.fb.zscale = (float) 0xffffffffu;
+ break;
+ case PIPE_FORMAT_Z24S8_UNORM:
+ case PIPE_FORMAT_S8Z24_UNORM:
+ case PIPE_FORMAT_Z24X8_UNORM:
+ case PIPE_FORMAT_X8Z24_UNORM:
+ spu.fb.zsize = 4;
+ spu.fb.zscale = (float) 0x00ffffffu;
+ break;
+ case PIPE_FORMAT_Z16_UNORM:
+ spu.fb.zsize = 2;
+ spu.fb.zscale = (float) 0xffffu;
+ break;
+ default:
+ spu.fb.zsize = 0;
+ break;
+ }
+}
+
+
+/**
+ * Tex texture mask_s/t and scale_s/t fields depend on the texture size and
+ * sampler wrap modes.
+ */
+static void
+update_tex_masks(struct spu_texture *texture,
+ const struct pipe_sampler_state *sampler)
+{
+ uint i;
+
+ for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+ int width = texture->level[i].width;
+ int height = texture->level[i].height;
+
+ if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT)
+ texture->level[i].mask_s = spu_splats(width - 1);
+ else
+ texture->level[i].mask_s = spu_splats(~0);
+
+ if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT)
+ texture->level[i].mask_t = spu_splats(height - 1);
+ else
+ texture->level[i].mask_t = spu_splats(~0);
+
+ if (sampler->normalized_coords) {
+ texture->level[i].scale_s = spu_splats((float) width);
+ texture->level[i].scale_t = spu_splats((float) height);
+ }
+ else {
+ texture->level[i].scale_s = spu_splats(1.0f);
+ texture->level[i].scale_t = spu_splats(1.0f);
+ }
+ }
+}
+
+
+static void
+cmd_state_sampler(const struct cell_command_sampler *sampler)
+{
+ uint unit = sampler->unit;
+
+ D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit);
+
+ spu.sampler[unit] = sampler->state;
+
+ switch (spu.sampler[unit].min_img_filter) {
+ case PIPE_TEX_FILTER_LINEAR:
+ spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
+ break;
+ case PIPE_TEX_FILTER_ANISO:
+ /* fall-through, for now */
+ case PIPE_TEX_FILTER_NEAREST:
+ spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ switch (spu.sampler[sampler->unit].mag_img_filter) {
+ case PIPE_TEX_FILTER_LINEAR:
+ spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
+ break;
+ case PIPE_TEX_FILTER_ANISO:
+ /* fall-through, for now */
+ case PIPE_TEX_FILTER_NEAREST:
+ spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ switch (spu.sampler[sampler->unit].min_mip_filter) {
+ case PIPE_TEX_MIPFILTER_NEAREST:
+ case PIPE_TEX_MIPFILTER_LINEAR:
+ spu.sample_texture_2d[unit] = sample_texture_2d_lod;
+ break;
+ case PIPE_TEX_MIPFILTER_NONE:
+ spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit];
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+}
+
+
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+ const uint unit = texture->unit;
+ uint i;
+
+ D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit);
+
+ spu.texture[unit].max_level = 0;
+ spu.texture[unit].target = texture->target;
+
+ for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+ uint width = texture->width[i];
+ uint height = texture->height[i];
+ uint depth = texture->depth[i];
+
+ D_PRINTF(CELL_DEBUG_CMD, " LEVEL %u: at %p size[0] %u x %u\n", i,
+ texture->start[i], texture->width[i], texture->height[i]);
+
+ spu.texture[unit].level[i].start = texture->start[i];
+ spu.texture[unit].level[i].width = width;
+ spu.texture[unit].level[i].height = height;
+ spu.texture[unit].level[i].depth = depth;
+
+ spu.texture[unit].level[i].tiles_per_row =
+ (width + TILE_SIZE - 1) / TILE_SIZE;
+
+ spu.texture[unit].level[i].bytes_per_image =
+ 4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth;
+
+ spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
+ spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
+
+ if (texture->start[i])
+ spu.texture[unit].max_level = i;
+ }
+
+ update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+}
+
+
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+ D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+ ASSERT(vinfo->num_attribs >= 1);
+ ASSERT(vinfo->num_attribs <= 8);
+ memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
+}
+
+
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+ const unsigned attr = vs_info->attr;
+
+ ASSERT(attr < PIPE_MAX_ATTRIBS);
+ draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+ draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+ draw.vertex_fetch.size[attr] = vs_info->size;
+ draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
+ draw.vertex_fetch.dirty = 1;
+}
+
+
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+ mfc_get(attribute_fetch_code_buffer,
+ (unsigned int) code->base, /* src */
+ code->size,
+ TAG_BATCH_BUFFER,
+ 0, /* tid */
+ 0 /* rid */);
+ wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+ draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
+static void
+cmd_finish(void)
+{
+ D_PRINTF(CELL_DEBUG_CMD, "FINISH\n");
+ really_clear_tiles(0);
+ /* wait for all outstanding DMAs to finish */
+ mfc_write_tag_mask(~0);
+ mfc_read_tag_status_all();
+ /* send mbox message to PPU */
+ spu_write_out_mbox(CELL_CMD_FINISH);
+}
+
+
+/**
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
+ * The opcode param encodes the location of the buffer and its size.
+ */
+static void
+cmd_batch(uint opcode)
+{
+ const uint buf = (opcode >> 8) & 0xff;
+ uint size = (opcode >> 16);
+ uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+ const unsigned usize = size / sizeof(buffer[0]);
+ uint pos;
+
+ D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
+ buf, size, spu.init.buffers[buf]);
+
+ ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
+
+ ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+ size = ROUNDUP16(size);
+
+ ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+ mfc_get(buffer, /* dest */
+ (unsigned int) spu.init.buffers[buf], /* src */
+ size,
+ TAG_BATCH_BUFFER,
+ 0, /* tid */
+ 0 /* rid */);
+ wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+ /* Tell PPU we're done copying the buffer to local store */
+ D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf);
+ release_buffer(buf);
+
+ /*
+ * Loop over commands in the batch buffer
+ */
+ for (pos = 0; pos < usize; /* no incr */) {
+ switch (buffer[pos]) {
+ /*
+ * rendering commands
+ */
+ case CELL_CMD_CLEAR_SURFACE:
+ {
+ struct cell_command_clear_surface *clr
+ = (struct cell_command_clear_surface *) &buffer[pos];
+ cmd_clear_surface(clr);
+ pos += sizeof(*clr) / 8;
+ }
+ break;
+ case CELL_CMD_RENDER:
+ {
+ struct cell_command_render *render
+ = (struct cell_command_render *) &buffer[pos];
+ uint pos_incr;
+ cmd_render(render, &pos_incr);
+ pos += pos_incr;
+ }
+ break;
+ /*
+ * state-update commands
+ */
+ case CELL_CMD_STATE_FRAMEBUFFER:
+ {
+ struct cell_command_framebuffer *fb
+ = (struct cell_command_framebuffer *) &buffer[pos];
+ cmd_state_framebuffer(fb);
+ pos += sizeof(*fb) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_FRAGMENT_OPS:
+ {
+ struct cell_command_fragment_ops *fops
+ = (struct cell_command_fragment_ops *) &buffer[pos];
+ cmd_state_fragment_ops(fops);
+ /* This is a variant-sized command */
+ pos += (sizeof(*fops) + fops->total_code_size)/ 8;
+ }
+ break;
+ case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+ {
+ struct cell_command_fragment_program *fp
+ = (struct cell_command_fragment_program *) &buffer[pos];
+ cmd_state_fragment_program(fp);
+ pos += sizeof(*fp) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_FS_CONSTANTS:
+ pos = cmd_state_fs_constants(buffer, pos);
+ break;
+ case CELL_CMD_STATE_RASTERIZER:
+ {
+ struct cell_command_rasterizer *rast =
+ (struct cell_command_rasterizer *) &buffer[pos];
+ spu.rasterizer = rast->rasterizer;
+ pos += sizeof(*rast) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_SAMPLER:
+ {
+ struct cell_command_sampler *sampler
+ = (struct cell_command_sampler *) &buffer[pos];
+ cmd_state_sampler(sampler);
+ pos += sizeof(*sampler) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_TEXTURE:
+ {
+ struct cell_command_texture *texture
+ = (struct cell_command_texture *) &buffer[pos];
+ cmd_state_texture(texture);
+ pos += sizeof(*texture) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_VERTEX_INFO:
+ cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
+ pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+ break;
+ case CELL_CMD_STATE_VIEWPORT:
+ (void) memcpy(& draw.viewport, &buffer[pos+1],
+ sizeof(struct pipe_viewport_state));
+ pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+ break;
+ case CELL_CMD_STATE_UNIFORMS:
+ draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+ pos += 2;
+ break;
+ case CELL_CMD_STATE_VS_ARRAY_INFO:
+ cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+ pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+ break;
+ case CELL_CMD_STATE_BIND_VS:
+#if 0
+ spu_bind_vertex_shader(&draw,
+ (struct cell_shader_info *) &buffer[pos+1]);
+#endif
+ pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+ break;
+ case CELL_CMD_STATE_ATTRIB_FETCH:
+ cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+ &buffer[pos+1]);
+ pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+ break;
+ /*
+ * misc commands
+ */
+ case CELL_CMD_FINISH:
+ cmd_finish();
+ pos += 1;
+ break;
+ case CELL_CMD_FENCE:
+ {
+ struct cell_command_fence *fence_cmd =
+ (struct cell_command_fence *) &buffer[pos];
+ cmd_fence(fence_cmd);
+ pos += sizeof(*fence_cmd) / 8;
+ }
+ break;
+ case CELL_CMD_RELEASE_VERTS:
+ {
+ struct cell_command_release_verts *release
+ = (struct cell_command_release_verts *) &buffer[pos];
+ cmd_release_verts(release);
+ pos += sizeof(*release) / 8;
+ }
+ break;
+ case CELL_CMD_FLUSH_BUFFER_RANGE: {
+ struct cell_buffer_range *br = (struct cell_buffer_range *)
+ &buffer[pos+1];
+
+ spu_dcache_mark_dirty((unsigned) br->base, br->size);
+ pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+ break;
+ }
+ default:
+ printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+ ASSERT(0);
+ break;
+ }
+ }
+
+ D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n");
+}
+
+
+#define PERF 0
+
+
+/**
+ * Main loop for SPEs: Get a command, execute it, repeat.
+ */
+void
+command_loop(void)
+{
+ int exitFlag = 0;
+ uint t0, t1;
+
+ D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
+
+ while (!exitFlag) {
+ unsigned opcode;
+
+ D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
+
+ if (PERF)
+ spu_write_decrementer(~0);
+
+ /* read/wait from mailbox */
+ opcode = (unsigned int) spu_read_in_mbox();
+ D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
+
+ if (PERF)
+ t0 = spu_read_decrementer();
+
+ switch (opcode & CELL_CMD_OPCODE_MASK) {
+ case CELL_CMD_EXIT:
+ D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
+ exitFlag = 1;
+ break;
+ case CELL_CMD_VS_EXECUTE:
+#if 0
+ spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
+ break;
+ case CELL_CMD_BATCH:
+ cmd_batch(opcode);
+ break;
+ default:
+ printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
+ }
+
+ if (PERF) {
+ t1 = spu_read_decrementer();
+ printf("wait mbox time: %gms batch time: %gms\n",
+ (~0u - t0) * spu.init.inv_timebase,
+ (t0 - t1) * spu.init.inv_timebase);
+ }
+ }
+
+ D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
+
+ if (spu.init.debug_flags & CELL_DEBUG_CACHE)
+ spu_dcache_report();
+}
+
+/* Initialize this module; we manage the fragment ops buffer here. */
+void
+spu_command_init(void)
+{
+ /* Install default/fallback fragment processing function.
+ * This will normally be overriden by a code-gen'd function
+ * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
+ */
+ spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+ spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+
+ /* Set up the basic empty buffer for code-gen'ed fragment ops */
+ spu.fragment_ops_code = NULL;
+ spu.fragment_ops_code_size = 0;
+}
+
+void
+spu_command_close(void)
+{
+ /* Deallocate the code-gen buffer for fragment ops, and reset the
+ * fragment ops functions to their initial setting (just to leave
+ * things in a good state).
+ */
+ if (spu.fragment_ops_code != NULL) {
+ free(spu.fragment_ops_code);
+ }
+ spu_command_init();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
new file mode 100644
index 0000000000..83dcdade28
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+extern void
+command_loop(void);
+
+extern void
+spu_command_init(void);
+
+extern void
+spu_command_close(void);
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 167404cdc5..a6d67634fd 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -36,7 +36,9 @@
#define CACHE_SET_TAGID(set) (((set) & 0x03) + TAG_DCACHE0)
#define CACHE_LOG2NNWAY 2
#define CACHE_LOG2NSETS 6
-/*#define CACHE_STATS 1*/
+#ifdef DEBUG
+#define CACHE_STATS 1
+#endif
#include <cache-api.h>
/* Yes folks, this is ugly.
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
new file mode 100644
index 0000000000..ff3d609d25
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -0,0 +1,173 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * SPU functions accessed by shaders.
+ *
+ * Authors: Brian Paul
+ */
+
+
+#include <string.h>
+#include <libmisc.h>
+#include <math.h>
+#include <cos14_v.h>
+#include <sin14_v.h>
+#include <simdmath/exp2f4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_funcs.h"
+#include "spu_texture.h"
+
+
+/** For "return"-ing four vectors */
+struct vec_4x4
+{
+ vector float v[4];
+};
+
+
+static vector float
+spu_cos(vector float x)
+{
+ return _cos14_v(x);
+}
+
+static vector float
+spu_sin(vector float x)
+{
+ return _sin14_v(x);
+}
+
+static vector float
+spu_pow(vector float x, vector float y)
+{
+ return _powf4(x, y);
+}
+
+static vector float
+spu_exp2(vector float x)
+{
+ return _exp2f4(x);
+}
+
+static vector float
+spu_log2(vector float x)
+{
+ return _log2f4(x);
+}
+
+
+static struct vec_4x4
+spu_tex_2d(vector float s, vector float t, vector float r, vector float q,
+ unsigned unit)
+{
+ struct vec_4x4 colors;
+ (void) r;
+ (void) q;
+ spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+ return colors;
+}
+
+static struct vec_4x4
+spu_tex_3d(vector float s, vector float t, vector float r, vector float q,
+ unsigned unit)
+{
+ struct vec_4x4 colors;
+ (void) r;
+ (void) q;
+ spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+ return colors;
+}
+
+static struct vec_4x4
+spu_tex_cube(vector float s, vector float t, vector float r, vector float q,
+ unsigned unit)
+{
+ struct vec_4x4 colors;
+ (void) q;
+ sample_texture_cube(s, t, r, unit, colors.v);
+ return colors;
+}
+
+
+/**
+ * Add named function to list of "exported" functions that will be
+ * made available to the PPU-hosted code generator.
+ */
+static void
+export_func(struct cell_spu_function_info *spu_functions,
+ const char *name, void *addr)
+{
+ uint n = spu_functions->num;
+ ASSERT(strlen(name) < 16);
+ strcpy(spu_functions->names[n], name);
+ spu_functions->addrs[n] = (uint) addr;
+ spu_functions->num++;
+ ASSERT(spu_functions->num <= 16);
+}
+
+
+/**
+ * Return info about the SPU's function to the PPU / main memory.
+ * The PPU needs to know the address of some SPU-side functions so
+ * that we can generate shader code with function calls.
+ */
+void
+return_function_info(void)
+{
+ struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+ int tag = TAG_MISC;
+
+ ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
+
+ funcs.num = 0;
+ export_func(&funcs, "spu_cos", &spu_cos);
+ export_func(&funcs, "spu_sin", &spu_sin);
+ export_func(&funcs, "spu_pow", &spu_pow);
+ export_func(&funcs, "spu_exp2", &spu_exp2);
+ export_func(&funcs, "spu_log2", &spu_log2);
+ export_func(&funcs, "spu_tex_2d", &spu_tex_2d);
+ export_func(&funcs, "spu_tex_3d", &spu_tex_3d);
+ export_func(&funcs, "spu_tex_cube", &spu_tex_cube);
+
+ /* Send the function info back to the PPU / main memory */
+ mfc_put((void *) &funcs, /* src in local store */
+ (unsigned int) spu.init.spu_functions, /* dst in main memory */
+ sizeof(funcs), /* bytes */
+ tag,
+ 0, /* tid */
+ 0 /* rid */);
+ wait_on_mask(1 << tag);
+}
+
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h
new file mode 100644
index 0000000000..3adb6ae99f
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef SPU_FUNCS_H
+#define SPU_FUNCS_H
+
+extern void
+return_function_info(void);
+
+#endif
+
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 78260c4259..97c86d194d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -32,16 +32,15 @@
#include <stdio.h>
#include <libmisc.h>
+#include "pipe/p_defines.h"
+
+#include "spu_funcs.h"
+#include "spu_command.h"
#include "spu_main.h"
-#include "spu_render.h"
#include "spu_per_fragment_op.h"
#include "spu_texture.h"
-#include "spu_tile.h"
//#include "spu_test.h"
-#include "spu_vertex_shader.h"
-#include "spu_dcache.h"
#include "cell/common.h"
-#include "pipe/p_defines.h"
/*
@@ -50,600 +49,8 @@ helpful headers:
/opt/cell/sdk/usr/include/libmisc.h
*/
-boolean Debug = FALSE;
-
struct spu_global spu;
-struct spu_vs_context draw;
-
-
-/**
- * Buffers containing dynamically generated SPU code:
- */
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
- ALIGN16_ATTRIB;
-
-
-
-/**
- * Tell the PPU that this SPU has finished copying a buffer to
- * local store and that it may be reused by the PPU.
- * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_context->buffer_status[]).
- */
-static void
-release_buffer(uint buffer)
-{
- /* Evidently, using less than a 16-byte status doesn't work reliably */
- static const uint status[4] ALIGN16_ATTRIB
- = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
- const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
- uint *dst = spu.init.buffer_status + index;
-
- ASSERT(buffer < CELL_NUM_BUFFERS);
-
- mfc_put((void *) &status, /* src in local memory */
- (unsigned int) dst, /* dst in main memory */
- sizeof(status), /* size */
- TAG_MISC, /* tag is unimportant */
- 0, /* tid */
- 0 /* rid */);
-}
-
-
-/**
- * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
- * tiles back to the main framebuffer.
- */
-static void
-really_clear_tiles(uint surfaceIndex)
-{
- const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
- uint i;
-
- if (surfaceIndex == 0) {
- clear_c_tile(&spu.ctile);
-
- for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
- uint tx = i % spu.fb.width_tiles;
- uint ty = i / spu.fb.width_tiles;
- if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
- put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
- }
- }
- }
- else {
- clear_z_tile(&spu.ztile);
-
- for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
- uint tx = i % spu.fb.width_tiles;
- uint ty = i / spu.fb.width_tiles;
- if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
- put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
- }
- }
-
-#if 0
- wait_on_mask(1 << TAG_SURFACE_CLEAR);
-#endif
-}
-
-
-static void
-cmd_clear_surface(const struct cell_command_clear_surface *clear)
-{
- if (Debug)
- printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
- clear->surface, clear->value);
-
- if (clear->surface == 0) {
- spu.fb.color_clear_value = clear->value;
- if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
- uint x = (spu.init.id << 4) | (spu.init.id << 12) |
- (spu.init.id << 20) | (spu.init.id << 28);
- spu.fb.color_clear_value ^= x;
- }
- }
- else {
- spu.fb.depth_clear_value = clear->value;
- }
-
-#define CLEAR_OPT 1
-#if CLEAR_OPT
-
- /* Simply set all tiles' status to CLEAR.
- * When we actually begin rendering into a tile, we'll initialize it to
- * the clear value. If any tiles go untouched during the frame,
- * really_clear_tiles() will set them to the clear value.
- */
- if (clear->surface == 0) {
- memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
- }
- else {
- memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
- }
-
-#else
-
- /*
- * This path clears the whole framebuffer to the clear color right now.
- */
-
- /*
- printf("SPU: %s num=%d w=%d h=%d\n",
- __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
- */
-
- /* init a single tile to the clear value */
- if (clear->surface == 0) {
- clear_c_tile(&spu.ctile);
- }
- else {
- clear_z_tile(&spu.ztile);
- }
-
- /* walk over my tiles, writing the 'clear' tile's data */
- {
- const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
- uint i;
- for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
- uint tx = i % spu.fb.width_tiles;
- uint ty = i / spu.fb.width_tiles;
- if (clear->surface == 0)
- put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
- else
- put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
- }
- }
-
- if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
- wait_on_mask(1 << TAG_SURFACE_CLEAR);
- }
-
-#endif /* CLEAR_OPT */
-
- if (Debug)
- printf("SPU %u: CLEAR SURF done\n", spu.init.id);
-}
-
-
-static void
-cmd_release_verts(const struct cell_command_release_verts *release)
-{
- if (Debug)
- printf("SPU %u: RELEASE VERTS %u\n",
- spu.init.id, release->vertex_buf);
- ASSERT(release->vertex_buf != ~0U);
- release_buffer(release->vertex_buf);
-}
-
-
-/**
- * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
- * This involves installing new fragment ops SPU code.
- * If this function is never called, we'll use a regular C fallback function
- * for fragment processing.
- */
-static void
-cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
-{
- if (Debug)
- printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
- /* Copy SPU code from batch buffer to spu buffer */
- memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
- /* Copy state info (for fallback case only) */
- memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
- memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
-
- /* Point function pointer at new code */
- spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
-
- spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
- spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
-}
-
-
-static void
-cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
-{
- if (Debug)
- printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
- /* Copy SPU code from batch buffer to spu buffer */
- memcpy(spu.fragment_program_code, fp->code,
- SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
-#if 01
- /* Point function pointer at new code */
- spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
-#endif
-}
-
-
-static void
-cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
-{
- if (Debug)
- printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n",
- spu.init.id,
- cmd->width,
- cmd->height,
- cmd->color_start,
- cmd->color_format,
- cmd->depth_format);
-
- ASSERT_ALIGN16(cmd->color_start);
- ASSERT_ALIGN16(cmd->depth_start);
-
- spu.fb.color_start = cmd->color_start;
- spu.fb.depth_start = cmd->depth_start;
- spu.fb.color_format = cmd->color_format;
- spu.fb.depth_format = cmd->depth_format;
- spu.fb.width = cmd->width;
- spu.fb.height = cmd->height;
- spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
- spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
-
- switch (spu.fb.depth_format) {
- case PIPE_FORMAT_Z32_UNORM:
- spu.fb.zsize = 4;
- spu.fb.zscale = (float) 0xffffffffu;
- break;
- case PIPE_FORMAT_Z24S8_UNORM:
- case PIPE_FORMAT_S8Z24_UNORM:
- case PIPE_FORMAT_Z24X8_UNORM:
- case PIPE_FORMAT_X8Z24_UNORM:
- spu.fb.zsize = 4;
- spu.fb.zscale = (float) 0x00ffffffu;
- break;
- case PIPE_FORMAT_Z16_UNORM:
- spu.fb.zsize = 2;
- spu.fb.zscale = (float) 0xffffu;
- break;
- default:
- spu.fb.zsize = 0;
- break;
- }
-}
-
-
-static void
-cmd_state_sampler(const struct cell_command_sampler *sampler)
-{
- if (Debug)
- printf("SPU %u: SAMPLER [%u]\n",
- spu.init.id, sampler->unit);
-
- spu.sampler[sampler->unit] = sampler->state;
- if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
- spu.sample_texture[sampler->unit] = sample_texture_bilinear;
- else
- spu.sample_texture[sampler->unit] = sample_texture_nearest;
-}
-
-
-static void
-cmd_state_texture(const struct cell_command_texture *texture)
-{
- const uint unit = texture->unit;
- const uint width = texture->width;
- const uint height = texture->height;
-
- if (Debug) {
- printf("SPU %u: TEXTURE [%u] at %p size %u x %u\n", spu.init.id,
- texture->unit, texture->start,
- texture->width, texture->height);
- }
-
- spu.texture[unit].start = texture->start;
- spu.texture[unit].width = width;
- spu.texture[unit].height = height;
-
- spu.texture[unit].tiles_per_row = width / TILE_SIZE;
-
- spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
- spu.texture[unit].tex_size_mask = (vector unsigned int)
- { width - 1, height - 1, 0, 0 };
- spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
- spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
-}
-
-
-static void
-cmd_state_vertex_info(const struct vertex_info *vinfo)
-{
- if (Debug) {
- printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
- vinfo->num_attribs);
- }
- ASSERT(vinfo->num_attribs >= 1);
- ASSERT(vinfo->num_attribs <= 8);
- memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
-}
-
-
-static void
-cmd_state_vs_array_info(const struct cell_array_info *vs_info)
-{
- const unsigned attr = vs_info->attr;
-
- ASSERT(attr < PIPE_MAX_ATTRIBS);
- draw.vertex_fetch.src_ptr[attr] = vs_info->base;
- draw.vertex_fetch.pitch[attr] = vs_info->pitch;
- draw.vertex_fetch.size[attr] = vs_info->size;
- draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
- draw.vertex_fetch.dirty = 1;
-}
-
-
-static void
-cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
-{
- mfc_get(attribute_fetch_code_buffer,
- (unsigned int) code->base, /* src */
- code->size,
- TAG_BATCH_BUFFER,
- 0, /* tid */
- 0 /* rid */);
- wait_on_mask(1 << TAG_BATCH_BUFFER);
-
- draw.vertex_fetch.code = attribute_fetch_code_buffer;
-}
-
-
-static void
-cmd_finish(void)
-{
- if (Debug)
- printf("SPU %u: FINISH\n", spu.init.id);
- really_clear_tiles(0);
- /* wait for all outstanding DMAs to finish */
- mfc_write_tag_mask(~0);
- mfc_read_tag_status_all();
- /* send mbox message to PPU */
- spu_write_out_mbox(CELL_CMD_FINISH);
-}
-
-
-/**
- * Execute a batch of commands which was sent to us by the PPU.
- * See the cell_emit_state.c code to see where the commands come from.
- *
- * The opcode param encodes the location of the buffer and its size.
- */
-static void
-cmd_batch(uint opcode)
-{
- const uint buf = (opcode >> 8) & 0xff;
- uint size = (opcode >> 16);
- uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
- const unsigned usize = size / sizeof(buffer[0]);
- uint pos;
-
- if (Debug)
- printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
- spu.init.id, buf, size, spu.init.buffers[buf]);
-
- ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
-
- ASSERT_ALIGN16(spu.init.buffers[buf]);
-
- size = ROUNDUP16(size);
-
- ASSERT_ALIGN16(spu.init.buffers[buf]);
-
- mfc_get(buffer, /* dest */
- (unsigned int) spu.init.buffers[buf], /* src */
- size,
- TAG_BATCH_BUFFER,
- 0, /* tid */
- 0 /* rid */);
- wait_on_mask(1 << TAG_BATCH_BUFFER);
-
- /* Tell PPU we're done copying the buffer to local store */
- if (Debug)
- printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
- release_buffer(buf);
-
- /*
- * Loop over commands in the batch buffer
- */
- for (pos = 0; pos < usize; /* no incr */) {
- switch (buffer[pos]) {
- /*
- * rendering commands
- */
- case CELL_CMD_CLEAR_SURFACE:
- {
- struct cell_command_clear_surface *clr
- = (struct cell_command_clear_surface *) &buffer[pos];
- cmd_clear_surface(clr);
- pos += sizeof(*clr) / 8;
- }
- break;
- case CELL_CMD_RENDER:
- {
- struct cell_command_render *render
- = (struct cell_command_render *) &buffer[pos];
- uint pos_incr;
- cmd_render(render, &pos_incr);
- pos += pos_incr;
- }
- break;
- /*
- * state-update commands
- */
- case CELL_CMD_STATE_FRAMEBUFFER:
- {
- struct cell_command_framebuffer *fb
- = (struct cell_command_framebuffer *) &buffer[pos];
- cmd_state_framebuffer(fb);
- pos += sizeof(*fb) / 8;
- }
- break;
- case CELL_CMD_STATE_FRAGMENT_OPS:
- {
- struct cell_command_fragment_ops *fops
- = (struct cell_command_fragment_ops *) &buffer[pos];
- cmd_state_fragment_ops(fops);
- pos += sizeof(*fops) / 8;
- }
- break;
- case CELL_CMD_STATE_FRAGMENT_PROGRAM:
- {
- struct cell_command_fragment_program *fp
- = (struct cell_command_fragment_program *) &buffer[pos];
- cmd_state_fragment_program(fp);
- pos += sizeof(*fp) / 8;
- }
- break;
- case CELL_CMD_STATE_SAMPLER:
- {
- struct cell_command_sampler *sampler
- = (struct cell_command_sampler *) &buffer[pos];
- cmd_state_sampler(sampler);
- pos += sizeof(*sampler) / 8;
- }
- break;
- case CELL_CMD_STATE_TEXTURE:
- {
- struct cell_command_texture *texture
- = (struct cell_command_texture *) &buffer[pos];
- cmd_state_texture(texture);
- pos += sizeof(*texture) / 8;
- }
- break;
- case CELL_CMD_STATE_VERTEX_INFO:
- cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
- pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
- break;
- case CELL_CMD_STATE_VIEWPORT:
- (void) memcpy(& draw.viewport, &buffer[pos+1],
- sizeof(struct pipe_viewport_state));
- pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
- break;
- case CELL_CMD_STATE_UNIFORMS:
- draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
- pos += 2;
- break;
- case CELL_CMD_STATE_VS_ARRAY_INFO:
- cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
- pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
- break;
- case CELL_CMD_STATE_BIND_VS:
-#if 0
- spu_bind_vertex_shader(&draw,
- (struct cell_shader_info *) &buffer[pos+1]);
-#endif
- pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
- break;
- case CELL_CMD_STATE_ATTRIB_FETCH:
- cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
- &buffer[pos+1]);
- pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
- break;
- /*
- * misc commands
- */
- case CELL_CMD_FINISH:
- cmd_finish();
- pos += 1;
- break;
- case CELL_CMD_RELEASE_VERTS:
- {
- struct cell_command_release_verts *release
- = (struct cell_command_release_verts *) &buffer[pos];
- cmd_release_verts(release);
- pos += sizeof(*release) / 8;
- }
- break;
- case CELL_CMD_FLUSH_BUFFER_RANGE: {
- struct cell_buffer_range *br = (struct cell_buffer_range *)
- &buffer[pos+1];
-
- spu_dcache_mark_dirty((unsigned) br->base, br->size);
- pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
- break;
- }
- default:
- printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
- ASSERT(0);
- break;
- }
- }
-
- if (Debug)
- printf("SPU %u: BATCH complete\n", spu.init.id);
-}
-
-
-/**
- * Temporary/simple main loop for SPEs: Get a command, execute it, repeat.
- */
-static void
-main_loop(void)
-{
- struct cell_command cmd;
- int exitFlag = 0;
-
- if (Debug)
- printf("SPU %u: Enter main loop\n", spu.init.id);
-
- ASSERT((sizeof(struct cell_command) & 0xf) == 0);
- ASSERT_ALIGN16(&cmd);
-
- while (!exitFlag) {
- unsigned opcode;
- int tag = 0;
-
- if (Debug)
- printf("SPU %u: Wait for cmd...\n", spu.init.id);
-
- /* read/wait from mailbox */
- opcode = (unsigned int) spu_read_in_mbox();
-
- if (Debug)
- printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode);
-
- /* command payload */
- mfc_get(&cmd, /* dest */
- (unsigned int) spu.init.cmd, /* src */
- sizeof(struct cell_command), /* bytes */
- tag,
- 0, /* tid */
- 0 /* rid */);
- wait_on_mask( 1 << tag );
-
- /*
- * NOTE: most commands should be contained in a batch buffer
- */
-
- switch (opcode & CELL_CMD_OPCODE_MASK) {
- case CELL_CMD_EXIT:
- if (Debug)
- printf("SPU %u: EXIT\n", spu.init.id);
- exitFlag = 1;
- break;
- case CELL_CMD_VS_EXECUTE:
-#if 0
- spu_execute_vertex_shader(&draw, &cmd.vs);
-#endif
- break;
- case CELL_CMD_BATCH:
- cmd_batch(opcode);
- break;
- default:
- printf("Bad opcode!\n");
- }
-
- }
-
- if (Debug)
- printf("SPU %u: Exit main loop\n", spu.init.id);
-
- spu_dcache_report();
-}
-
-
static void
one_time_init(void)
@@ -651,15 +58,8 @@ one_time_init(void)
memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
invalidate_tex_cache();
-
- /* Install default/fallback fragment processing function.
- * This will normally be overriden by a code-gen'd function.
- */
- spu.fragment_ops = spu_fallback_fragment_ops;
}
-
-
/* In some versions of the SDK the SPE main takes 'unsigned long' as a
* parameter. In others it takes 'unsigned long long'. Use a define to
* select between the two.
@@ -682,12 +82,16 @@ main(main_param_t speid, main_param_t argp)
ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+ ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0);
+ ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
one_time_init();
+ spu_command_init();
- if (Debug)
- printf("SPU: main() speid=%lu\n", (unsigned long) speid);
+ D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
+ D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
+ /* get initialization data */
mfc_get(&spu.init, /* dest */
(unsigned int) argp, /* src */
sizeof(struct cell_init_info), /* bytes */
@@ -696,12 +100,18 @@ main(main_param_t speid, main_param_t argp)
0 /* rid */);
wait_on_mask( 1 << tag );
+ if (spu.init.id == 0) {
+ return_function_info();
+ }
+
#if 0
if (spu.init.id==0)
- spu_test_misc();
+ spu_test_misc(spu.init.id);
#endif
- main_loop();
+ command_loop();
+
+ spu_command_close();
return 0;
}
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2c7b625840..33767e7c51 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -36,9 +36,18 @@
#include "pipe/p_state.h"
-
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
+#if DEBUG
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define D_PRINTF(flag, format,...) \
+ if (spu.init.debug_flags & (flag)) \
+ printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#else
+#define D_PRINTF(...)
+#endif
/**
@@ -61,8 +70,11 @@ typedef union {
/** Function for sampling textures */
-typedef vector float (*spu_sample_texture_func)(uint unit,
- vector float texcoord);
+typedef void (*spu_sample_texture_2d_func)(vector float s,
+ vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
+
/** Function for performing per-fragment ops */
typedef void (*spu_fragment_ops_func)(uint x, uint y,
@@ -76,9 +88,9 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
vector unsigned int mask);
/** Function for running fragment program */
-typedef void (*spu_fragment_program_func)(vector float *inputs,
- vector float *outputs,
- vector float *constants);
+typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
+ vector float *outputs,
+ vector float *constants);
struct spu_framebuffer
@@ -98,15 +110,27 @@ struct spu_framebuffer
} ALIGN16_ATTRIB;
-struct spu_texture
+/** per-texture level info */
+struct spu_texture_level
{
void *start;
- ushort width, height;
+ ushort width, height, depth;
ushort tiles_per_row;
- vector float tex_size;
- vector unsigned int tex_size_mask; /**< == int(size - 1) */
- vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
- vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+ uint bytes_per_image;
+ /** texcoord scale factors */
+ vector float scale_s, scale_t, scale_r;
+ /** texcoord masks (if REPEAT then size-1, else ~0) */
+ vector signed int mask_s, mask_t, mask_r;
+ /** texcoord clamp limits */
+ vector signed int max_s, max_t, max_r;
+} ALIGN16_ATTRIB;
+
+
+struct spu_texture
+{
+ struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+ uint max_level;
+ uint target; /**< PIPE_TEXTURE_x */
} ALIGN16_ATTRIB;
@@ -124,7 +148,9 @@ struct spu_global
struct spu_framebuffer fb;
struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
struct pipe_blend_state blend;
+ struct pipe_blend_color blend_color;
struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+ struct pipe_rasterizer_state rasterizer;
struct spu_texture texture[PIPE_MAX_SAMPLERS];
struct vertex_info vertex_info;
@@ -133,39 +159,38 @@ struct spu_global
tile_t ztile ALIGN16_ATTRIB;
/** Read depth/stencil tiles? */
- boolean read_depth;
- boolean read_stencil;
+ boolean read_depth_stencil;
/** Current tiles' status */
ubyte cur_ctile_status, cur_ztile_status;
/** Status of all tiles in framebuffer */
- ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
- ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+ ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+ ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
- /** Current fragment ops machine code */
- uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
- /** Current fragment ops function */
- spu_fragment_ops_func fragment_ops;
+ /** Current fragment ops machine code, at 8-byte boundary */
+ uint *fragment_ops_code;
+ uint fragment_ops_code_size;
+ /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
+ spu_fragment_ops_func fragment_ops[2];
- /** Current fragment program machine code */
- uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+ /** Current fragment program machine code, at 8-byte boundary */
+ uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
/** Current fragment ops function */
spu_fragment_program_func fragment_program;
/** Current texture sampler function */
- spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+ spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS];
+ spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS];
+ spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS];
- /** Fragment program constants (XXX preliminary/used) */
-#define MAX_CONSTANTS 32
- vector float constants[MAX_CONSTANTS];
+ /** Fragment program constants */
+ vector float constants[4 * CELL_MAX_CONSTANTS];
} ALIGN16_ATTRIB;
extern struct spu_global spu;
-extern boolean Debug;
-
@@ -184,7 +209,7 @@ extern boolean Debug;
#define TAG_DCACHE1 21
#define TAG_DCACHE2 22
#define TAG_DCACHE3 23
-
+#define TAG_FENCE 24
static INLINE void
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 03dd547845..683664e8a4 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -40,6 +40,24 @@
#define LINEAR_QUAD_LAYOUT 1
+static INLINE vector float
+spu_min(vector float a, vector float b)
+{
+ vector unsigned int m;
+ m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */
+ return spu_sel(a, b, m);
+}
+
+
+static INLINE vector float
+spu_max(vector float a, vector float b)
+{
+ vector unsigned int m;
+ m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */
+ return spu_sel(b, a, m);
+}
+
+
/**
* Called by rasterizer for each quad after the shader has run. Do
* all the per-fragment operations including alpha test, z test,
@@ -60,9 +78,12 @@ spu_fallback_fragment_ops(uint x, uint y,
vector unsigned int mask)
{
vector float frag_aos[4];
- unsigned int c0, c1, c2, c3;
+ unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
+ unsigned int fragc0, fragc1, fragc2, fragc3; /* fragment colors */
- /* do alpha test */
+ /*
+ * Do alpha test
+ */
if (spu.depth_stencil_alpha.alpha.enabled) {
vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
vector unsigned int amask;
@@ -102,7 +123,10 @@ spu_fallback_fragment_ops(uint x, uint y,
mask = spu_and(mask, amask);
}
- /* Z and/or stencil testing... */
+
+ /*
+ * Z and/or stencil testing...
+ */
if (spu.depth_stencil_alpha.depth.enabled ||
spu.depth_stencil_alpha.stencil[0].enabled) {
@@ -178,6 +202,32 @@ spu_fallback_fragment_ops(uint x, uint y,
}
}
+
+ /*
+ * If we'll need the current framebuffer/tile colors for blending
+ * or logicop or colormask, fetch them now.
+ */
+ if (spu.blend.blend_enable ||
+ spu.blend.logicop_enable ||
+ spu.blend.colormask != 0xf) {
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+ fbc0 = colorTile->ui[y][x*2+0];
+ fbc1 = colorTile->ui[y][x*2+1];
+ fbc2 = colorTile->ui[y][x*2+2];
+ fbc3 = colorTile->ui[y][x*2+3];
+#else
+ fbc0 = colorTile->ui[y+0][x+0];
+ fbc1 = colorTile->ui[y+0][x+1];
+ fbc2 = colorTile->ui[y+1][x+0];
+ fbc3 = colorTile->ui[y+1][x+1];
+#endif
+ }
+
+
+ /*
+ * Do blending
+ */
if (spu.blend.blend_enable) {
/* blending terms, misc regs */
vector float term1r, term1g, term1b, term1a;
@@ -186,43 +236,30 @@ spu_fallback_fragment_ops(uint x, uint y,
vector float fbRGBA[4]; /* current framebuffer colors */
- /* get colors from framebuffer/tile */
+ /* convert framebuffer colors from packed int to vector float */
{
- vector float fc[4];
- uint c0, c1, c2, c3;
-
-#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
- c0 = colorTile->ui[y][x*2+0];
- c1 = colorTile->ui[y][x*2+1];
- c2 = colorTile->ui[y][x*2+2];
- c3 = colorTile->ui[y][x*2+3];
-#else
- c0 = colorTile->ui[y+0][x+0];
- c1 = colorTile->ui[y+0][x+1];
- c2 = colorTile->ui[y+1][x+0];
- c3 = colorTile->ui[y+1][x+1];
-#endif
+ vector float temp[4]; /* float colors in AOS form */
switch (spu.fb.color_format) {
case PIPE_FORMAT_B8G8R8A8_UNORM:
- fc[0] = spu_unpack_B8G8R8A8(c0);
- fc[1] = spu_unpack_B8G8R8A8(c1);
- fc[2] = spu_unpack_B8G8R8A8(c2);
- fc[3] = spu_unpack_B8G8R8A8(c3);
+ temp[0] = spu_unpack_B8G8R8A8(fbc0);
+ temp[1] = spu_unpack_B8G8R8A8(fbc1);
+ temp[2] = spu_unpack_B8G8R8A8(fbc2);
+ temp[3] = spu_unpack_B8G8R8A8(fbc3);
break;
case PIPE_FORMAT_A8R8G8B8_UNORM:
- fc[0] = spu_unpack_A8R8G8B8(c0);
- fc[1] = spu_unpack_A8R8G8B8(c1);
- fc[2] = spu_unpack_A8R8G8B8(c2);
- fc[3] = spu_unpack_A8R8G8B8(c3);
+ temp[0] = spu_unpack_A8R8G8B8(fbc0);
+ temp[1] = spu_unpack_A8R8G8B8(fbc1);
+ temp[2] = spu_unpack_A8R8G8B8(fbc2);
+ temp[3] = spu_unpack_A8R8G8B8(fbc3);
break;
default:
ASSERT(0);
}
- _transpose_matrix4x4(fbRGBA, fc);
+ _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
}
/*
- * Compute Src RGB terms
+ * Compute Src RGB terms (fragment color * factor)
*/
switch (spu.blend.rgb_src_factor) {
case PIPE_BLENDFACTOR_ONE:
@@ -245,13 +282,33 @@ spu_fallback_fragment_ops(uint x, uint y,
term1g = spu_mul(fragG, fragA);
term1b = spu_mul(fragB, fragA);
break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ term1r = spu_mul(fragR, fbRGBA[0]);
+ term1g = spu_mul(fragG, fbRGBA[1]);
+ term1b = spu_mul(fragB, fbRGBA[1]);
+ break;
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ term1r = spu_mul(fragR, fbRGBA[3]);
+ term1g = spu_mul(fragG, fbRGBA[3]);
+ term1b = spu_mul(fragB, fbRGBA[3]);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0]));
+ term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1]));
+ term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2]));
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+ term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3]));
+ term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3]));
+ break;
/* XXX more cases */
default:
ASSERT(0);
}
/*
- * Compute Src Alpha term
+ * Compute Src Alpha term (fragment alpha * factor)
*/
switch (spu.blend.alpha_src_factor) {
case PIPE_BLENDFACTOR_ONE:
@@ -263,19 +320,29 @@ spu_fallback_fragment_ops(uint x, uint y,
case PIPE_BLENDFACTOR_SRC_ALPHA:
term1a = spu_mul(fragA, fragA);
break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* fall-through */
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ term1a = spu_mul(fragA, fbRGBA[3]);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* fall-through */
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+ break;
/* XXX more cases */
default:
ASSERT(0);
}
/*
- * Compute Dest RGB terms
+ * Compute Dest RGB terms (framebuffer color * factor)
*/
switch (spu.blend.rgb_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
- term2r = fragR;
- term2g = fragG;
- term2b = fragB;
+ term2r = fbRGBA[0];
+ term2g = fbRGBA[1];
+ term2b = fbRGBA[2];
break;
case PIPE_BLENDFACTOR_ZERO:
term2r =
@@ -299,17 +366,37 @@ spu_fallback_fragment_ops(uint x, uint y,
term2g = spu_mul(fbRGBA[1], tmp);
term2b = spu_mul(fbRGBA[2], tmp);
break;
- /* XXX more cases */
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ term2r = spu_mul(fbRGBA[0], fbRGBA[0]);
+ term2g = spu_mul(fbRGBA[1], fbRGBA[1]);
+ term2b = spu_mul(fbRGBA[2], fbRGBA[2]);
+ break;
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ term2r = spu_mul(fbRGBA[0], fbRGBA[3]);
+ term2g = spu_mul(fbRGBA[1], fbRGBA[3]);
+ term2b = spu_mul(fbRGBA[2], fbRGBA[3]);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0]));
+ term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1]));
+ term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2]));
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3]));
+ term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3]));
+ term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3]));
+ break;
+ /* XXX more cases */
default:
ASSERT(0);
}
/*
- * Compute Dest Alpha term
+ * Compute Dest Alpha term (framebuffer alpha * factor)
*/
switch (spu.blend.alpha_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
- term2a = fragA;
+ term2a = fbRGBA[3];
break;
case PIPE_BLENDFACTOR_SRC_COLOR:
term2a = spu_splats(0.0f);
@@ -322,6 +409,16 @@ spu_fallback_fragment_ops(uint x, uint y,
tmp = spu_sub(one, fragA);
term2a = spu_mul(fbRGBA[3], tmp);
break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* fall-through */
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ term2a = spu_mul(fbRGBA[3], fbRGBA[3]);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* fall-through */
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3]));
+ break;
/* XXX more cases */
default:
ASSERT(0);
@@ -341,7 +438,21 @@ spu_fallback_fragment_ops(uint x, uint y,
fragG = spu_sub(term1g, term2g);
fragB = spu_sub(term1b, term2b);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ fragR = spu_sub(term2r, term1r);
+ fragG = spu_sub(term2g, term1g);
+ fragB = spu_sub(term2b, term1b);
+ break;
+ case PIPE_BLEND_MIN:
+ fragR = spu_min(term1r, term2r);
+ fragG = spu_min(term1g, term2g);
+ fragB = spu_min(term1b, term2b);
+ break;
+ case PIPE_BLEND_MAX:
+ fragR = spu_max(term1r, term2r);
+ fragG = spu_max(term1g, term2g);
+ fragB = spu_max(term1b, term2b);
+ break;
default:
ASSERT(0);
}
@@ -356,7 +467,15 @@ spu_fallback_fragment_ops(uint x, uint y,
case PIPE_BLEND_SUBTRACT:
fragA = spu_sub(term1a, term2a);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ fragA = spu_sub(term2a, term1a);
+ break;
+ case PIPE_BLEND_MIN:
+ fragA = spu_min(term1a, term2a);
+ break;
+ case PIPE_BLEND_MAX:
+ fragA = spu_max(term1a, term2a);
+ break;
default:
ASSERT(0);
}
@@ -384,21 +503,20 @@ spu_fallback_fragment_ops(uint x, uint y,
#endif
/*
- * Pack float colors into 32-bit RGBA words.
+ * Pack fragment float colors into 32-bit RGBA words.
*/
switch (spu.fb.color_format) {
case PIPE_FORMAT_A8R8G8B8_UNORM:
- c0 = spu_pack_A8R8G8B8(frag_aos[0]);
- c1 = spu_pack_A8R8G8B8(frag_aos[1]);
- c2 = spu_pack_A8R8G8B8(frag_aos[2]);
- c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+ fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
+ fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
+ fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
+ fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
break;
-
case PIPE_FORMAT_B8G8R8A8_UNORM:
- c0 = spu_pack_B8G8R8A8(frag_aos[0]);
- c1 = spu_pack_B8G8R8A8(frag_aos[1]);
- c2 = spu_pack_B8G8R8A8(frag_aos[2]);
- c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+ fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
+ fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
+ fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
+ fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
break;
default:
fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
@@ -407,20 +525,57 @@ spu_fallback_fragment_ops(uint x, uint y,
/*
- * Color masking
+ * Do color masking
*/
if (spu.blend.colormask != 0xf) {
- /* XXX to do */
- /* apply color mask to 32-bit packed colors */
+ uint cmask = 0x0; /* each byte corresponds to a color channel */
+
+ /* Form bitmask depending on color buffer format and colormask bits */
+ switch (spu.fb.color_format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ if (spu.blend.colormask & PIPE_MASK_R)
+ cmask |= 0x00ff0000; /* red */
+ if (spu.blend.colormask & PIPE_MASK_G)
+ cmask |= 0x0000ff00; /* green */
+ if (spu.blend.colormask & PIPE_MASK_B)
+ cmask |= 0x000000ff; /* blue */
+ if (spu.blend.colormask & PIPE_MASK_A)
+ cmask |= 0xff000000; /* alpha */
+ break;
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ if (spu.blend.colormask & PIPE_MASK_R)
+ cmask |= 0x0000ff00; /* red */
+ if (spu.blend.colormask & PIPE_MASK_G)
+ cmask |= 0x00ff0000; /* green */
+ if (spu.blend.colormask & PIPE_MASK_B)
+ cmask |= 0xff000000; /* blue */
+ if (spu.blend.colormask & PIPE_MASK_A)
+ cmask |= 0x000000ff; /* alpha */
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ /*
+ * Apply color mask to the 32-bit packed colors.
+ * if (cmask[i])
+ * frag color[i] = frag color[i];
+ * else
+ * frag color[i] = framebuffer color[i];
+ */
+ fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
+ fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
+ fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
+ fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
}
/*
- * Logic Ops
+ * Do logic ops
*/
if (spu.blend.logicop_enable) {
/* XXX to do */
- /* apply logicop to 32-bit packed colors */
+ /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
}
@@ -431,45 +586,46 @@ spu_fallback_fragment_ops(uint x, uint y,
spu.cur_ctile_status = TILE_STATUS_DIRTY;
}
else {
+ /* write no fragments */
return;
}
/*
- * Write new quad colors to the framebuffer/tile.
+ * Write new fragment/quad colors to the framebuffer/tile.
* Only write pixels where the corresponding mask word is set.
*/
#if LINEAR_QUAD_LAYOUT
/*
* Quad layout:
* +--+--+--+--+
- * |p0|p1|p2|p3|
+ * |p0|p1|p2|p3|...
* +--+--+--+--+
*/
if (spu_extract(mask, 0))
- colorTile->ui[y][x*2] = c0;
+ colorTile->ui[y][x*2] = fragc0;
if (spu_extract(mask, 1))
- colorTile->ui[y][x*2+1] = c1;
+ colorTile->ui[y][x*2+1] = fragc1;
if (spu_extract(mask, 2))
- colorTile->ui[y][x*2+2] = c2;
+ colorTile->ui[y][x*2+2] = fragc2;
if (spu_extract(mask, 3))
- colorTile->ui[y][x*2+3] = c3;
+ colorTile->ui[y][x*2+3] = fragc3;
#else
/*
* Quad layout:
* +--+--+
- * |p0|p1|
+ * |p0|p1|...
* +--+--+
- * |p2|p3|
+ * |p2|p3|...
* +--+--+
*/
if (spu_extract(mask, 0))
- colorTile->ui[y+0][x+0] = c0;
+ colorTile->ui[y+0][x+0] = fragc0;
if (spu_extract(mask, 1))
- colorTile->ui[y+0][x+1] = c1;
+ colorTile->ui[y+0][x+1] = fragc1;
if (spu_extract(mask, 2))
- colorTile->ui[y+1][x+0] = c2;
+ colorTile->ui[y+1][x+0] = fragc2;
if (spu_extract(mask, 3))
- colorTile->ui[y+1][x+1] = c3;
+ colorTile->ui[y+1][x+1] = fragc3;
#endif
}
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc98881..7c225e2f27 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
static INLINE void
get_cz_tiles(uint tx, uint ty)
{
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
//printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
wait_put_cz_tiles(void)
{
wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
wait_on_mask(1 << TAG_WRITE_TILE_Z);
}
}
@@ -175,22 +175,14 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
const ubyte *vertices;
const ushort *indexes;
uint i, j;
+ uint num_tiles;
-
- if (Debug) {
- printf("SPU %u: RENDER prim %u, num_vert=%u num_ind=%u "
- "inline_vert=%u\n",
- spu.init.id,
- render->prim_type,
- render->num_verts,
- render->num_indexes,
- render->inline_verts);
-
- /*
- printf(" bound: %g, %g .. %g, %g\n",
- render->xmin, render->ymin, render->xmax, render->ymax);
- */
- }
+ D_PRINTF(CELL_DEBUG_CMD,
+ "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
+ render->prim_type,
+ render->num_verts,
+ render->num_indexes,
+ render->inline_verts);
ASSERT(sizeof(*render) % 4 == 0);
ASSERT(total_vertex_bytes % 16 == 0);
@@ -251,6 +243,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
+ num_tiles = 0;
+
/**
** loop over tiles, rendering tris
**/
@@ -264,6 +258,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
if (!my_tile(tx, ty))
continue;
+ num_tiles++;
+
spu.cur_ctile_status = spu.ctile_status[ty][tx];
spu.cur_ztile_status = spu.ztile_status[ty][tx];
@@ -293,9 +289,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
spu.ztile_status[ty][tx] = spu.cur_ztile_status;
}
- if (Debug)
- printf("SPU %u: RENDER done\n",
- spu.init.id);
+ D_PRINTF(CELL_DEBUG_CMD,
+ "RENDER done (%u tiles hit)\n",
+ num_tiles);
}
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 117b8a36f8..69784c8978 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
**************************************************************************/
+#include <math.h>
+
#include "pipe/p_compiler.h"
#include "spu_main.h"
#include "spu_texture.h"
@@ -40,37 +42,19 @@
void
invalidate_tex_cache(void)
{
- uint unit = 0;
- uint bytes = 4 * spu.texture[unit].width
- * spu.texture[unit].height;
-
- spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
-}
+ uint lvl;
+ for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+ uint unit = 0;
+ uint bytes = 4 * spu.texture[unit].level[lvl].width
+ * spu.texture[unit].level[lvl].height;
+ if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
+ bytes *= 6;
+ else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
+ bytes *= spu.texture[unit].level[lvl].depth;
-/**
- * XXX look into getting texels for all four pixels in a quad at once.
- */
-static uint
-get_texel(uint unit, vec_uint4 coordinate)
-{
- /*
- * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as
- * SIMD since X and Y are already in a SIMD register.
- */
- const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
- ushort x = spu_extract(coordinate, 0);
- ushort y = spu_extract(coordinate, 1);
- unsigned tile_offset = sizeof(tile_t)
- * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
- ushort texel_offset = (ushort) 4
- * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
- vec_uint4 tmp;
-
- spu_dcache_fetch_unaligned((qword *) & tmp,
- texture_ea + tile_offset + texel_offset,
- 4);
- return spu_extract(tmp, 0);
+ spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+ }
}
@@ -88,15 +72,17 @@ get_texel(uint unit, vec_uint4 coordinate)
* a time.
*/
static void
-get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(const struct spu_texture_level *tlevel, uint face,
+ vec_int4 x, vec_int4 y,
+ vec_uint4 *texels)
{
- const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
- vec_uint4 tile_x = spu_rlmask(x, -5);
- vec_uint4 tile_y = spu_rlmask(y, -5);
- const qword offset_x = si_andi((qword) x, 0x1f);
- const qword offset_y = si_andi((qword) y, 0x1f);
+ unsigned texture_ea = (uintptr_t) tlevel->start;
+ const vec_int4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */
+ const vec_int4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */
+ const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
+ const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
- const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
+ const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -107,6 +93,8 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
+ texture_ea = texture_ea + face * tlevel->bytes_per_image;
+
spu_dcache_fetch_unaligned((qword *) & texels[0],
texture_ea + spu_extract(offset, 0), 4);
spu_dcache_fetch_unaligned((qword *) & texels[1],
@@ -118,83 +106,536 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
}
+/** clamp vec to [0, max] */
+static INLINE vector signed int
+spu_clamp(vector signed int vec, vector signed int max)
+{
+ static const vector signed int zero = {0,0,0,0};
+ vector unsigned int c;
+ c = spu_cmpgt(vec, zero); /* c = vec > zero ? ~0 : 0 */
+ vec = spu_sel(zero, vec, c);
+ c = spu_cmpgt(vec, max); /* c = vec > max ? ~0 : 0 */
+ vec = spu_sel(vec, max, c);
+ return vec;
+}
+
+
+
/**
- * Get texture sample at texcoord.
+ * Do nearest texture sampling for four pixels.
+ * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
*/
-vector float
-sample_texture_nearest(uint unit, vector float texcoord)
+void
+sample_texture_2d_nearest(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4])
{
- vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
- vector unsigned int itc = spu_convtu(tc, 0); /* convert to int */
- itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
- uint texel = get_texel(unit, itc);
- return spu_unpack_A8R8G8B8(texel);
+ const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+ vector float ss = spu_mul(s, tlevel->scale_s);
+ vector float tt = spu_mul(t, tlevel->scale_t);
+ vector signed int is = spu_convts(ss, 0);
+ vector signed int it = spu_convts(tt, 0);
+ vec_uint4 texels[4];
+
+ /* PIPE_TEX_WRAP_REPEAT */
+ is = spu_and(is, tlevel->mask_s);
+ it = spu_and(it, tlevel->mask_t);
+
+ /* PIPE_TEX_WRAP_CLAMP */
+ is = spu_clamp(is, tlevel->max_s);
+ it = spu_clamp(it, tlevel->max_t);
+
+ get_four_texels(tlevel, face, is, it, texels);
+
+ /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
+ spu_unpack_A8R8G8B8_transpose4(texels, colors);
}
-vector float
-sample_texture_bilinear(uint unit, vector float texcoord)
+/**
+ * Do bilinear texture sampling for four pixels.
+ * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture_2d_bilinear(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4])
{
- static const vec_uint4 offset_x = {0, 0, 1, 1};
- static const vec_uint4 offset_y = {0, 1, 0, 1};
+ const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+ static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
- vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
- tc = spu_add(tc, spu_splats(-0.5f)); /* half texel bias */
+ vector float ss = spu_madd(s, tlevel->scale_s, half);
+ vector float tt = spu_madd(t, tlevel->scale_t, half);
- /* integer texcoords S,T: */
- vec_uint4 itc = spu_convtu(tc, 0); /* convert to int */
+ vector signed int is0 = spu_convts(ss, 0);
+ vector signed int it0 = spu_convts(tt, 0);
- vec_uint4 texels[4];
-
- /* setup texcoords for quad:
- * +-----+-----+
- * |x0,y0|x1,y1|
- * +-----+-----+
- * |x2,y2|x3,y3|
- * +-----+-----+
- */
- vec_uint4 x = spu_splats(spu_extract(itc, 0));
- vec_uint4 y = spu_splats(spu_extract(itc, 1));
- x = spu_add(x, offset_x);
- y = spu_add(y, offset_y);
+ /* is + 1, it + 1 */
+ vector signed int is1 = spu_add(is0, 1);
+ vector signed int it1 = spu_add(it0, 1);
- /* GL_REPEAT wrap mode: */
- x = spu_and(x, spu.texture[unit].tex_size_x_mask);
- y = spu_and(y, spu.texture[unit].tex_size_y_mask);
+ /* PIPE_TEX_WRAP_REPEAT */
+ is0 = spu_and(is0, tlevel->mask_s);
+ it0 = spu_and(it0, tlevel->mask_t);
+ is1 = spu_and(is1, tlevel->mask_s);
+ it1 = spu_and(it1, tlevel->mask_t);
- get_four_texels(unit, x, y, texels);
+ /* PIPE_TEX_WRAP_CLAMP */
+ is0 = spu_clamp(is0, tlevel->max_s);
+ it0 = spu_clamp(it0, tlevel->max_t);
+ is1 = spu_clamp(is1, tlevel->max_s);
+ it1 = spu_clamp(it1, tlevel->max_t);
- /* integer A8R8G8B8 to float texel conversion */
- vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
- vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
- vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
- vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
+ /* get packed int texels */
+ vector unsigned int texels[16];
+ get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */
+ get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */
+ get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */
+ get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
+ /* convert packed int texels to float colors */
+ vector float ftexels[16];
+ spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
+ spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
+ spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
+ spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
/* Compute weighting factors in [0,1]
* Multiply texcoord by 1024, AND with 1023, convert back to float.
*/
- vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
- vector signed int itc1024 = spu_convts(tc1024, 0);
- itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
- vector float weight = spu_convtf(itc1024, 10);
-
- /* smeared frac and 1-frac */
- vector float sfrac = spu_splats(spu_extract(weight, 0));
- vector float tfrac = spu_splats(spu_extract(weight, 1));
- vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
- vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
-
- /* multiply the samples (colors) by the S/T weights */
- texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
- texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
- texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
- texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
-
- /* compute sum of weighted samples */
- vector float texel_sum = spu_add(texel00, texel01);
- texel_sum = spu_add(texel_sum, texel10);
- texel_sum = spu_add(texel_sum, texel11);
-
- return texel_sum;
+ vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
+ vector signed int iss1024 = spu_convts(ss1024, 0);
+ iss1024 = spu_and(iss1024, 1023);
+ vector float sWeights0 = spu_convtf(iss1024, 10);
+
+ vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
+ vector signed int itt1024 = spu_convts(tt1024, 0);
+ itt1024 = spu_and(itt1024, 1023);
+ vector float tWeights0 = spu_convtf(itt1024, 10);
+
+ /* 1 - sWeight and 1 - tWeight */
+ vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
+ vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
+
+ /* reds, for four pixels */
+ ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
+ ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
+ ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
+ ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
+ colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
+ spu_add(ftexels[8], ftexels[12]));
+
+ /* greens, for four pixels */
+ ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
+ ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
+ ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
+ ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
+ colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
+ spu_add(ftexels[9], ftexels[13]));
+
+ /* blues, for four pixels */
+ ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
+ ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
+ ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
+ ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
+ colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
+ spu_add(ftexels[10], ftexels[14]));
+
+ /* alphas, for four pixels */
+ ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
+ ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
+ ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
+ ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
+ colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
+ spu_add(ftexels[11], ftexels[15]));
+}
+
+
+
+/**
+ * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
+ */
+static INLINE void
+transpose(vector unsigned int *mOut0,
+ vector unsigned int *mOut1,
+ vector unsigned int *mOut2,
+ vector unsigned int *mOut3,
+ vector unsigned int *mIn)
+{
+ vector unsigned int abcd, efgh, ijkl, mnop; /* input vectors */
+ vector unsigned int aeim, bfjn, cgko, dhlp; /* output vectors */
+ vector unsigned int aibj, ckdl, emfn, gohp; /* intermediate vectors */
+
+ vector unsigned char shufflehi = ((vector unsigned char) {
+ 0x00, 0x01, 0x02, 0x03,
+ 0x10, 0x11, 0x12, 0x13,
+ 0x04, 0x05, 0x06, 0x07,
+ 0x14, 0x15, 0x16, 0x17});
+ vector unsigned char shufflelo = ((vector unsigned char) {
+ 0x08, 0x09, 0x0A, 0x0B,
+ 0x18, 0x19, 0x1A, 0x1B,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x1C, 0x1D, 0x1E, 0x1F});
+ abcd = *(mIn+0);
+ efgh = *(mIn+1);
+ ijkl = *(mIn+2);
+ mnop = *(mIn+3);
+
+ aibj = spu_shuffle(abcd, ijkl, shufflehi);
+ ckdl = spu_shuffle(abcd, ijkl, shufflelo);
+ emfn = spu_shuffle(efgh, mnop, shufflehi);
+ gohp = spu_shuffle(efgh, mnop, shufflelo);
+
+ aeim = spu_shuffle(aibj, emfn, shufflehi);
+ bfjn = spu_shuffle(aibj, emfn, shufflelo);
+ cgko = spu_shuffle(ckdl, gohp, shufflehi);
+ dhlp = spu_shuffle(ckdl, gohp, shufflelo);
+
+ *mOut0 = aeim;
+ *mOut1 = bfjn;
+ *mOut2 = cgko;
+ *mOut3 = dhlp;
+}
+
+
+/**
+ * Bilinear filtering, using int instead of float arithmetic for computing
+ * sample weights.
+ */
+void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4])
+{
+ const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+ static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
+ /* Scale texcoords by size of texture, and add half pixel bias */
+ vector float ss = spu_madd(s, tlevel->scale_s, half);
+ vector float tt = spu_madd(t, tlevel->scale_t, half);
+
+ /* convert float coords to fixed-pt coords with 7 fraction bits */
+ vector signed int is = spu_convts(ss, 7); /* XXX really need floor() here */
+ vector signed int it = spu_convts(tt, 7); /* XXX really need floor() here */
+
+ /* compute integer texel weights in [0, 127] */
+ vector signed int sWeights0 = spu_and(is, 127);
+ vector signed int tWeights0 = spu_and(it, 127);
+ vector signed int sWeights1 = spu_sub(127, sWeights0);
+ vector signed int tWeights1 = spu_sub(127, tWeights0);
+
+ /* texel coords: is0 = is / 128, it0 = is / 128 */
+ vector signed int is0 = spu_rlmask(is, -7);
+ vector signed int it0 = spu_rlmask(it, -7);
+
+ /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
+ vector signed int is1 = spu_add(is0, 1);
+ vector signed int it1 = spu_add(it0, 1);
+
+ /* PIPE_TEX_WRAP_REPEAT */
+ is0 = spu_and(is0, tlevel->mask_s);
+ it0 = spu_and(it0, tlevel->mask_t);
+ is1 = spu_and(is1, tlevel->mask_s);
+ it1 = spu_and(it1, tlevel->mask_t);
+
+ /* PIPE_TEX_WRAP_CLAMP */
+ is0 = spu_clamp(is0, tlevel->max_s);
+ it0 = spu_clamp(it0, tlevel->max_t);
+ is1 = spu_clamp(is1, tlevel->max_s);
+ it1 = spu_clamp(it1, tlevel->max_t);
+
+ /* get packed int texels */
+ vector unsigned int texels[16];
+ get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */
+ get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */
+ get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */
+ get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
+
+ /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
+ {
+ static const unsigned char ZERO = 0x80;
+ int i;
+ for (i = 0; i < 16; i++) {
+ texels[i] = spu_shuffle(texels[i], texels[i],
+ ((vector unsigned char) {
+ ZERO, ZERO, ZERO, 1,
+ ZERO, ZERO, ZERO, 2,
+ ZERO, ZERO, ZERO, 3,
+ ZERO, ZERO, ZERO, 0}));
+ }
+ }
+
+ /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
+ vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
+ texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
+ transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
+ transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
+ transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
+ transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
+
+ /* computed weighted colors */
+ vector unsigned int c0, c1, c2, c3, cSum;
+
+ /* red */
+ c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+ colors[0] = spu_convtf(cSum, 22);
+
+ /* green */
+ c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+ colors[1] = spu_convtf(cSum, 22);
+
+ /* blue */
+ c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+ colors[2] = spu_convtf(cSum, 22);
+
+ /* alpha */
+ c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+ colors[3] = spu_convtf(cSum, 22);
+}
+
+
+
+/**
+ * Compute level of detail factor from texcoords.
+ */
+static INLINE float
+compute_lambda_2d(uint unit, vector float s, vector float t)
+{
+ uint baseLevel = 0;
+ float width = spu.texture[unit].level[baseLevel].width;
+ float height = spu.texture[unit].level[baseLevel].width;
+ float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
+ float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
+ float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+ float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+#if 0
+ /* ideal value */
+ float x = dsdx * dsdx + dtdx * dtdx;
+ float y = dsdy * dsdy + dtdy * dtdy;
+ float rho = x > y ? x : y;
+ rho = sqrtf(rho);
+#else
+ /* approximation */
+ dsdx = fabsf(dsdx);
+ dsdy = fabsf(dsdy);
+ dtdx = fabsf(dtdx);
+ dtdy = fabsf(dtdy);
+ float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5;
+#endif
+ float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */
+ return lambda;
+}
+
+
+/**
+ * Blend two sets of colors according to weight.
+ */
+static void
+blend_colors(vector float c0[4], const vector float c1[4], float weight)
+{
+ vector float t = spu_splats(weight);
+ vector float dc0 = spu_sub(c1[0], c0[0]);
+ vector float dc1 = spu_sub(c1[1], c0[1]);
+ vector float dc2 = spu_sub(c1[2], c0[2]);
+ vector float dc3 = spu_sub(c1[3], c0[3]);
+ c0[0] = spu_madd(dc0, t, c0[0]);
+ c0[1] = spu_madd(dc1, t, c0[1]);
+ c0[2] = spu_madd(dc2, t, c0[2]);
+ c0[3] = spu_madd(dc3, t, c0[3]);
+}
+
+
+/**
+ * Texture sampling with level of detail selection and possibly mipmap
+ * interpolation.
+ */
+void
+sample_texture_2d_lod(vector float s, vector float t,
+ uint unit, uint level_ignored, uint face,
+ vector float colors[4])
+{
+ /*
+ * Note that we're computing a lambda/lod here that's used for all
+ * four pixels in the quad.
+ */
+ float lambda = compute_lambda_2d(unit, s, t);
+
+ (void) face;
+ (void) level_ignored;
+
+ /* apply lod bias */
+ lambda += spu.sampler[unit].lod_bias;
+
+ /* clamp */
+ if (lambda < spu.sampler[unit].min_lod)
+ lambda = spu.sampler[unit].min_lod;
+ else if (lambda > spu.sampler[unit].max_lod)
+ lambda = spu.sampler[unit].max_lod;
+
+ if (lambda <= 0.0f) {
+ /* magnify */
+ spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors);
+ }
+ else {
+ /* minify */
+ if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+ /* sample two mipmap levels and interpolate */
+ int level = (int) lambda;
+ if (level > (int) spu.texture[unit].max_level)
+ level = spu.texture[unit].max_level;
+ spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+ if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+ /* sample second mipmap level */
+ float weight = lambda - (float) level;
+ level++;
+ if (level <= (int) spu.texture[unit].max_level) {
+ vector float colors2[4];
+ spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2);
+ blend_colors(colors, colors2, weight);
+ }
+ }
+ }
+ else {
+ /* sample one mipmap level */
+ int level = (int) (lambda + 0.5f);
+ if (level > (int) spu.texture[unit].max_level)
+ level = spu.texture[unit].max_level;
+ spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+ }
+ }
+}
+
+
+/** XXX need a SIMD version of this */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+ /*
+ major axis
+ direction target sc tc ma
+ ---------- ------------------------------- --- --- ---
+ +rx TEXTURE_CUBE_MAP_POSITIVE_X_EXT -rz -ry rx
+ -rx TEXTURE_CUBE_MAP_NEGATIVE_X_EXT +rz -ry rx
+ +ry TEXTURE_CUBE_MAP_POSITIVE_Y_EXT +rx +rz ry
+ -ry TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT +rx -rz ry
+ +rz TEXTURE_CUBE_MAP_POSITIVE_Z_EXT +rx -ry rz
+ -rz TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT -rx -ry rz
+ */
+ const float arx = fabsf(rx);
+ const float ary = fabsf(ry);
+ const float arz = fabsf(rz);
+ unsigned face;
+ float sc, tc, ma;
+
+ if (arx > ary && arx > arz) {
+ if (rx >= 0.0F) {
+ face = PIPE_TEX_FACE_POS_X;
+ sc = -rz;
+ tc = -ry;
+ ma = arx;
+ }
+ else {
+ face = PIPE_TEX_FACE_NEG_X;
+ sc = rz;
+ tc = -ry;
+ ma = arx;
+ }
+ }
+ else if (ary > arx && ary > arz) {
+ if (ry >= 0.0F) {
+ face = PIPE_TEX_FACE_POS_Y;
+ sc = rx;
+ tc = rz;
+ ma = ary;
+ }
+ else {
+ face = PIPE_TEX_FACE_NEG_Y;
+ sc = rx;
+ tc = -rz;
+ ma = ary;
+ }
+ }
+ else {
+ if (rz > 0.0F) {
+ face = PIPE_TEX_FACE_POS_Z;
+ sc = rx;
+ tc = -ry;
+ ma = arz;
+ }
+ else {
+ face = PIPE_TEX_FACE_NEG_Z;
+ sc = -rx;
+ tc = -ry;
+ ma = arz;
+ }
+ }
+
+ *newS = (sc / ma + 1.0F) * 0.5F;
+ *newT = (tc / ma + 1.0F) * 0.5F;
+
+ return face;
+}
+
+
+
+void
+sample_texture_cube(vector float s, vector float t, vector float r,
+ uint unit, vector float colors[4])
+{
+ uint p, faces[4], level = 0;
+ float newS[4], newT[4];
+
+ /* Compute cube faces referenced by the four sets of texcoords.
+ * XXX we should SIMD-ize this.
+ */
+ for (p = 0; p < 4; p++) {
+ float rx = spu_extract(s, p);
+ float ry = spu_extract(t, p);
+ float rz = spu_extract(r, p);
+ faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
+ }
+
+ if (faces[0] == faces[1] &&
+ faces[0] == faces[2] &&
+ faces[0] == faces[3]) {
+ /* GOOD! All four texcoords refer to the same cube face */
+ s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
+ t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
+ spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors);
+ }
+ else {
+ /* BAD! The four texcoords refer to different faces */
+ for (p = 0; p < 4; p++) {
+ vector float c[4];
+
+ spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]),
+ unit, level, faces[p], c);
+
+ float red = spu_extract(c[0], p);
+ float green = spu_extract(c[1], p);
+ float blue = spu_extract(c[2], p);
+ float alpha = spu_extract(c[3], p);
+
+ colors[0] = spu_insert(red, colors[0], p);
+ colors[1] = spu_insert(green, colors[1], p);
+ colors[2] = spu_insert(blue, colors[2], p);
+ colors[3] = spu_insert(alpha, colors[3], p);
+ }
+ }
}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f7c9738be8..7b75b007b5 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -36,12 +36,32 @@ extern void
invalidate_tex_cache(void);
-extern vector float
-sample_texture_nearest(uint unit, vector float texcoord);
+extern void
+sample_texture_2d_nearest(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
+
+
+extern void
+sample_texture_2d_bilinear(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
+
+extern void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
+
+extern void
+sample_texture_2d_lod(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
-extern vector float
-sample_texture_bilinear(uint unit, vector float texcoord);
+
+extern void
+sample_texture_cube(vector float s, vector float t, vector float r,
+ uint unit, vector float colors[4]);
#endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
index 216a33126b..6905015a48 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.c
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -87,3 +87,40 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
0 /* rid */);
}
+
+/**
+ * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
+ * tiles back to the main framebuffer.
+ */
+void
+really_clear_tiles(uint surfaceIndex)
+{
+ const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+ uint i;
+
+ if (surfaceIndex == 0) {
+ clear_c_tile(&spu.ctile);
+
+ for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+ uint tx = i % spu.fb.width_tiles;
+ uint ty = i / spu.fb.width_tiles;
+ if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+ put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+ }
+ }
+ }
+ else {
+ clear_z_tile(&spu.ztile);
+
+ for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+ uint tx = i % spu.fb.width_tiles;
+ uint ty = i / spu.fb.width_tiles;
+ if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+ put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
+ }
+ }
+
+#if 0
+ wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
index 1b5491112d..7bfb52be8f 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.h
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -36,12 +36,14 @@
-void
+extern void
get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
-void
+extern void
put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
+extern void
+really_clear_tiles(uint surfaceIndex);
static INLINE void
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 8b93878192..22e51a86ae 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -43,11 +43,6 @@
/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
typedef vector unsigned int mask_t;
-typedef union
-{
- vector float v;
- float f[4];
-} float4;
/**
@@ -91,9 +86,9 @@ struct edge {
struct interp_coef
{
- float4 a0;
- float4 dadx;
- float4 dady;
+ vector float a0;
+ vector float dadx;
+ vector float dady;
};
@@ -116,21 +111,15 @@ struct setup_stage {
struct edge etop;
struct edge emaj;
- float oneoverarea;
+ float oneOverArea; /* XXX maybe make into vector? */
+
+ uint facing;
- uint tx, ty;
+ uint tx, ty; /**< position of current tile (x, y) */
int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
-#if 0
- struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#else
struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#endif
-
-#if 0
- struct quad_header quad;
-#endif
struct {
int left[2]; /**< [0] = row0, [1] = row1 */
@@ -142,118 +131,105 @@ struct setup_stage {
};
-
static struct setup_stage setup;
-
-
-#if 0
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
- return (struct setup_stage *)stage;
-}
-#endif
-
-#if 0
-/**
- * Clip setup.quad against the scissor/surface bounds.
- */
-static INLINE void
-quad_clip(struct setup_stage *setup)
-{
- const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
- const int minx = (int) cliprect->minx;
- const int maxx = (int) cliprect->maxx;
- const int miny = (int) cliprect->miny;
- const int maxy = (int) cliprect->maxy;
-
- if (setup.quad.x0 >= maxx ||
- setup.quad.y0 >= maxy ||
- setup.quad.x0 + 1 < minx ||
- setup.quad.y0 + 1 < miny) {
- /* totally clipped */
- setup.quad.mask = 0x0;
- return;
- }
- if (setup.quad.x0 < minx)
- setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
- if (setup.quad.y0 < miny)
- setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
- if (setup.quad.x0 == maxx - 1)
- setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
- if (setup.quad.y0 == maxy - 1)
- setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-}
-#endif
-
-#if 0
-/**
- * Emit a quad (pass to next stage) with clipping.
- */
-static INLINE void
-clip_emit_quad(struct setup_stage *setup)
-{
- quad_clip(setup);
- if (setup.quad.mask) {
- struct softpipe_context *sp = setup.softpipe;
- sp->quad.first->run(sp->quad.first, &setup.quad);
- }
-}
-#endif
-
/**
* Evaluate attribute coefficients (plane equations) to compute
* attribute values for the four fragments in a quad.
* Eg: four colors will be computed (in AoS format).
*/
static INLINE void
-eval_coeff(uint slot, float x, float y, vector float result[4])
+eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
{
- switch (spu.vertex_info.interp_mode[slot]) {
+ switch (spu.vertex_info.attrib[slot].interp_mode) {
case INTERP_CONSTANT:
result[QUAD_TOP_LEFT] =
result[QUAD_TOP_RIGHT] =
result[QUAD_BOTTOM_LEFT] =
- result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
+ result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
break;
-
case INTERP_LINEAR:
- /* fall-through, for now */
- default:
{
- register vector float dadx = setup.coef[slot].dadx.v;
- register vector float dady = setup.coef[slot].dady.v;
- register vector float topLeft
- = spu_add(setup.coef[slot].a0.v,
- spu_add(spu_mul(spu_splats(x), dadx),
- spu_mul(spu_splats(y), dady)));
+ vector float dadx = setup.coef[slot].dadx;
+ vector float dady = setup.coef[slot].dady;
+ vector float topLeft =
+ spu_add(setup.coef[slot].a0,
+ spu_add(spu_mul(spu_splats(x), dadx),
+ spu_mul(spu_splats(y), dady)));
result[QUAD_TOP_LEFT] = topLeft;
result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
}
+ break;
+ case INTERP_PERSPECTIVE:
+ {
+ vector float dadx = setup.coef[slot].dadx;
+ vector float dady = setup.coef[slot].dady;
+ vector float topLeft =
+ spu_add(setup.coef[slot].a0,
+ spu_add(spu_mul(spu_splats(x), dadx),
+ spu_mul(spu_splats(y), dady)));
+
+ vector float wInv = spu_re(w); /* 1.0 / w */
+
+ result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv);
+ result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv);
+ result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv);
+ result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv);
+ }
+ break;
+ case INTERP_POS:
+ case INTERP_NONE:
+ break;
+ default:
+ ASSERT(0);
}
}
+/**
+ * As above, but return 4 vectors in SOA format.
+ * XXX this will all be re-written someday.
+ */
+static INLINE void
+eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
+{
+ eval_coeff(slot, x, y, w, result);
+ _transpose_matrix4x4(result, result);
+}
+
+
+/** Evalute coefficients to get Z for four pixels in a quad */
static INLINE vector float
eval_z(float x, float y)
{
const uint slot = 0;
- const float dzdx = setup.coef[slot].dadx.f[2];
- const float dzdy = setup.coef[slot].dady.f[2];
- const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+ const float dzdx = spu_extract(setup.coef[slot].dadx, 2);
+ const float dzdy = spu_extract(setup.coef[slot].dady, 2);
+ const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy;
const vector float topLeftv = spu_splats(topLeft);
const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
return spu_add(topLeftv, derivs);
}
+/** Evalute coefficients to get W for four pixels in a quad */
+static INLINE vector float
+eval_w(float x, float y)
+{
+ const uint slot = 0;
+ const float dwdx = spu_extract(setup.coef[slot].dadx, 3);
+ const float dwdy = spu_extract(setup.coef[slot].dady, 3);
+ const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy;
+ const vector float topLeftv = spu_splats(topLeft);
+ const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
+ return spu_add(topLeftv, derivs);
+}
+
+
/**
* Emit a quad (pass to next stage). No clipping is done.
* Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -261,120 +237,59 @@ eval_z(float x, float y)
* overall.
*/
static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
{
/* If any bits in mask are set... */
if (spu_extract(spu_orx(mask), 0)) {
const int ix = x - setup.cliprect_minx;
const int iy = y - setup.cliprect_miny;
- vector float colors[4];
spu.cur_ctile_status = TILE_STATUS_DIRTY;
spu.cur_ztile_status = TILE_STATUS_DIRTY;
- if (spu.texture[0].start) {
- /* texture mapping */
- const uint unit = 0;
- vector float texcoords[4];
- eval_coeff(2, (float) x, (float) y, texcoords);
-
- if (spu_extract(mask, 0))
- colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
- if (spu_extract(mask, 1))
- colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
- if (spu_extract(mask, 2))
- colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
- if (spu_extract(mask, 3))
- colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-
- if (spu.texture[1].start) {
- /* multi-texture mapping */
- const uint unit = 1;
- vector float colors1[4];
-
- eval_coeff(2, (float) x, (float) y, texcoords);
-
- if (spu_extract(mask, 0))
- colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
- if (spu_extract(mask, 1))
- colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
- if (spu_extract(mask, 2))
- colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
- if (spu_extract(mask, 3))
- colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
- /* hack: modulate first texture by second */
- colors[0] = spu_mul(colors[0], colors1[0]);
- colors[1] = spu_mul(colors[1], colors1[1]);
- colors[2] = spu_mul(colors[2], colors1[2]);
- colors[3] = spu_mul(colors[3], colors1[3]);
- }
+ {
+ /*
+ * Run fragment shader, execute per-fragment ops, update fb/tile.
+ */
+ vector float inputs[4*4], outputs[2*4];
+ vector float fragZ = eval_z((float) x, (float) y);
+ vector float fragW = eval_w((float) x, (float) y);
+ vector unsigned int kill_mask;
- }
- else {
- /* simple shading */
+ /* setup inputs */
#if 0
- eval_coeff(1, (float) x, (float) y, colors);
-
+ eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
#else
- /* XXX new fragment program code */
-
- if (spu.fragment_program) {
- vector float inputs[4*4], outputs[2*4];
-
- /* setup inputs */
- eval_coeff(1, (float) x, (float) y, inputs);
-
- /* Execute the current fragment program */
- spu.fragment_program(inputs, outputs, spu.constants);
-
- /* Copy outputs */
- colors[0] = outputs[0*4+0];
- colors[1] = outputs[0*4+1];
- colors[2] = outputs[0*4+2];
- colors[3] = outputs[0*4+3];
-
- if (0 && spu.init.id==0 && y == 48) {
- printf("colors[0] = %f %f %f %f\n",
- spu_extract(colors[0], 0),
- spu_extract(colors[0], 1),
- spu_extract(colors[0], 2),
- spu_extract(colors[0], 3));
- printf("colors[1] = %f %f %f %f\n",
- spu_extract(colors[1], 0),
- spu_extract(colors[1], 1),
- spu_extract(colors[1], 2),
- spu_extract(colors[1], 3));
- }
-
+ uint i;
+ for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+ eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
}
#endif
- }
-
-
- {
- /* Convert fragment data from AoS to SoA format.
- * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
- * This is temporary!
+ ASSERT(spu.fragment_program);
+ ASSERT(spu.fragment_ops);
+
+ /* Execute the current fragment program */
+ kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
+
+ mask = spu_andc(mask, kill_mask);
+
+ /* Execute per-fragment/quad operations, including:
+ * alpha test, z test, stencil test, blend and framebuffer writing.
+ * Note that there are two different fragment operations functions
+ * that can be called, one for front-facing fragments, and one
+ * for back-facing fragments. (Often the two are the same;
+ * but in some cases, like two-sided stenciling, they can be
+ * very different.) So choose the correct function depending
+ * on the calculated facing.
*/
- vector float soa_frag[4];
- _transpose_matrix4x4(soa_frag, colors);
-
- float4 fragZ;
-
- fragZ.v = eval_z((float) x, (float) y);
-
- /* Do all per-fragment/quad operations here, including:
- * alpha test, z test, stencil test, blend and framebuffer writing.
- */
- spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
- fragZ.v,
- soa_frag[0], soa_frag[1],
- soa_frag[2], soa_frag[3],
+ spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
+ fragZ,
+ outputs[0*4+0],
+ outputs[0*4+1],
+ outputs[0*4+2],
+ outputs[0*4+3],
mask);
}
-
}
}
@@ -383,7 +298,8 @@ emit_quad( int x, int y, mask_t mask )
* Given an X or Y coordinate, return the block/quad coordinate that it
* belongs to.
*/
-static INLINE int block( int x )
+static INLINE int
+block(int x)
{
return x & ~1;
}
@@ -394,7 +310,8 @@ static INLINE int block( int x )
* the triangle's bounds.
* The mask is a uint4 vector and each element will be 0 or 0xffffffff.
*/
-static INLINE mask_t calculate_mask( int x )
+static INLINE mask_t
+calculate_mask(int x)
{
/* This is a little tricky.
* Use & instead of && to avoid branches.
@@ -412,7 +329,8 @@ static INLINE mask_t calculate_mask( int x )
/**
* Render a horizontal span of quads
*/
-static void flush_spans( void )
+static void
+flush_spans(void)
{
int minleft, maxright;
int x;
@@ -440,7 +358,6 @@ static void flush_spans( void )
return;
}
-
/* OK, we're very likely to need the tile data now.
* clear or finish waiting if needed.
*/
@@ -457,7 +374,7 @@ static void flush_spans( void )
}
ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
/* wait for mfc_get() to complete */
//printf("SPU: %u: waiting for ztile\n", spu.init.id);
@@ -476,9 +393,7 @@ static void flush_spans( void )
* calculate_mask() could be simplified a bit...
*/
for (x = block(minleft); x <= block(maxright); x += 2) {
-#if 1
- emit_quad( x, setup.span.y, calculate_mask( x ) );
-#endif
+ emit_quad( x, setup.span.y, calculate_mask( x ));
}
setup.span.y = 0;
@@ -487,33 +402,46 @@ static void flush_spans( void )
setup.span.right[1] = 0;
}
+
#if DEBUG_VERTS
-static void print_vertex(const struct vertex_header *v)
+static void
+print_vertex(const struct vertex_header *v)
{
- int i;
- fprintf(stderr, "Vertex: (%p)\n", v);
- for (i = 0; i < setup.quad.nr_attrs; i++) {
- fprintf(stderr, " %d: %f %f %f %f\n", i,
- v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
+ uint i;
+ fprintf(stderr, " Vertex: (%p)\n", v);
+ for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+ fprintf(stderr, " %d: %f %f %f %f\n", i,
+ spu_extract(v->data[i], 0),
+ spu_extract(v->data[i], 1),
+ spu_extract(v->data[i], 2),
+ spu_extract(v->data[i], 3));
}
}
#endif
-static boolean setup_sort_vertices(const struct vertex_header *v0,
- const struct vertex_header *v1,
- const struct vertex_header *v2)
+/**
+ * Sort vertices from top to bottom.
+ * Compute area and determine front vs. back facing.
+ * Do coarse clip test against tile bounds
+ * \return FALSE if tri is totally outside tile, TRUE otherwise
+ */
+static boolean
+setup_sort_vertices(const struct vertex_header *v0,
+ const struct vertex_header *v1,
+ const struct vertex_header *v2)
{
+ float area, sign;
#if DEBUG_VERTS
- fprintf(stderr, "Triangle:\n");
- print_vertex(v0);
- print_vertex(v1);
- print_vertex(v2);
+ if (spu.init.id==0) {
+ fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
+ print_vertex(v0);
+ print_vertex(v1);
+ print_vertex(v2);
+ }
#endif
- setup.vprovoke = v2;
-
/* determine bottom to top order of vertices */
{
float y0 = spu_extract(v0->data[0], 1);
@@ -525,18 +453,21 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
setup.vmin = v0;
setup.vmid = v1;
setup.vmax = v2;
+ sign = -1.0f;
}
else if (y2 <= y0) {
/* y2<=y0<=y1 */
setup.vmin = v2;
setup.vmid = v0;
setup.vmax = v1;
+ sign = -1.0f;
}
else {
/* y0<=y2<=y1 */
setup.vmin = v0;
setup.vmid = v2;
setup.vmax = v1;
+ sign = 1.0f;
}
}
else {
@@ -545,18 +476,21 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
setup.vmin = v1;
setup.vmid = v0;
setup.vmax = v2;
+ sign = 1.0f;
}
else if (y2 <= y1) {
/* y2<=y1<=y0 */
setup.vmin = v2;
setup.vmid = v1;
setup.vmax = v0;
+ sign = 1.0f;
}
else {
/* y1<=y2<=y0 */
setup.vmin = v1;
setup.vmid = v2;
setup.vmax = v0;
+ sign = -1.0f;
}
}
}
@@ -585,31 +519,23 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
/*
* Compute triangle's area. Use 1/area to compute partial
* derivatives of attributes later.
- *
- * The area will be the same as prim->det, but the sign may be
- * different depending on how the vertices get sorted above.
- *
- * To determine whether the primitive is front or back facing we
- * use the prim->det value because its sign is correct.
*/
- {
- const float area = (setup.emaj.dx * setup.ebot.dy -
- setup.ebot.dx * setup.emaj.dy);
-
- setup.oneoverarea = 1.0f / area;
- /*
- _mesa_printf("%s one-over-area %f area %f det %f\n",
- __FUNCTION__, setup.oneoverarea, area, prim->det );
- */
- }
+ area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
-#if 0
- /* We need to know if this is a front or back-facing triangle for:
- * - the GLSL gl_FrontFacing fragment attribute (bool)
- * - two-sided stencil test
+ setup.oneOverArea = 1.0f / area;
+
+ /* The product of area * sign indicates front/back orientation (0/1).
+ * Just in case someone gets the bright idea of switching the front
+ * and back constants without noticing that we're assuming their
+ * values in this operation, also assert that the values are
+ * what we think they are.
*/
- setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
-#endif
+ ASSERT(CELL_FACING_FRONT == 0);
+ ASSERT(CELL_FACING_BACK == 1);
+ setup.facing = (area * sign > 0.0f)
+ ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
+
+ setup.vprovoke = v2;
return TRUE;
}
@@ -622,63 +548,11 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
* \param slot which attribute slot
*/
static INLINE void
-const_coeff(uint slot)
+const_coeff4(uint slot)
{
- setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
- setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
- setup.coef[slot].a0.v = setup.vprovoke->data[slot];
-}
-
-
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static INLINE void
-tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
-{
- uint i;
- const float *vmin_d = (float *) &setup.vmin->data[slot];
- const float *vmid_d = (float *) &setup.vmid->data[slot];
- const float *vmax_d = (float *) &setup.vmax->data[slot];
- const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
- const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
-
- for (i = firstComp; i < lastComp; i++) {
- float botda = vmid_d[i] - vmin_d[i];
- float majda = vmax_d[i] - vmin_d[i];
- float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
- float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-
- ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
-
- setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
- setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-
- /* calculate a0 as the value which would be sampled for the
- * fragment at (0,0), taking into account that we want to sample at
- * pixel centers, in other words (0.5, 0.5).
- *
- * this is neat but unfortunately not a good way to do things for
- * triangles with very large values of dadx or dady as it will
- * result in the subtraction and re-addition from a0 of a very
- * large number, which means we'll end up loosing a lot of the
- * fractional bits and precision from a0. the way to fix this is
- * to define a0 as the sample at a pixel center somewhere near vmin
- * instead - i'll switch to this later.
- */
- setup.coef[slot].a0.f[i] = (vmin_d[i] -
- (setup.coef[slot].dadx.f[i] * x +
- setup.coef[slot].dady.f[i] * y));
- }
-
- /*
- _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
- slot, "xyzw"[i],
- setup.coef[slot].a0[i],
- setup.coef[slot].dadx.f[i],
- setup.coef[slot].dady.f[i]);
- */
+ setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
+ setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
+ setup.coef[slot].a0 = setup.vprovoke->data[slot];
}
@@ -702,18 +576,16 @@ tri_linear_coeff4(uint slot)
vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
spu_mul(majda, spu_splats(setup.ebot.dx)));
- setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
- setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+ setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+ setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
- vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
- vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+ vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+ vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
- setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+ setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
}
-
-#if 0
/**
* Compute a0, dadx and dady for a perspective-corrected interpolant,
* for a triangle.
@@ -722,82 +594,76 @@ tri_linear_coeff4(uint slot)
* Later, when we compute the value at a particular fragment position we'll
* divide the interpolated value by the interpolated W at that fragment.
*/
-static void tri_persp_coeff( unsigned slot,
- unsigned i )
+static void
+tri_persp_coeff4(uint slot)
{
- /* premultiply by 1/w:
- */
- float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
- float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
- float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
-
- float botda = mida - mina;
- float majda = maxa - mina;
- float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
- float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-
- /*
- printf("tri persp %d,%d: %f %f %f\n", slot, i,
- setup.vmin->data[slot][i],
- setup.vmid->data[slot][i],
- setup.vmax->data[slot][i]
- );
- */
+ const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+ const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+ const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
+ const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
+ const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
+
+ vector float vmin_d = setup.vmin->data[slot];
+ vector float vmid_d = setup.vmid->data[slot];
+ vector float vmax_d = setup.vmax->data[slot];
+
+ vmin_d = spu_mul(vmin_d, vmin_w);
+ vmid_d = spu_mul(vmid_d, vmid_w);
+ vmax_d = spu_mul(vmax_d, vmax_w);
+
+ vector float botda = vmid_d - vmin_d;
+ vector float majda = vmax_d - vmin_d;
- assert(slot < PIPE_MAX_SHADER_INPUTS);
- assert(i <= 3);
+ vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+ spu_mul(botda, spu_splats(setup.emaj.dy)));
+ vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+ spu_mul(majda, spu_splats(setup.ebot.dx)));
- setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
- setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
- setup.coef[slot].a0.f[i] = (mina -
- (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) +
- setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+ setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+ setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
+
+ vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+ vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
+
+ setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
}
-#endif
+
/**
* Compute the setup.coef[] array dadx, dady, a0 values.
* Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
*/
-static void setup_tri_coefficients(void)
+static void
+setup_tri_coefficients(void)
{
-#if 1
uint i;
for (i = 0; i < spu.vertex_info.num_attribs; i++) {
- switch (spu.vertex_info.interp_mode[i]) {
+ switch (spu.vertex_info.attrib[i].interp_mode) {
case INTERP_NONE:
break;
- case INTERP_POS:
- /*tri_linear_coeff(i, 2, 3);*/
- /* XXX interp W if PERSPECTIVE... */
- tri_linear_coeff4(i);
- break;
case INTERP_CONSTANT:
- const_coeff(i);
+ const_coeff4(i);
break;
+ case INTERP_POS:
+ /* fall-through */
case INTERP_LINEAR:
tri_linear_coeff4(i);
break;
case INTERP_PERSPECTIVE:
- tri_linear_coeff4(i); /* temporary */
+ tri_persp_coeff4(i);
break;
default:
ASSERT(0);
}
}
-#else
- ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
- ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
- spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
- tri_linear_coeff(0, 2, 3); /* slot 0, z */
- tri_linear_coeff(1, 0, 4); /* slot 1, color */
-#endif
}
-static void setup_tri_edges(void)
+static void
+setup_tri_edges(void)
{
float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
@@ -827,9 +693,8 @@ static void setup_tri_edges(void)
* Render the upper or lower half of a triangle.
* Scissoring/cliprect is applied here too.
*/
-static void subtriangle( struct edge *eleft,
- struct edge *eright,
- unsigned lines )
+static void
+subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
{
const int minx = setup.cliprect_minx;
const int maxx = setup.cliprect_maxx;
@@ -902,7 +767,8 @@ static void subtriangle( struct edge *eleft,
* The tile data should have already been fetched.
*/
boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2,
+ uint tx, uint ty)
{
setup.tx = tx;
setup.ty = ty;
@@ -926,19 +792,14 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
setup.span.y_flags = 0;
setup.span.right[0] = 0;
setup.span.right[1] = 0;
- /* setup.span.z_mode = tri_z_mode( setup.ctx ); */
- /* init_constant_attribs( setup ); */
-
- if (setup.oneoverarea < 0.0) {
- /* emaj on left:
- */
+ if (setup.oneOverArea < 0.0) {
+ /* emaj on left */
subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
}
else {
- /* emaj on right:
- */
+ /* emaj on right */
subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
}