From 9d00cd3fc726a3fe01b98fd222dd4c71b3e95d44 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 10:15:11 -0600
Subject: cell: move command processing code into new spu_command.c file

---
 src/gallium/drivers/cell/spu/spu_command.c | 599 +++++++++++++++++++++++++++++
 1 file changed, 599 insertions(+)
 create mode 100644 src/gallium/drivers/cell/spu/spu_command.c

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
new file mode 100644
index 0000000000..ec9da5d887
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -0,0 +1,599 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU command processing code
+ */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "spu_debug.h"
+#include "cell/common.h"
+
+
+struct spu_vs_context draw;
+
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
+    ALIGN16_ATTRIB;
+
+
+
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writting a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const uint status[4] ALIGN16_ATTRIB
+      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
+
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+static void
+cmd_clear_surface(const struct cell_command_clear_surface *clear)
+{
+   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+
+   if (clear->surface == 0) {
+      spu.fb.color_clear_value = clear->value;
+      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+            (spu.init.id << 20) | (spu.init.id << 28);
+         spu.fb.color_clear_value ^= x;
+      }
+   }
+   else {
+      spu.fb.depth_clear_value = clear->value;
+   }
+
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+   /* Simply set all tiles' status to CLEAR.
+    * When we actually begin rendering into a tile, we'll initialize it to
+    * the clear value.  If any tiles go untouched during the frame,
+    * really_clear_tiles() will set them to the clear value.
+    */
+   if (clear->surface == 0) {
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
+   }
+   else {
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
+   }
+
+#else
+
+   /*
+    * This path clears the whole framebuffer to the clear color right now.
+    */
+
+   /*
+   printf("SPU: %s num=%d w=%d h=%d\n",
+          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
+   */
+
+   /* init a single tile to the clear value */
+   if (clear->surface == 0) {
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+   }
+
+   /* walk over my tiles, writing the 'clear' tile's data */
+   {
+      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+      uint i;
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (clear->surface == 0)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         else
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+      wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   }
+
+#endif /* CLEAR_OPT */
+
+   DEBUG_PRINTF("CLEAR SURF done\n");
+}
+
+
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
+   release_buffer(release->vertex_buf);
+}
+
+
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   static int warned = 0;
+
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Copy state info (for fallback case only) */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
+   /* Parity twist!  For now, always use the fallback code by default,
+    * only switching to codegen when specifically requested.  This
+    * allows us to develop freely without risking taking down the
+    * branch.
+    *
+    * Later, the parity of this check will be reversed, so that
+    * codegen is *always* used, unless we specifically indicate that
+    * we don't want it.
+    *
+    * Eventually, the option will be removed completely, because in
+    * final code we'll always use codegen and won't even provide the
+    * raw state records that the fallback code requires.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
+      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   }
+   else {
+      /* otherwise, the default fallback code remains in place */
+      if (!warned) {
+         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+         warned = 1;
+      }
+   }
+
+   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+}
+
+
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_program_code, fp->code,
+          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+   /* Point function pointer at new code */
+   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
+static void
+cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
+{
+   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+             cmd->width,
+             cmd->height,
+             cmd->color_start,
+             cmd->color_format,
+             cmd->depth_format);
+
+   ASSERT_ALIGN16(cmd->color_start);
+   ASSERT_ALIGN16(cmd->depth_start);
+
+   spu.fb.color_start = cmd->color_start;
+   spu.fb.depth_start = cmd->depth_start;
+   spu.fb.color_format = cmd->color_format;
+   spu.fb.depth_format = cmd->depth_format;
+   spu.fb.width = cmd->width;
+   spu.fb.height = cmd->height;
+   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
+   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
+
+   switch (spu.fb.depth_format) {
+   case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
+      break;
+   default:
+      spu.fb.zsize = 0;
+      break;
+   }
+}
+
+
+static void
+cmd_state_sampler(const struct cell_command_sampler *sampler)
+{
+   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+
+   spu.sampler[sampler->unit] = sampler->state;
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
+   else
+      spu.sample_texture[sampler->unit] = sample_texture_nearest;
+}
+
+
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   const uint unit = texture->unit;
+   const uint width = texture->width;
+   const uint height = texture->height;
+
+   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
+             texture->unit, texture->start,
+             texture->width, texture->height);
+
+   spu.texture[unit].start = texture->start;
+   spu.texture[unit].width = width;
+   spu.texture[unit].height = height;
+
+   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+
+   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
+   spu.texture[unit].tex_size_mask = (vector unsigned int)
+         { width - 1, height - 1, 0, 0 };
+   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
+   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+}
+
+
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
+   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
+}
+
+
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned attr = vs_info->attr;
+
+   ASSERT(attr < PIPE_MAX_ATTRIBS);
+   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+   draw.vertex_fetch.size[attr] = vs_info->size;
+   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
+   draw.vertex_fetch.dirty = 1;
+}
+
+
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
+static void
+cmd_finish(void)
+{
+   DEBUG_PRINTF("FINISH\n");
+   really_clear_tiles(0);
+   /* wait for all outstanding DMAs to finish */
+   mfc_write_tag_mask(~0);
+   mfc_read_tag_status_all();
+   /* send mbox message to PPU */
+   spu_write_out_mbox(CELL_CMD_FINISH);
+}
+
+
+/**
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
+ * The opcode param encodes the location of the buffer and its size.
+ */
+static void
+cmd_batch(uint opcode)
+{
+   const uint buf = (opcode >> 8) & 0xff;
+   uint size = (opcode >> 16);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
+   uint pos;
+
+   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+             buf, size, spu.init.buffers[buf]);
+
+   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   size = ROUNDUP16(size);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   mfc_get(buffer,  /* dest */
+           (unsigned int) spu.init.buffers[buf],  /* src */
+           size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   /* Tell PPU we're done copying the buffer to local store */
+   DEBUG_PRINTF("release batch buf %u\n", buf);
+   release_buffer(buf);
+
+   /*
+    * Loop over commands in the batch buffer
+    */
+   for (pos = 0; pos < usize; /* no incr */) {
+      switch (buffer[pos]) {
+      /*
+       * rendering commands
+       */
+      case CELL_CMD_CLEAR_SURFACE:
+         {
+            struct cell_command_clear_surface *clr
+               = (struct cell_command_clear_surface *) &buffer[pos];
+            cmd_clear_surface(clr);
+            pos += sizeof(*clr) / 8;
+         }
+         break;
+      case CELL_CMD_RENDER:
+         {
+            struct cell_command_render *render
+               = (struct cell_command_render *) &buffer[pos];
+            uint pos_incr;
+            cmd_render(render, &pos_incr);
+            pos += pos_incr;
+         }
+         break;
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
+         {
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            pos += sizeof(*fops) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+         {
+            struct cell_command_fragment_program *fp
+               = (struct cell_command_fragment_program *) &buffer[pos];
+            cmd_state_fragment_program(fp);
+            pos += sizeof(*fp) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_SAMPLER:
+         {
+            struct cell_command_sampler *sampler
+               = (struct cell_command_sampler *) &buffer[pos];
+            cmd_state_sampler(sampler);
+            pos += sizeof(*sampler) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_TEXTURE:
+         {
+            struct cell_command_texture *texture
+               = (struct cell_command_texture *) &buffer[pos];
+            cmd_state_texture(texture);
+            pos += sizeof(*texture) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_VERTEX_INFO:
+         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         break;
+      case CELL_CMD_STATE_UNIFORMS:
+         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+         pos += 2;
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+         break;
+      case CELL_CMD_STATE_BIND_VS:
+#if 0
+         spu_bind_vertex_shader(&draw,
+                                (struct cell_shader_info *) &buffer[pos+1]);
+#endif
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         break;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         break;
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
+         break;
+      case CELL_CMD_FLUSH_BUFFER_RANGE: {
+	 struct cell_buffer_range *br = (struct cell_buffer_range *)
+	     &buffer[pos+1];
+
+	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+	 break;
+      }
+      default:
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+         ASSERT(0);
+         break;
+      }
+   }
+
+   DEBUG_PRINTF("BATCH complete\n");
+}
+
+
+
+/**
+ * Main loop for SPEs: Get a command, execute it, repeat.
+ */
+void
+command_loop(void)
+{
+   struct cell_command cmd;
+   int exitFlag = 0;
+
+   DEBUG_PRINTF("Enter command loop\n");
+
+   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
+   ASSERT_ALIGN16(&cmd);
+
+   while (!exitFlag) {
+      unsigned opcode;
+      int tag = 0;
+
+      DEBUG_PRINTF("Wait for cmd...\n");
+
+      /* read/wait from mailbox */
+      opcode = (unsigned int) spu_read_in_mbox();
+
+      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
+
+      /* command payload */
+      mfc_get(&cmd,  /* dest */
+              (unsigned int) spu.init.cmd, /* src */
+              sizeof(struct cell_command), /* bytes */
+              tag,
+              0, /* tid */
+              0  /* rid */);
+      wait_on_mask( 1 << tag );
+
+      /*
+       * NOTE: most commands should be contained in a batch buffer
+       */
+
+      switch (opcode & CELL_CMD_OPCODE_MASK) {
+      case CELL_CMD_EXIT:
+         DEBUG_PRINTF("EXIT\n");
+         exitFlag = 1;
+         break;
+      case CELL_CMD_VS_EXECUTE:
+#if 0
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
+         break;
+      case CELL_CMD_BATCH:
+         cmd_batch(opcode);
+         break;
+      default:
+         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
+      }
+
+   }
+
+   DEBUG_PRINTF("Exit command loop\n");
+
+   spu_dcache_report();
+}
-- 
cgit v1.2.3


From 800c350d71132bbb5126bd89310df540332978f4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:14:27 -0600
Subject: cell: add support for fragment shader constant buffers

---
 src/gallium/drivers/cell/common.h                |  1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 10 +++++++++-
 src/gallium/drivers/cell/ppu/cell_state.h        |  5 +++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c   | 19 +++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_state_shader.c |  5 ++++-
 src/gallium/drivers/cell/spu/spu_command.c       | 22 ++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_main.h          |  8 +++++---
 7 files changed, 63 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c223bc1744..d261c1a640 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -94,6 +94,7 @@
 #define CELL_CMD_STATE_BIND_VS       18
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
+#define CELL_CMD_STATE_FS_CONSTANTS  21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 131a2356fe..3065869d04 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -215,7 +215,15 @@ get_src_reg(struct codegen *gen,
          reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
          break;
       case TGSI_FILE_CONSTANT:
-         /* xxx fall-through for now / fix */
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->constants_reg, offset);
+         }
+         break;
       default:
          assert(0);
       }
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index a7771a55a3..b193170f9c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -44,8 +44,9 @@
 #define CELL_NEW_TEXTURE       0x800
 #define CELL_NEW_VERTEX        0x1000
 #define CELL_NEW_VS            0x2000
-#define CELL_NEW_CONSTANTS     0x4000
-#define CELL_NEW_VERTEX_INFO   0x8000
+#define CELL_NEW_VS_CONSTANTS  0x4000
+#define CELL_NEW_FS_CONSTANTS  0x8000
+#define CELL_NEW_VERTEX_INFO   0x10000
 
 
 extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index a36fd3a601..cbfa393cfb 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,6 +25,7 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "cell_context.h"
 #include "cell_gen_fragment.h"
@@ -162,6 +163,24 @@ cell_emit_state(struct cell_context *cell)
       }
    }
 
+   if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+      const uint shader = PIPE_SHADER_FRAGMENT;
+      const uint num_const = cell->constants[shader].size / sizeof(float);
+      uint i, j;
+      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+      uint64_t *ibuf = (uint64_t *) buf;
+      const float *constants = pipe_buffer_map(cell->pipe.screen,
+                                               cell->constants[shader].buffer,
+                                               PIPE_BUFFER_USAGE_CPU_READ);
+      ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+      ibuf[1] = num_const;
+      j = 4;
+      for (i = 0; i < num_const; i++) {
+         buf[j++] = constants[i];
+      }
+      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+   }
+
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 3a0d066da2..54a17eaf2b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -197,7 +197,10 @@ cell_set_constant_buffer(struct pipe_context *pipe,
                         buf->buffer);
    cell->constants[shader].size = buf->size;
 
-   cell->dirty |= CELL_NEW_CONSTANTS;
+   if (shader == PIPE_SHADER_VERTEX)
+      cell->dirty |= CELL_NEW_VS_CONSTANTS;
+   else if (shader == PIPE_SHADER_FRAGMENT)
+      cell->dirty |= CELL_NEW_FS_CONSTANTS;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ec9da5d887..91a4c137e7 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -231,6 +231,25 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 }
 
 
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+   const uint num_const = buffer[pos + 1];
+   const float *constants = (const float *) &buffer[pos + 2];
+   uint i;
+
+   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+
+   /* Expand each float to float[4] for SOA execution */
+   for (i = 0; i < num_const; i++) {
+      spu.constants[i] = spu_splats(constants[i]);
+   }
+
+   /* return new buffer pos (in 8-byte words) */
+   return pos + 2 + num_const / 2;
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -456,6 +475,9 @@ cmd_batch(uint opcode)
             pos += sizeof(*fp) / 8;
          }
          break;
+      case CELL_CMD_STATE_FS_CONSTANTS:
+         pos = cmd_state_fs_constants(buffer, pos);
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 1cd577c23c..82c9c69a3a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,9 @@
 #define MAX_HEIGHT 1024
 
 
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+
+
 /**
  * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
  * The data may be addressed through several different types.
@@ -157,9 +160,8 @@ struct spu_global
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
-   /** Fragment program constants (XXX preliminary/used) */
-#define MAX_CONSTANTS 32
-   vector float constants[MAX_CONSTANTS];
+   /** Fragment program constants */
+   vector float constants[4 * CELL_MAX_CONSTANTS];
 
 } ALIGN16_ATTRIB;
 
-- 
cgit v1.2.3


From 3b07c28dee74c7aa3be5efac8084d610675af291 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 10:55:08 -0600
Subject: cell: do texture sampling/filtering for four pixels at a time.

---
 src/gallium/drivers/cell/spu/spu_command.c |  11 ++-
 src/gallium/drivers/cell/spu/spu_funcs.c   |   4 +
 src/gallium/drivers/cell/spu/spu_main.h    |  19 ++++-
 src/gallium/drivers/cell/spu/spu_texture.c | 125 ++++++++++++++++++++++++++++-
 src/gallium/drivers/cell/spu/spu_texture.h |  12 +++
 5 files changed, 161 insertions(+), 10 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 91a4c137e7..c59be7defd 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,10 +301,14 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
       spu.sample_texture[sampler->unit] = sample_texture_bilinear;
-   else
+      spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
+   }
+   else {
       spu.sample_texture[sampler->unit] = sample_texture_nearest;
+      spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
+   }
 }
 
 
@@ -323,6 +327,9 @@ cmd_state_texture(const struct cell_command_texture *texture)
    spu.texture[unit].width = width;
    spu.texture[unit].height = height;
 
+   spu.texture[unit].width4 = spu_splats((float) width);
+   spu.texture[unit].height4 = spu_splats((float) height);
+
    spu.texture[unit].tiles_per_row = width / TILE_SIZE;
 
    spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 7dd7fcd253..13c234ea2e 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,6 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
 {
    //const uint unit = 0;
    struct vec_4x4 colors;
+#if 0
    vector float coords[4];
 
    coords[0] = s;
@@ -121,6 +122,9 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
    colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
 
    _transpose_matrix4x4(colors.v, colors.v);
+#else
+   spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
+#endif
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 82c9c69a3a..5d14be51c2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -67,6 +67,14 @@ typedef union {
 typedef vector float (*spu_sample_texture_func)(uint unit,
                                                 vector float texcoord);
 
+typedef void (*spu_sample_texture4_func)(vector float s,
+                                         vector float t,
+                                         vector float r,
+                                         vector float q,
+                                         uint unit,
+                                         vector float colors[4]);
+
+
 /** Function for performing per-fragment ops */
 typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       tile_t *colorTile,
@@ -107,10 +115,12 @@ struct spu_texture
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float tex_size;
-   vector unsigned int tex_size_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+   vector float tex_size; /**< == {width, height, 0, 0} */
+   vector float width4;   /**< == {width, width, width, width} */
+   vector float height4;  /**< == {height, height, height, height} */
+   vector unsigned int tex_size_mask; /**< == {width-1, height-1, 0, 0 } */
+   vector unsigned int tex_size_x_mask; /**< splat(width-1) */
+   vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
 
 
@@ -159,6 +169,7 @@ struct spu_global
 
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 117b8a36f8..12e6ed1ba1 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include <transpose_matrix4x4.h>
+
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -91,10 +93,10 @@ static void
 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
    const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);
-   vec_uint4 tile_y = spu_rlmask(y, -5);
-   const qword offset_x = si_andi((qword) x, 0x1f);
-   const qword offset_y = si_andi((qword) y, 0x1f);
+   vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
+   const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
    const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
@@ -132,6 +134,31 @@ sample_texture_nearest(uint unit, vector float texcoord)
 }
 
 
+/**
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, vector float colors[4])
+{
+   vector float ss = spu_mul(s, spu.texture[unit].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].height4);
+   vector unsigned int is = spu_convtu(ss, 0);
+   vector unsigned int it = spu_convtu(tt, 0);
+   vec_uint4 texels[4];
+
+   /* GL_REPEAT wrap mode: */
+   is = spu_and(is, spu.texture[unit].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].tex_size_y_mask);
+
+   get_four_texels(unit, is, it, texels);
+
+   /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
+   spu_unpack_A8R8G8B8_transpose4(texels, colors);
+}
+
+
 vector float
 sample_texture_bilinear(uint unit, vector float texcoord)
 {
@@ -198,3 +225,93 @@ sample_texture_bilinear(uint unit, vector float texcoord)
 
    return texel_sum;
 }
+
+
+void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4])
+{
+   vector float ss = spu_madd(s, spu.texture[unit].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f));
+
+   vector unsigned int is0 = spu_convtu(ss, 0);
+   vector unsigned int it0 = spu_convtu(tt, 0);
+
+   /* is + 1, it + 1 */
+   vector unsigned int is1 = spu_add(is0, 1);
+   vector unsigned int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+
+   /* XXX possibly rework following code to compute the weighted sample
+    * colors with integer arithmetic for fewer int->float conversions.
+    */
+
+   /* convert packed int texels to float colors */
+   vector float ftexels[16];
+   spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
+   spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
+   spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
+   spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
+   vector signed int iss1024 = spu_convts(ss1024, 0);
+   iss1024 = spu_and(iss1024, 1023);
+   vector float sWeights0 = spu_convtf(iss1024, 10);
+
+   vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
+   vector signed int itt1024 = spu_convts(tt1024, 0);
+   itt1024 = spu_and(itt1024, 1023);
+   vector float tWeights0 = spu_convtf(itt1024, 10);
+
+   /* 1 - sWeight and 1 - tWeight */
+   vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
+   vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
+
+   /* reds, for four pixels */
+   ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
+                       spu_add(ftexels[8], ftexels[12]));
+
+   /* greens, for four pixels */
+   ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
+                       spu_add(ftexels[9], ftexels[13]));
+
+   /* blues, for four pixels */
+   ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
+                       spu_add(ftexels[10], ftexels[14]));
+
+   /* alphas, for four pixels */
+   ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
+                       spu_add(ftexels[11], ftexels[15]));
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f7c9738be8..f019e7d8ef 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -40,8 +40,20 @@ extern vector float
 sample_texture_nearest(uint unit, vector float texcoord);
 
 
+extern void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, vector float colors[4]);
+
+
 extern vector float
 sample_texture_bilinear(uint unit, vector float texcoord);
 
 
+extern void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4]);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From c8fb3682619ea49c5fefdf8b88cdb95eac7478ff Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 11:16:04 -0600
Subject: cell: remove old texture code

---
 src/gallium/drivers/cell/spu/spu_command.c |  2 -
 src/gallium/drivers/cell/spu/spu_funcs.c   | 19 -------
 src/gallium/drivers/cell/spu/spu_main.h    |  4 --
 src/gallium/drivers/cell/spu/spu_texture.c | 88 ++----------------------------
 src/gallium/drivers/cell/spu/spu_texture.h |  8 ---
 src/gallium/drivers/cell/spu/spu_tri.c     | 67 +----------------------
 6 files changed, 7 insertions(+), 181 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c59be7defd..d4cc9a2146 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -302,11 +302,9 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    spu.sampler[sampler->unit] = sampler->state;
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
-      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
    else {
-      spu.sample_texture[sampler->unit] = sample_texture_nearest;
       spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 13c234ea2e..4c90b701ee 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -104,27 +104,8 @@ static struct vec_4x4
 spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
-   //const uint unit = 0;
    struct vec_4x4 colors;
-#if 0
-   vector float coords[4];
-
-   coords[0] = s;
-   coords[1] = t;
-   coords[2] = r;
-   coords[3] = q;
-   _transpose_matrix4x4(coords, coords);
-
-   /* get four texture samples */
-   colors.v[0] = spu.sample_texture[unit](unit, coords[0]);
-   colors.v[1] = spu.sample_texture[unit](unit, coords[1]);
-   colors.v[2] = spu.sample_texture[unit](unit, coords[2]);
-   colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
-
-   _transpose_matrix4x4(colors.v, colors.v);
-#else
    spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
-#endif
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 5d14be51c2..2a8cb00f8d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -64,9 +64,6 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef vector float (*spu_sample_texture_func)(uint unit,
-                                                vector float texcoord);
-
 typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
@@ -168,7 +165,6 @@ struct spu_global
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
    spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 12e6ed1ba1..ba62ad27fd 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -120,21 +120,9 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 }
 
 
-/**
- * Get texture sample at texcoord.
- */
-vector float
-sample_texture_nearest(uint unit, vector float texcoord)
-{
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
-   uint texel = get_texel(unit, itc);
-   return spu_unpack_A8R8G8B8(texel);
-}
-
 
 /**
+ * Do nearest texture sampling for four pixels.
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
@@ -148,7 +136,7 @@ sample_texture4_nearest(vector float s, vector float t,
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
-   /* GL_REPEAT wrap mode: */
+   /* PIPE_TEX_WRAP_REPEAT */
    is = spu_and(is, spu.texture[unit].tex_size_x_mask);
    it = spu_and(it, spu.texture[unit].tex_size_y_mask);
 
@@ -159,74 +147,10 @@ sample_texture4_nearest(vector float s, vector float t,
 }
 
 
-vector float
-sample_texture_bilinear(uint unit, vector float texcoord)
-{
-   static const vec_uint4 offset_x = {0, 0, 1, 1};
-   static const vec_uint4 offset_y = {0, 1, 0, 1};
-
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
-
-   /* integer texcoords S,T: */
-   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
-
-   vec_uint4 texels[4];
-   
-   /* setup texcoords for quad:
-    *  +-----+-----+
-    *  |x0,y0|x1,y1|
-    *  +-----+-----+
-    *  |x2,y2|x3,y3|
-    *  +-----+-----+
-    */
-   vec_uint4 x = spu_splats(spu_extract(itc, 0));
-   vec_uint4 y = spu_splats(spu_extract(itc, 1));
-   x = spu_add(x, offset_x);
-   y = spu_add(y, offset_y);
-
-   /* GL_REPEAT wrap mode: */
-   x = spu_and(x, spu.texture[unit].tex_size_x_mask);
-   y = spu_and(y, spu.texture[unit].tex_size_y_mask);
-
-   get_four_texels(unit, x, y, texels);
-
-   /* integer A8R8G8B8 to float texel conversion */
-   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
-   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
-   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
-   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
-
-
-   /* Compute weighting factors in [0,1]
-    * Multiply texcoord by 1024, AND with 1023, convert back to float.
-    */
-   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
-   vector signed int itc1024 = spu_convts(tc1024, 0);
-   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
-   vector float weight = spu_convtf(itc1024, 10);
-
-   /* smeared frac and 1-frac */
-   vector float sfrac = spu_splats(spu_extract(weight, 0));
-   vector float tfrac = spu_splats(spu_extract(weight, 1));
-   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
-   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
-
-   /* multiply the samples (colors) by the S/T weights */
-   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
-   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
-   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
-   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
-
-   /* compute sum of weighted samples */
-   vector float texel_sum = spu_add(texel00, texel01);
-   texel_sum = spu_add(texel_sum, texel10);
-   texel_sum = spu_add(texel_sum, texel11);
-
-   return texel_sum;
-}
-
-
+/**
+ * Do bilinear texture sampling for four pixels.
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f019e7d8ef..d576aed719 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -36,20 +36,12 @@ extern void
 invalidate_tex_cache(void);
 
 
-extern vector float
-sample_texture_nearest(uint unit, vector float texcoord);
-
-
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, vector float colors[4]);
 
 
-extern vector float
-sample_texture_bilinear(uint unit, vector float texcoord);
-
-
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a62d4f0f2f..022d21ba8f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -286,72 +286,7 @@ emit_quad( int x, int y, mask_t mask)
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
-      if (0/*spu.texture[0].start*/) {
-         /*
-          * Temporary texture mapping path
-          * This will go away when fragment programs support TEX inst.
-          */
-         const uint unit = 0;
-         vector float colors[4];
-         vector float texcoords[4];
-         eval_coeff(2, (float) x, (float) y, texcoords);
-
-         if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
-         if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
-         if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
-         if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-
-         if (spu.texture[1].start) {
-            /* multi-texture mapping */
-            const uint unit = 1;
-            vector float colors1[4];
-
-            eval_coeff(2, (float) x, (float) y, texcoords);
-
-            if (spu_extract(mask, 0))
-               colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
-            if (spu_extract(mask, 1))
-               colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
-            if (spu_extract(mask, 2))
-               colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
-            if (spu_extract(mask, 3))
-               colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-            /* hack: modulate first texture by second */
-            colors[0] = spu_mul(colors[0], colors1[0]);
-            colors[1] = spu_mul(colors[1], colors1[1]);
-            colors[2] = spu_mul(colors[2], colors1[2]);
-            colors[3] = spu_mul(colors[3], colors1[3]);
-         }
-
-         {
-            /* Convert fragment data from AoS to SoA format.
-             * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-             * This is temporary!
-             */
-            vector float soa_frag[4];
-            _transpose_matrix4x4(soa_frag, colors);
-
-            vector float fragZ = eval_z((float) x, (float) y);
-
-            /* Do all per-fragment/quad operations here, including:
-             * alpha test, z test, stencil test, blend and framebuffer writing.
-             */
-            spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
-                             fragZ,
-                             soa_frag[0], soa_frag[1],
-                             soa_frag[2], soa_frag[3],
-                             mask,
-                             setup.facing);
-         }
-
-      }
-      else {
+      {
          /*
           * Run fragment shader, execute per-fragment ops, update fb/tile.
           */
-- 
cgit v1.2.3


From b0c136cfb1fcbcea35e17dc699a96acbb24738f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 15:17:01 -0600
Subject: cell: remove old texture-related fields

---
 src/gallium/drivers/cell/spu/spu_command.c | 3 ---
 src/gallium/drivers/cell/spu/spu_main.h    | 2 --
 2 files changed, 5 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d4cc9a2146..64890f6dbd 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -330,9 +330,6 @@ cmd_state_texture(const struct cell_command_texture *texture)
 
    spu.texture[unit].tiles_per_row = width / TILE_SIZE;
 
-   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
-   spu.texture[unit].tex_size_mask = (vector unsigned int)
-         { width - 1, height - 1, 0, 0 };
    spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
    spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2a8cb00f8d..e3960dbe8b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -112,10 +112,8 @@ struct spu_texture
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float tex_size; /**< == {width, height, 0, 0} */
    vector float width4;   /**< == {width, width, width, width} */
    vector float height4;  /**< == {height, height, height, height} */
-   vector unsigned int tex_size_mask; /**< == {width-1, height-1, 0, 0 } */
    vector unsigned int tex_size_x_mask; /**< splat(width-1) */
    vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
-- 
cgit v1.2.3


From 978799beb2a9c51550abb1f37bb6f63d06bc4717 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 16:43:11 -0600
Subject: cell: initial work for mipmap texture filtering

---
 src/gallium/drivers/cell/common.h              |   6 +-
 src/gallium/drivers/cell/ppu/cell_screen.c     |   4 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  18 ++--
 src/gallium/drivers/cell/ppu/cell_texture.c    |  48 ++++++----
 src/gallium/drivers/cell/ppu/cell_texture.h    |   6 +-
 src/gallium/drivers/cell/spu/spu_command.c     |  37 +++++---
 src/gallium/drivers/cell/spu/spu_funcs.c       |   1 +
 src/gallium/drivers/cell/spu/spu_main.h        |   7 +-
 src/gallium/drivers/cell/spu/spu_texture.c     | 120 ++++++++++++++++++-------
 src/gallium/drivers/cell/spu/spu_texture.h     |   6 ++
 10 files changed, 176 insertions(+), 77 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 5dc756023f..e4de9a551d 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -67,6 +67,7 @@
 #define CELL_MAX_SPUS 6
 
 #define CELL_MAX_SAMPLERS 4
+#define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
 
 #define TILE_SIZE 32
 
@@ -251,8 +252,9 @@ struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
    uint unit;
-   void *start;         /**< Address in main memory */
-   ushort width, height;
+   void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
+   ushort width[CELL_MAX_TEXTURE_LEVELS];
+   ushort height[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 47ba6fa290..d223557950 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -76,11 +76,11 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_TEXTURE_SHADOW_MAP:
       return 10;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
       return 8;  /* max 128x128x128 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    default:
       return 10;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index cbfa393cfb..7090b4c99f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -211,14 +211,20 @@ cell_emit_state(struct cell_context *cell)
          texture->opcode = CELL_CMD_STATE_TEXTURE;
          texture->unit = i;
          if (cell->texture[i]) {
-            texture->start = cell->texture[i]->tiled_data;
-            texture->width = cell->texture[i]->base.width[0];
-            texture->height = cell->texture[i]->base.height[0];
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = cell->texture[i]->tiled_data[level];
+               texture->width[level] = cell->texture[i]->base.width[level];
+               texture->height[level] = cell->texture[i]->base.height[level];
+            }
          }
          else {
-            texture->start = NULL;
-            texture->width = 1;
-            texture->height = 1;
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = NULL;
+               texture->width[level] = 1;
+               texture->height[level] = 1;
+            }
          }
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index b6590dfb86..f5f81ac3cc 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -66,6 +66,8 @@ cell_texture_layout(struct cell_texture * spt)
       unsigned size;
       unsigned w_tile, h_tile;
 
+      assert(level < CELL_MAX_TEXTURE_LEVELS);
+
       /* width, height, rounded up to tile size */
       w_tile = align(width, TILE_SIZE);
       h_tile = align(height, TILE_SIZE);
@@ -249,33 +251,41 @@ cell_tile_texture(struct cell_context *cell,
                   struct cell_texture *texture)
 {
    struct pipe_screen *screen = cell->pipe.screen;
-   uint face = 0, level = 0, zslice = 0;
-   struct pipe_surface *surf;
-   const uint w = texture->base.width[0], h = texture->base.height[0];
+   uint face = 0, level, zslice = 0;
    const uint *src;
 
-   /* temporary restrictions: */
-   assert(w >= TILE_SIZE);
-   assert(h >= TILE_SIZE);
-   assert(w % TILE_SIZE == 0);
-   assert(h % TILE_SIZE == 0);
+   for (level = 0; level <= texture->base.last_level; level++) {
+      if (!texture->tiled_data[level]) {
+         struct pipe_surface *surf;
 
-   surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
-                                  PIPE_BUFFER_USAGE_CPU_WRITE);
-   ASSERT(surf);
+         const uint w = texture->base.width[level], h = texture->base.height[level];
 
-   src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
+         if (w < 32 || h < 32)
+            continue;
+         /* temporary restrictions: */
+         assert(w >= TILE_SIZE);
+         assert(h >= TILE_SIZE);
+         assert(w % TILE_SIZE == 0);
+         assert(h % TILE_SIZE == 0);
 
-   if (texture->tiled_data) {
-      align_free(texture->tiled_data);
-   }
-   texture->tiled_data = align_malloc(w * h * 4, 16);
+         surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
+                                        PIPE_BUFFER_USAGE_CPU_WRITE);
+         ASSERT(surf);
+         
+         src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
 
-   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+         if (texture->tiled_data[level]) {
+            align_free(texture->tiled_data[level]);
+         }
+         texture->tiled_data[level] = align_malloc(w * h * 4, 16);
 
-   pipe_surface_unmap(surf);
+         tile_copy_data(w, h, TILE_SIZE, texture->tiled_data[level], src);
 
-   pipe_surface_reference(&surf, NULL);
+         pipe_surface_unmap(surf);
+
+         pipe_surface_reference(&surf, NULL);
+      }
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 6d37e95ebc..6d35736984 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -40,15 +40,15 @@ struct cell_texture
 {
    struct pipe_texture base;
 
-   unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
-   unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
+   unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS];
+   unsigned long stride[CELL_MAX_TEXTURE_LEVELS];
 
    /* The data is held here:
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
+   void *tiled_data[CELL_MAX_TEXTURE_LEVELS];  /* XXX this may be temporary */ /*ALIGN16*/
 };
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 64890f6dbd..089af22415 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,6 +301,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
+#if 0
+   if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      spu.sample_texture4[sampler->unit] = sample_texture4_lod;
+   }
+   else
+#endif
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
@@ -314,24 +320,29 @@ static void
 cmd_state_texture(const struct cell_command_texture *texture)
 {
    const uint unit = texture->unit;
-   const uint width = texture->width;
-   const uint height = texture->height;
+   uint i;
 
-   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
-             texture->unit, texture->start,
-             texture->width, texture->height);
+   DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
-   spu.texture[unit].start = texture->start;
-   spu.texture[unit].width = width;
-   spu.texture[unit].height = height;
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      uint width = texture->width[i];
+      uint height = texture->height[i];
 
-   spu.texture[unit].width4 = spu_splats((float) width);
-   spu.texture[unit].height4 = spu_splats((float) height);
+      DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
+             texture->start[i], texture->width[i], texture->height[i]);
 
-   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+      spu.texture[unit].level[i].start = texture->start[i];
+      spu.texture[unit].level[i].width = width;
+      spu.texture[unit].level[i].height = height;
 
-   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
-   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+      spu.texture[unit].level[i].tiles_per_row = width / TILE_SIZE;
+
+      spu.texture[unit].level[i].width4 = spu_splats((float) width);
+      spu.texture[unit].level[i].height4 = spu_splats((float) height);
+
+      spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
+      spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 4c90b701ee..f2946010bd 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -100,6 +100,7 @@ spu_log2(vector float x)
    return spu_mul(v, k);
 }
 
+
 static struct vec_4x4
 spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e3960dbe8b..9515543efe 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,7 +107,7 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
-struct spu_texture
+struct spu_texture_level
 {
    void *start;
    ushort width, height;
@@ -118,6 +118,11 @@ struct spu_texture
    vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
 
+struct spu_texture
+{
+   struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+} ALIGN16_ATTRIB;
+
 
 /**
  * All SPU global/context state will be in a singleton object of this type:
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 96ef88822a..96c09e3ccb 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -27,6 +27,7 @@
 
 
 #include <transpose_matrix4x4.h>
+#include <math.h>
 
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
@@ -42,11 +43,12 @@
 void
 invalidate_tex_cache(void)
 {
+   uint lvl = 0;
    uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].width
-      * spu.texture[unit].height;
+   uint bytes = 4 * spu.texture[unit].level[lvl].width
+      * spu.texture[unit].level[lvl].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
+   spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
 }
 
 
@@ -64,15 +66,17 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
+                vec_uint4 *texels)
 {
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   const unsigned texture_ea = (uintptr_t) tlevel->start;
    vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
+   const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -104,17 +108,18 @@ sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, vector float colors[4])
 {
-   vector float ss = spu_mul(s, spu.texture[unit].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].height4);
+   const uint lvl = 0;
+   vector float ss = spu_mul(s, spu.texture[unit].level[lvl].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].level[lvl].height4);
    vector unsigned int is = spu_convtu(ss, 0);
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].tex_size_y_mask);
+   is = spu_and(is, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].level[lvl].tex_size_y_mask);
 
-   get_four_texels(unit, is, it, texels);
+   get_four_texels(unit, lvl, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -130,8 +135,9 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4])
 {
-   vector float ss = spu_madd(s, spu.texture[unit].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f));
+   const uint lvl = 0;
+   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, spu_splats(-0.5f));
 
    vector unsigned int is0 = spu_convtu(ss, 0);
    vector unsigned int it0 = spu_convtu(tt, 0);
@@ -141,17 +147,17 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -270,10 +276,11 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4])
 {
+   const uint lvl = 0;
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].height4, half);
+   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
    vector unsigned int is = spu_convtu(ss, 8);
@@ -294,17 +301,17 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -363,3 +370,54 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[3] = spu_convtf(cSum, 24);
 }
+
+
+
+/**
+ * Compute level of detail factor from texcoords.
+ */
+static float
+compute_lambda(uint unit, vector float s, vector float t)
+{
+   uint lvl = 0;
+   float width = spu.texture[unit].level[lvl].width;
+   float height = spu.texture[unit].level[lvl].width;
+   float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
+   float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
+   float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+   float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+   float x = dsdx * dsdx + dtdx * dtdx;
+   float y = dsdy * dsdy + dtdy * dtdy;
+   float rho = x > y ? x : y;
+   rho = sqrtf(rho);
+   float lambda = logf(rho) * 1.442695f;
+   return lambda;
+}
+
+
+
+/**
+ * Texture sampling with level of detail selection.
+ */
+void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, vector float colors[4])
+{
+   float lambda = compute_lambda(unit, s, t);
+
+   if (lambda < spu.sampler[unit].min_lod)
+      lambda = spu.sampler[unit].min_lod;
+   else if (lambda > spu.sampler[unit].max_lod)
+      lambda = spu.sampler[unit].max_lod;
+
+   /* hack for now */
+   int level = (int) lambda;
+   if (level > 3)
+      level = 3;
+
+   /*
+   sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
+   */
+}
+
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 38a17deda2..4802f7c47c 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -53,4 +53,10 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                          uint unit, vector float colors[4]);
 
 
+extern void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, vector float colors[4]);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From f8bddf698d523f597fea0f721b064daee81d8005 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:11:52 -0600
Subject: cell: basic mipmap filtering works now

Though, only GL_MIPMAP_NEAREST / GL_LINEAR works right now.
---
 src/gallium/drivers/cell/spu/spu_command.c |  21 ++++--
 src/gallium/drivers/cell/spu/spu_funcs.c   |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h    |   3 +-
 src/gallium/drivers/cell/spu/spu_texture.c | 106 +++++++++++++++--------------
 src/gallium/drivers/cell/spu/spu_texture.h |   8 +--
 5 files changed, 79 insertions(+), 61 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 089af22415..4e98eea338 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,16 +301,18 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-#if 0
+
    if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      /* use lambda/lod to determine min vs. mag filter */
       spu.sample_texture4[sampler->unit] = sample_texture4_lod;
    }
-   else
-#endif
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+   else if (spu.sampler[sampler->unit].min_img_filter
+            == PIPE_TEX_FILTER_LINEAR) {
+      /* min = mag = bilinear */
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
    else {
+      /* min = mag = inearest */
       spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
    }
 }
@@ -322,8 +324,12 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint unit = texture->unit;
    uint i;
 
+   //if (spu.init.id==0) Debug=1;
+
    DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
+   spu.texture[unit].max_level = 0;
+
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
       uint height = texture->height[i];
@@ -335,14 +341,19 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].width = width;
       spu.texture[unit].level[i].height = height;
 
-      spu.texture[unit].level[i].tiles_per_row = width / TILE_SIZE;
+      spu.texture[unit].level[i].tiles_per_row =
+         (width + TILE_SIZE - 1) / TILE_SIZE;
 
       spu.texture[unit].level[i].width4 = spu_splats((float) width);
       spu.texture[unit].level[i].height4 = spu_splats((float) height);
 
       spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
       spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+
+      if (texture->start[i])
+         spu.texture[unit].max_level = i;
    }
+   //Debug=0;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index f2946010bd..66b82f673d 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,7 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
+   spu.sample_texture4[unit](s, t, r, q, unit, 0, colors.v);
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 9515543efe..cfb645add0 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -68,7 +68,7 @@ typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
                                          vector float q,
-                                         uint unit,
+                                         uint unit, uint level,
                                          vector float colors[4]);
 
 
@@ -121,6 +121,7 @@ struct spu_texture_level
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+   uint max_level;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 96c09e3ccb..10036330c6 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 
-#include <transpose_matrix4x4.h>
 #include <math.h>
 
 #include "pipe/p_compiler.h"
@@ -43,12 +42,14 @@
 void
 invalidate_tex_cache(void)
 {
-   uint lvl = 0;
-   uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].level[lvl].width
-      * spu.texture[unit].level[lvl].height;
+   uint lvl;
+   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+      uint unit = 0;
+      uint bytes = 4 * spu.texture[unit].level[lvl].width
+         * spu.texture[unit].level[lvl].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+      spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+   }
 }
 
 
@@ -71,8 +72,8 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    const unsigned texture_ea = (uintptr_t) tlevel->start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
-   vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
@@ -106,20 +107,19 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, vector float colors[4])
+                        uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
-   vector float ss = spu_mul(s, spu.texture[unit].level[lvl].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].level[lvl].height4);
+   vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
    vector unsigned int is = spu_convtu(ss, 0);
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
 
-   get_four_texels(unit, lvl, is, it, texels);
+   get_four_texels(unit, level, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -133,11 +133,10 @@ sample_texture4_nearest(vector float s, vector float t,
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4])
+                         uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
-   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, spu_splats(-0.5f));
+   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
 
    vector unsigned int is0 = spu_convtu(ss, 0);
    vector unsigned int it0 = spu_convtu(tt, 0);
@@ -147,17 +146,17 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -273,14 +272,13 @@ transpose(vector unsigned int *mOut0,
  */
 void
 sample_texture4_bilinear_2(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, vector float colors[4])
+                           vector float r, vector float q,
+                           uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, half);
+   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
    vector unsigned int is = spu_convtu(ss, 8);
@@ -301,17 +299,17 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -379,9 +377,9 @@ sample_texture4_bilinear_2(vector float s, vector float t,
 static float
 compute_lambda(uint unit, vector float s, vector float t)
 {
-   uint lvl = 0;
-   float width = spu.texture[unit].level[lvl].width;
-   float height = spu.texture[unit].level[lvl].width;
+   uint baseLevel = 0;
+   float width = spu.texture[unit].level[baseLevel].width;
+   float height = spu.texture[unit].level[baseLevel].width;
    float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
    float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
    float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
@@ -402,22 +400,30 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, vector float colors[4])
+                    uint unit, uint level, vector float colors[4])
 {
+   /*
+    * Note that we're computing a lambda/lod here that's used for all
+    * four pixels in the quad.
+    */
    float lambda = compute_lambda(unit, s, t);
 
+   /* apply lod bias */
+   lambda += spu.sampler[unit].lod_bias;
+
+   /* clamp */
    if (lambda < spu.sampler[unit].min_lod)
       lambda = spu.sampler[unit].min_lod;
    else if (lambda > spu.sampler[unit].max_lod)
       lambda = spu.sampler[unit].max_lod;
 
-   /* hack for now */
-   int level = (int) lambda;
-   if (level > 3)
-      level = 3;
+   /* convert to int level */
+   level = (int) (lambda + 0.5f);
+   ASSERT(level >= 0);
+
+   if (level > spu.texture[unit].max_level)
+      level = spu.texture[unit].max_level;
 
-   /*
    sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
-   */
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 4802f7c47c..ec06a50b4a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -39,24 +39,24 @@ invalidate_tex_cache(void);
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, vector float colors[4]);
+                        uint unit, uint level, vector float colors[4]);
 
 
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4]);
+                         uint unit, uint level, vector float colors[4]);
 
 extern void
 sample_texture4_bilinear_2(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4]);
+                           uint unit, uint level, vector float colors[4]);
 
 
 extern void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, vector float colors[4]);
+                    uint unit, uint level, vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 6d2d5ceca21c87bea5e269e8099fb6f1d821b97a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:42:21 -0600
Subject: cell: use minify vs magnify filters

---
 src/gallium/drivers/cell/spu/spu_command.c | 50 +++++++++++++++++++++++-------
 src/gallium/drivers/cell/spu/spu_main.h    |  2 ++
 src/gallium/drivers/cell/spu/spu_texture.c | 22 +++++++------
 3 files changed, 53 insertions(+), 21 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 4e98eea338..fa78377c66 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -298,22 +298,48 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
-   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+   uint unit = sampler->unit;
 
-   spu.sampler[sampler->unit] = sampler->state;
+   DEBUG_PRINTF("SAMPLER [%u]\n", unit);
 
-   if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
-      /* use lambda/lod to determine min vs. mag filter */
-      spu.sample_texture4[sampler->unit] = sample_texture4_lod;
+   spu.sampler[unit] = sampler->state;
+
+   switch (spu.sampler[unit].min_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.min_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
    }
-   else if (spu.sampler[sampler->unit].min_img_filter
-            == PIPE_TEX_FILTER_LINEAR) {
-      /* min = mag = bilinear */
-      spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
+
+   switch (spu.sampler[sampler->unit].mag_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
    }
-   else {
-      /* min = mag = inearest */
-      spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
+
+   switch (spu.sampler[sampler->unit].min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NEAREST:
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      spu.sample_texture4[unit] = sample_texture4_lod;
+      break;
+   case PIPE_TEX_MIPFILTER_NONE:
+      spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+      break;
+   default:
+      ASSERT(0);
    }
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index cfb645add0..56aac655e9 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -170,6 +170,8 @@ struct spu_global
 
    /** Current texture sampler function */
    spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 10036330c6..267f2302f6 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -400,7 +400,7 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level, vector float colors[4])
+                    uint unit, uint level_ignored, vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -417,13 +417,17 @@ sample_texture4_lod(vector float s, vector float t,
    else if (lambda > spu.sampler[unit].max_lod)
       lambda = spu.sampler[unit].max_lod;
 
-   /* convert to int level */
-   level = (int) (lambda + 0.5f);
-   ASSERT(level >= 0);
-
-   if (level > spu.texture[unit].max_level)
-      level = spu.texture[unit].max_level;
-
-   sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
+   if (lambda <= 0.0f) {
+      /* magnify */
+      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
+   }
+   else {
+      /* minify */
+      int level = (int) (lambda + 0.5f);
+      if (level > (int) spu.texture[unit].max_level)
+         level = spu.texture[unit].max_level;
+      spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
+      /* XXX to do: mipmap level interpolation */
+   }
 }
 
-- 
cgit v1.2.3


From 85dc1aec9c5fc63a01bb8db07215b84790d15d8f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 15:19:01 -0600
Subject: cell: support NPOT textures, clamp/repeat mode, normalized/unorm
 texcoords

glDrawPixels works now.
---
 src/gallium/drivers/cell/spu/spu_command.c | 48 +++++++++++++--
 src/gallium/drivers/cell/spu/spu_main.h    | 12 ++--
 src/gallium/drivers/cell/spu/spu_texture.c | 99 ++++++++++++++++++++----------
 3 files changed, 117 insertions(+), 42 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index fa78377c66..b1efe97e76 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -295,6 +295,42 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+/**
+ * Tex texture mask_s/t and scale_s/t fields depend on the texture size and
+ * sampler wrap modes.
+ */
+static void
+update_tex_masks(struct spu_texture *texture,
+                 const struct pipe_sampler_state *sampler)
+{
+   uint i;
+
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      int width = texture->level[i].width;
+      int height = texture->level[i].height;
+
+      if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_s = spu_splats(width - 1);
+      else
+         texture->level[i].mask_s = spu_splats(~0);
+
+      if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_t = spu_splats(height - 1);
+      else
+         texture->level[i].mask_t = spu_splats(~0);
+
+      if (sampler->normalized_coords) {
+         texture->level[i].scale_s = spu_splats((float) width);
+         texture->level[i].scale_t = spu_splats((float) height);
+      }
+      else {
+         texture->level[i].scale_s = spu_splats(1.0f);
+         texture->level[i].scale_t = spu_splats(1.0f);
+      }
+   }
+}
+
+
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
@@ -341,6 +377,8 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    default:
       ASSERT(0);
    }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
@@ -370,15 +408,15 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
-      spu.texture[unit].level[i].width4 = spu_splats((float) width);
-      spu.texture[unit].level[i].height4 = spu_splats((float) height);
-
-      spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
-      spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+      spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
+      spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
 
       if (texture->start[i])
          spu.texture[unit].max_level = i;
    }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+
    //Debug=0;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 56aac655e9..45c6f4ced1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,17 +107,21 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
+/** per-texture level info */
 struct spu_texture_level
 {
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float width4;   /**< == {width, width, width, width} */
-   vector float height4;  /**< == {height, height, height, height} */
-   vector unsigned int tex_size_x_mask; /**< splat(width-1) */
-   vector unsigned int tex_size_y_mask; /**< splat(height-1) */
+   /** texcoord scale factors */
+   vector float scale_s, scale_t;
+   /** texcoord masks (if REPEAT then size-1, else ~0) */
+   vector signed int mask_s, mask_t;
+   /** texcoord clamp limits */
+   vector signed int max_s, max_t;
 } ALIGN16_ATTRIB;
 
+
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 83cf7dc394..b21c43a467 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -67,13 +67,13 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
+get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    const unsigned texture_ea = (uintptr_t) tlevel->start;
-   const vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
-   const vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
@@ -99,6 +99,20 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 }
 
 
+/** clamp vec to [0, max] */
+static INLINE vector signed int
+spu_clamp(vector signed int vec, vector signed int max)
+{
+   static const vector signed int zero = {0,0,0,0};
+   vector unsigned int c;
+   c = spu_cmpgt(vec, zero);    /* c = vec > zero ? ~0 : 0 */
+   vec = spu_sel(zero, vec, c);
+   c = spu_cmpgt(vec, max);    /* c = vec > max ? ~0 : 0 */
+   vec = spu_sel(vec, max, c);
+   return vec;
+}
+
+
 
 /**
  * Do nearest texture sampling for four pixels.
@@ -109,15 +123,20 @@ sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, uint level, vector float colors[4])
 {
-   vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
-   vector unsigned int is = spu_convtu(ss, 0);
-   vector unsigned int it = spu_convtu(tt, 0);
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_mul(s, tlevel->scale_s);
+   vector float tt = spu_mul(t, tlevel->scale_t);
+   vector signed int is = spu_convts(ss, 0);
+   vector signed int it = spu_convts(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
+   is = spu_and(is, tlevel->mask_s);
+   it = spu_and(it, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is = spu_clamp(is, tlevel->max_s);
+   it = spu_clamp(it, tlevel->max_t);
 
    get_four_texels(unit, level, is, it, texels);
 
@@ -135,21 +154,28 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, uint level, vector float colors[4])
 {
-   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_madd(s, tlevel->scale_s,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, tlevel->scale_t, spu_splats(-0.5f));
 
-   vector unsigned int is0 = (vector unsigned int) spu_convts(ss, 0);
-   vector unsigned int it0 = (vector unsigned int) spu_convts(tt, 0);
+   vector signed int is0 = spu_convts(ss, 0);
+   vector signed int it0 = spu_convts(tt, 0);
 
    /* is + 1, it + 1 */
-   vector unsigned int is1 = spu_add(is0, 1);
-   vector unsigned int it1 = spu_add(it0, 1);
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
    /* get packed int texels */
    vector unsigned int texels[16];
@@ -275,34 +301,41 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                            vector float r, vector float q,
                            uint unit, uint level, vector float colors[4])
 {
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
-   vector unsigned int is = (vector unsigned int) spu_convts(ss, 8);
-   vector unsigned int it = (vector unsigned int) spu_convts(tt, 8);
+   vector signed int is = spu_convts(ss, 8);
+   vector signed int it = spu_convts(tt, 8);
 
    /* compute integer texel weights in [0, 255] */
-   vector signed int sWeights0 = spu_and((vector signed int) is, 255);
-   vector signed int tWeights0 = spu_and((vector signed int) it, 255);
+   vector signed int sWeights0 = spu_and(is, 255);
+   vector signed int tWeights0 = spu_and(it, 255);
    vector signed int sWeights1 = spu_sub(255, sWeights0);
    vector signed int tWeights1 = spu_sub(255, tWeights0);
 
    /* texel coords: is0 = is / 256, it0 = is / 256 */
-   vector unsigned int is0 = spu_rlmask(is, -8);
-   vector unsigned int it0 = spu_rlmask(it, -8);
+   vector signed int is0 = spu_rlmask(is, -8);
+   vector signed int it0 = spu_rlmask(it, -8);
 
    /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
-   vector unsigned int is1 = spu_add(is0, 1);
-   vector unsigned int it1 = spu_add(it0, 1);
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-- 
cgit v1.2.3


From 8f7c6b55ae962e30f32cfec9a14a652d3b5b5943 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:11:29 -0600
Subject: cell: support for cubemaps

Though, progs/demos/cubemap.c doesn't quite work right...
---
 src/gallium/drivers/cell/common.h              |   1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c |   2 +
 src/gallium/drivers/cell/ppu/cell_texture.c    |  37 ++++--
 src/gallium/drivers/cell/spu/spu_command.c     |  17 ++-
 src/gallium/drivers/cell/spu/spu_funcs.c       |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h        |   4 +-
 src/gallium/drivers/cell/spu/spu_texture.c     | 171 ++++++++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_texture.h     |  21 ++-
 8 files changed, 214 insertions(+), 41 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index e4de9a551d..c1e78f4db3 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -251,6 +251,7 @@ struct cell_command_sampler
 struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint target;         /**< PIPE_TEXTURE_x */
    uint unit;
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
    ushort width[CELL_MAX_TEXTURE_LEVELS];
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index cae546b700..d4a867ffcf 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -217,6 +217,7 @@ cell_emit_state(struct cell_context *cell)
                texture->width[level] = cell->texture[i]->base.width[level];
                texture->height[level] = cell->texture[i]->base.height[level];
             }
+            texture->target = cell->texture[i]->base.target;
          }
          else {
             uint level;
@@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell)
                texture->width[level] = 0;
                texture->height[level] = 0;
             }
+            texture->target = 0;
          }
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 4fd66bdea0..4c92ef154f 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -137,6 +137,7 @@ cell_texture_release(struct pipe_screen *screen,
    */
    if (--(*pt)->refcount <= 0) {
       struct cell_texture *ct = cell_texture(*pt);
+      uint i;
 
       /*
       DBG("%s deleting %p\n", __FUNCTION__, (void *) ct);
@@ -144,6 +145,12 @@ cell_texture_release(struct pipe_screen *screen,
 
       pipe_buffer_reference(screen, &ct->buffer, NULL);
 
+      for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+         if (ct->tiled_data[i]) {
+            FREE(ct->tiled_data[i]);
+         }
+      }
+
       FREE(ct);
    }
    *pt = NULL;
@@ -204,27 +211,33 @@ static void
 cell_twiddle_texture(struct pipe_screen *screen,
                      struct pipe_surface *surface)
 {
-   struct cell_texture *texture = cell_texture(surface->texture);
+   struct cell_texture *ct = cell_texture(surface->texture);
    const uint level = surface->level;
-   const uint texWidth = texture->base.width[level];
-   const uint texHeight = texture->base.height[level];
+   const uint texWidth = ct->base.width[level];
+   const uint texHeight = ct->base.height[level];
    const uint bufWidth = align(texWidth, TILE_SIZE);
    const uint bufHeight = align(texHeight, TILE_SIZE);
    const void *map = pipe_buffer_map(screen, surface->buffer,
                                      PIPE_BUFFER_USAGE_CPU_READ);
    const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
 
-   switch (texture->base.format) {
+   switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      /* free old tiled data */
-      if (texture->tiled_data[level]) {
-         align_free(texture->tiled_data[level]);
+      {
+         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+         int offset = bufWidth * bufHeight * 4 * surface->face;
+         uint *dst;
+
+         if (!ct->tiled_data[level]) {
+            ct->tiled_data[level] =
+               align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+         }
+
+         dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+
+         twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+                            surface->stride, src);
       }
-      /* alloc new tiled data */
-      texture->tiled_data[level] = align_malloc(bufWidth * bufHeight * 4, 16);
-      twiddle_image_uint(texWidth, texHeight, TILE_SIZE,
-                         texture->tiled_data[level],
-                         surface->stride, src);
       break;
    default:
       printf("Cell: twiddle unsupported texture format\n");
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index b1efe97e76..c951fa6f31 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,7 +301,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
  */
 static void
 update_tex_masks(struct spu_texture *texture,
-                 const struct pipe_sampler_state *sampler)
+                 const struct pipe_sampler_state *sampler,
+                 uint unit)
 {
    uint i;
 
@@ -328,6 +329,11 @@ update_tex_masks(struct spu_texture *texture,
          texture->level[i].scale_t = spu_splats(1.0f);
       }
    }
+
+   /* XXX temporary hack */
+   if (texture->target == PIPE_TEXTURE_CUBE) {
+      spu.sample_texture4[unit] = sample_texture4_cube;
+   }
 }
 
 
@@ -378,7 +384,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
       ASSERT(0);
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 }
 
 
@@ -393,6 +399,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
    spu.texture[unit].max_level = 0;
+   spu.texture[unit].target = texture->target;
 
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
@@ -408,6 +415,10 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
+      spu.texture[unit].level[i].bytes_per_image =
+         4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1))
+         * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1));
+
       spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
       spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
 
@@ -415,7 +426,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
          spu.texture[unit].max_level = i;
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 
    //Debug=0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 66b82f673d..5c3ee305d4 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,7 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, 0, colors.v);
+   spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 45c6f4ced1..8781041bff 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -68,7 +68,7 @@ typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
                                          vector float q,
-                                         uint unit, uint level,
+                                         uint unit, uint level, uint face,
                                          vector float colors[4]);
 
 
@@ -113,6 +113,7 @@ struct spu_texture_level
    void *start;
    ushort width, height;
    ushort tiles_per_row;
+   uint bytes_per_image;
    /** texcoord scale factors */
    vector float scale_s, scale_t;
    /** texcoord masks (if REPEAT then size-1, else ~0) */
@@ -126,6 +127,7 @@ struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
    uint max_level;
+   uint target;  /**< PIPE_TEXTURE_x */
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index b21c43a467..2570f02c73 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -48,6 +48,9 @@ invalidate_tex_cache(void)
       uint bytes = 4 * spu.texture[unit].level[lvl].width
          * spu.texture[unit].level[lvl].height;
 
+      if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
+         bytes *= 6;
+
       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
    }
 }
@@ -67,11 +70,11 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
+get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
-   const unsigned texture_ea = (uintptr_t) tlevel->start;
+   unsigned texture_ea = (uintptr_t) tlevel->start;
    const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
@@ -88,6 +91,8 @@ get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
    
    vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
    
+   texture_ea = texture_ea + face * tlevel->bytes_per_image;
+
    spu_dcache_fetch_unaligned((qword *) & texels[0],
                               texture_ea + spu_extract(offset, 0), 4);
    spu_dcache_fetch_unaligned((qword *) & texels[1],
@@ -121,7 +126,8 @@ spu_clamp(vector signed int vec, vector signed int max)
 void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, uint level, vector float colors[4])
+                        uint unit, uint level, uint face,
+                        vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    vector float ss = spu_mul(s, tlevel->scale_s);
@@ -138,7 +144,7 @@ sample_texture4_nearest(vector float s, vector float t,
    is = spu_clamp(is, tlevel->max_s);
    it = spu_clamp(it, tlevel->max_t);
 
-   get_four_texels(unit, level, is, it, texels);
+   get_four_texels(unit, level, face, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -152,11 +158,14 @@ sample_texture4_nearest(vector float s, vector float t,
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, uint level, vector float colors[4])
+                         uint unit, uint level, uint face,
+                         vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
-   vector float ss = spu_madd(s, tlevel->scale_s,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, tlevel->scale_t, spu_splats(-0.5f));
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
 
    vector signed int is0 = spu_convts(ss, 0);
    vector signed int it0 = spu_convts(tt, 0);
@@ -179,10 +188,10 @@ sample_texture4_bilinear(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -299,10 +308,12 @@ transpose(vector unsigned int *mOut0,
 void
 sample_texture4_bilinear_2(vector float s, vector float t,
                            vector float r, vector float q,
-                           uint unit, uint level, vector float colors[4])
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
    /* Scale texcoords by size of texture, and add half pixel bias */
    vector float ss = spu_madd(s, tlevel->scale_s, half);
    vector float tt = spu_madd(t, tlevel->scale_t, half);
@@ -339,10 +350,10 @@ sample_texture4_bilinear_2(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -433,7 +444,8 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level_ignored, vector float colors[4])
+                    uint unit, uint level_ignored, uint face,
+                    vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -452,15 +464,136 @@ sample_texture4_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
+      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
    }
    else {
       /* minify */
       int level = (int) (lambda + 0.5f);
       if (level > (int) spu.texture[unit].max_level)
          level = spu.texture[unit].max_level;
-      spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
+      spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
       /* XXX to do: mipmap level interpolation */
    }
 }
 
+
+/** XXX need a SIMD version of this */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+   /*
+      major axis
+      direction     target                             sc     tc    ma
+      ----------    -------------------------------    ---    ---   ---
+       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   const float arx = fabsf(rx);
+   const float ary = fabsf(ry);
+   const float arz = fabsf(rz);
+   unsigned face;
+   float sc, tc, ma;
+
+   if (arx > ary && arx > arz) {
+      if (rx >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_X;
+         sc = -rz;
+         tc = -ry;
+         ma = arx;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_X;
+         sc = rz;
+         tc = -ry;
+         ma = arx;
+      }
+   }
+   else if (ary > arx && ary > arz) {
+      if (ry >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_Y;
+         sc = rx;
+         tc = rz;
+         ma = ary;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Y;
+         sc = rx;
+         tc = -rz;
+         ma = ary;
+      }
+   }
+   else {
+      if (rz > 0.0F) {
+         face = PIPE_TEX_FACE_POS_Z;
+         sc = rx;
+         tc = -ry;
+         ma = arz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Z;
+         sc = -rx;
+         tc = -ry;
+         ma = arz;
+      }
+   }
+
+   *newS = (sc / ma + 1.0F) * 0.5F;
+   *newT = (tc / ma + 1.0F) * 0.5F;
+
+   return face;
+}
+
+
+
+void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level, int face_ignored,
+                     vector float colors[4])
+{
+   static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
+   uint p, faces[4];
+   float newS[4], newT[4];
+
+   /* Compute cube face referenced by the four sets of texcoords.
+    * XXX we should SIMD-ize this.
+    */
+   for (p = 0; p < 4; p++) {      
+      float rx = spu_extract(s, p);
+      float ry = spu_extract(t, p);
+      float rz = spu_extract(r, p);
+      faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
+   }
+
+   if (faces[0] == faces[1] &&
+       faces[0] == faces[2] &&
+       faces[0] == faces[3]) {
+      /* GOOD!  All four texcoords refer to the same cube face */
+      s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
+      t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
+      sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+   }
+   else {
+      /* BAD!  The four texcoords refer to different faces */
+      for (p = 0; p < 4; p++) {      
+         vector float c[4];
+
+         sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+                                 zero, zero, unit, level, faces[p], c);
+
+         float red = spu_extract(c[0], p);
+         float green = spu_extract(c[1], p);
+         float blue = spu_extract(c[2], p);
+         float alpha = spu_extract(c[3], p);
+
+         colors[0] = spu_insert(red,   colors[0], p);
+         colors[1] = spu_insert(green, colors[1], p);
+         colors[2] = spu_insert(blue,  colors[2], p);
+         colors[3] = spu_insert(alpha, colors[3], p);
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index ec06a50b4a..08b891a4a8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -39,24 +39,35 @@ invalidate_tex_cache(void);
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, uint level, vector float colors[4]);
+                        uint unit, uint level, uint face,
+                        vector float colors[4]);
 
 
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, uint level, vector float colors[4]);
+                         uint unit, uint level, uint face,
+                         vector float colors[4]);
 
 extern void
 sample_texture4_bilinear_2(vector float s, vector float t,
-                         vector float r, vector float q,
-                           uint unit, uint level, vector float colors[4]);
+                           vector float r, vector float q,
+                           uint unit, uint level, uint face,
+                           vector float colors[4]);
 
 
 extern void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level, vector float colors[4]);
+                    uint unit, uint level, uint face,
+                    vector float colors[4]);
+
+
+extern void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level_ignored, int face_ignored,
+                     vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 41ccdde767e7aba6e8e6a9a035eacd6338c03a95 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:22:40 -0600
Subject: cell: initial bits for 3D texture support

---
 src/gallium/drivers/cell/common.h              |  1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  2 ++
 src/gallium/drivers/cell/spu/spu_command.c     | 13 +++++++++++--
 src/gallium/drivers/cell/spu/spu_main.h        |  8 ++++----
 src/gallium/drivers/cell/spu/spu_texture.c     |  2 ++
 5 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c1e78f4db3..b0169b8e32 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -256,6 +256,7 @@ struct cell_command_texture
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
    ushort width[CELL_MAX_TEXTURE_LEVELS];
    ushort height[CELL_MAX_TEXTURE_LEVELS];
+   ushort depth[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index d4a867ffcf..bb694aa107 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -216,6 +216,7 @@ cell_emit_state(struct cell_context *cell)
                texture->start[level] = cell->texture[i]->tiled_data[level];
                texture->width[level] = cell->texture[i]->base.width[level];
                texture->height[level] = cell->texture[i]->base.height[level];
+               texture->depth[level] = cell->texture[i]->base.depth[level];
             }
             texture->target = cell->texture[i]->base.target;
          }
@@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell)
                texture->start[level] = NULL;
                texture->width[level] = 0;
                texture->height[level] = 0;
+               texture->depth[level] = 0;
             }
             texture->target = 0;
          }
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c951fa6f31..c28677ebf8 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -59,6 +59,14 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
 
 
+static INLINE int
+align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1);
+}
+
+
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -404,6 +412,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
       uint height = texture->height[i];
+      uint depth = texture->depth[i];
 
       DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
              texture->start[i], texture->width[i], texture->height[i]);
@@ -411,13 +420,13 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].start = texture->start[i];
       spu.texture[unit].level[i].width = width;
       spu.texture[unit].level[i].height = height;
+      spu.texture[unit].level[i].depth = depth;
 
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
       spu.texture[unit].level[i].bytes_per_image =
-         4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1))
-         * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1));
+         4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth;
 
       spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
       spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 8781041bff..eff43b870c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -111,15 +111,15 @@ struct spu_framebuffer
 struct spu_texture_level
 {
    void *start;
-   ushort width, height;
+   ushort width, height, depth;
    ushort tiles_per_row;
    uint bytes_per_image;
    /** texcoord scale factors */
-   vector float scale_s, scale_t;
+   vector float scale_s, scale_t, scale_r;
    /** texcoord masks (if REPEAT then size-1, else ~0) */
-   vector signed int mask_s, mask_t;
+   vector signed int mask_s, mask_t, mask_r;
    /** texcoord clamp limits */
-   vector signed int max_s, max_t;
+   vector signed int max_s, max_t, max_r;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 9e25094d13..42eb06a362 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -50,6 +50,8 @@ invalidate_tex_cache(void)
 
       if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
          bytes *= 6;
+      else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
+         bytes *= spu.texture[unit].level[lvl].depth;
 
       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
    }
-- 
cgit v1.2.3


From f60c756ed14f25731ff2a52d6b695ceb5b7a6f6b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 10:54:06 -0600
Subject: cell: additional debug

---
 src/gallium/drivers/cell/spu/spu_command.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c28677ebf8..a07b312111 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -250,6 +250,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
 
    /* Expand each float to float[4] for SOA execution */
    for (i = 0; i < num_const; i++) {
+      DEBUG_PRINTF("  const[%u] = %f\n", i, constants[i]);
       spu.constants[i] = spu_splats(constants[i]);
    }
 
-- 
cgit v1.2.3


From 53951531ae7bfd64afae1ae55aac7f6ebd3fe4f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 12:35:51 -0600
Subject: cell: propogate blend color to SPUs for the fallback fragment ops
 code

---
 src/gallium/drivers/cell/common.h                  |  4 ++
 src/gallium/drivers/cell/ppu/cell_context.h        |  1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  1 +
 src/gallium/drivers/cell/spu/spu_command.c         |  1 +
 src/gallium/drivers/cell/spu/spu_main.h            |  1 +
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 75 +++++++++++++++++++---
 6 files changed, 74 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index b0169b8e32..3b5a25e165 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -118,12 +118,16 @@
 
 /**
  * Command to specify per-fragment operations state and generated code.
+ * Note that the dsa, blend, blend_color fields are really only needed
+ * for the fallback/C per-pixel code.  They're not used when we generate
+ * dynamic SPU fragment code (which is the normal case).
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 80a9b3d7e1..1fcf03c2b8 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -74,6 +74,7 @@ struct cell_fragment_shader_state
 struct cell_fragment_ops_key
 {
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    struct pipe_depth_stencil_alpha_state dsa;
    enum pipe_format color_format;
    enum pipe_format zs_format;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index bb694aa107..d2427584ba 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -52,6 +52,7 @@ lookup_fragment_ops(struct cell_context *cell)
     */
    memset(&key, 0, sizeof(key));
    key.blend = *cell->blend;
+   key.blend_color = cell->blend_color;
    key.dsa = *cell->depth_stencil;
 
    if (cell->framebuffer.cbufs[0])
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a07b312111..b521c3aecf 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -195,6 +195,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+   memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
 
    /* Parity twist!  For now, always use the fallback code by default,
     * only switching to codegen when specifically requested.  This
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index eff43b870c..ca72baea8b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -145,6 +145,7 @@ struct spu_global
    struct spu_framebuffer fb;
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
    struct vertex_info vertex_info;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 9404704abf..f8ffc70492 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -260,7 +260,7 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
 
       /*
-       * Compute Src RGB terms
+       * Compute Src RGB terms (fragment color * factor)
        */
       switch (spu.blend.rgb_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -283,13 +283,33 @@ spu_fallback_fragment_ops(uint x, uint y,
          term1g = spu_mul(fragG, fragA);
          term1b = spu_mul(fragB, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term1r = spu_mul(fragR, fbRGBA[0]);
+         term1g = spu_mul(fragG, fbRGBA[1]);
+         term1b = spu_mul(fragB, fbRGBA[1]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1r = spu_mul(fragR, fbRGBA[3]);
+         term1g = spu_mul(fragG, fbRGBA[3]);
+         term1b = spu_mul(fragB, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Src Alpha term
+       * Compute Src Alpha term (fragment alpha * factor)
        */
       switch (spu.blend.alpha_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -301,13 +321,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLENDFACTOR_SRC_ALPHA:
          term1a = spu_mul(fragA, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1a = spu_mul(fragA, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest RGB terms
+       * Compute Dest RGB terms (framebuffer color * factor)
        */
       switch (spu.blend.rgb_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -337,17 +367,37 @@ spu_fallback_fragment_ops(uint x, uint y,
          term2g = spu_mul(fbRGBA[1], tmp);
          term2b = spu_mul(fbRGBA[2], tmp);
          break;
-      /* XXX more cases */
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[0]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[1]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[2]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[3]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[3]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3]));
+         break;
+       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest Alpha term
+       * Compute Dest Alpha term (framebuffer alpha * factor)
        */
       switch (spu.blend.alpha_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
-         term2a = fragA;
+         term2a = fbRGBA[3];
          break;
       case PIPE_BLENDFACTOR_SRC_COLOR:
          term2a = spu_splats(0.0f);
@@ -360,6 +410,16 @@ spu_fallback_fragment_ops(uint x, uint y,
          tmp = spu_sub(one, fragA);
          term2a = spu_mul(fbRGBA[3], tmp);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
@@ -394,9 +454,7 @@ spu_fallback_fragment_ops(uint x, uint y,
          fragG = spu_max(term1g, term2g);
          fragB = spu_max(term1b, term2b);
          break;
-      /* XXX more cases */
       default:
-         printf("unsup 0x%x\n", spu.blend.rgb_func);
          ASSERT(0);
       }
 
@@ -419,7 +477,6 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLEND_MAX:
          fragA = spu_max(term1a, term2a);
          break;
-      /* XXX more cases */
       default:
          ASSERT(0);
       }
-- 
cgit v1.2.3


From ddeec1ed10d6c12403fe8d30c072ea68f044db99 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 13:55:18 -0600
Subject: cell: simplify spu debug code

---
 src/gallium/drivers/cell/common.h           |  1 +
 src/gallium/drivers/cell/ppu/cell_context.c |  1 +
 src/gallium/drivers/cell/spu/spu_command.c  | 47 +++++++++++++----------------
 src/gallium/drivers/cell/spu/spu_debug.h    |  9 ------
 src/gallium/drivers/cell/spu/spu_main.c     |  9 +-----
 src/gallium/drivers/cell/spu/spu_main.h     | 15 +++++++--
 src/gallium/drivers/cell/spu/spu_render.c   |  7 +++--
 7 files changed, 41 insertions(+), 48 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 3b5a25e165..8ae78265f2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -111,6 +111,7 @@
 #define CELL_DEBUG_SYNC                 (1 << 2)
 #define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
 #define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
+#define CELL_DEBUG_CMD                  (1 << 5)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index b66aa9c9d9..f8d5eef3ac 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -93,6 +93,7 @@ static const struct debug_named_value cell_debug_flags[] = {
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
    {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
    {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
+   {"cmd", CELL_DEBUG_CMD},       /**< SPUs dump command buffer info */
    {NULL, 0}
 };
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index b521c3aecf..ebbed3d1dc 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -44,7 +44,6 @@
 #include "spu_tile.h"
 #include "spu_vertex_shader.h"
 #include "spu_dcache.h"
-#include "spu_debug.h"
 #include "cell/common.h"
 
 
@@ -97,7 +96,7 @@ release_buffer(uint buffer)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
@@ -165,14 +164,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
 #endif /* CLEAR_OPT */
 
-   DEBUG_PRINTF("CLEAR SURF done\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n");
 }
 
 
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
-   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+   D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf);
    ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
@@ -189,7 +188,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
    static int warned = 0;
 
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
@@ -229,7 +228,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 {
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_program_code, fp->code,
           SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
@@ -247,11 +246,11 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
    const float *constants = (const float *) &buffer[pos + 2];
    uint i;
 
-   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
 
    /* Expand each float to float[4] for SOA execution */
    for (i = 0; i < num_const; i++) {
-      DEBUG_PRINTF("  const[%u] = %f\n", i, constants[i]);
+      D_PRINTF(CELL_DEBUG_CMD, "  const[%u] = %f\n", i, constants[i]);
       spu.constants[i] = spu_splats(constants[i]);
    }
 
@@ -263,7 +262,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
-   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+   D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
              cmd->width,
              cmd->height,
              cmd->color_start,
@@ -352,7 +351,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
    uint unit = sampler->unit;
 
-   DEBUG_PRINTF("SAMPLER [%u]\n", unit);
+   D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit);
 
    spu.sampler[unit] = sampler->state;
 
@@ -404,9 +403,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint unit = texture->unit;
    uint i;
 
-   //if (spu.init.id==0) Debug=1;
-
-   DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
+   D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit);
 
    spu.texture[unit].max_level = 0;
    spu.texture[unit].target = texture->target;
@@ -416,7 +413,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
       uint height = texture->height[i];
       uint depth = texture->depth[i];
 
-      DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
+      D_PRINTF(CELL_DEBUG_CMD, "  LEVEL %u: at %p  size[0] %u x %u\n", i,
              texture->start[i], texture->width[i], texture->height[i]);
 
       spu.texture[unit].level[i].start = texture->start[i];
@@ -438,15 +435,13 @@ cmd_state_texture(const struct cell_command_texture *texture)
    }
 
    update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
-
-   //Debug=0;
 }
 
 
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
    ASSERT(vinfo->num_attribs >= 1);
    ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
@@ -485,7 +480,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
 static void
 cmd_finish(void)
 {
-   DEBUG_PRINTF("FINISH\n");
+   D_PRINTF(CELL_DEBUG_CMD, "FINISH\n");
    really_clear_tiles(0);
    /* wait for all outstanding DMAs to finish */
    mfc_write_tag_mask(~0);
@@ -510,7 +505,7 @@ cmd_batch(uint opcode)
    const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
-   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
              buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
@@ -530,7 +525,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   DEBUG_PRINTF("release batch buf %u\n", buf);
+   D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf);
    release_buffer(buf);
 
    /*
@@ -663,7 +658,7 @@ cmd_batch(uint opcode)
       }
    }
 
-   DEBUG_PRINTF("BATCH complete\n");
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n");
 }
 
 
@@ -677,7 +672,7 @@ command_loop(void)
    struct cell_command cmd;
    int exitFlag = 0;
 
-   DEBUG_PRINTF("Enter command loop\n");
+   D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
    ASSERT((sizeof(struct cell_command) & 0xf) == 0);
    ASSERT_ALIGN16(&cmd);
@@ -686,12 +681,12 @@ command_loop(void)
       unsigned opcode;
       int tag = 0;
 
-      DEBUG_PRINTF("Wait for cmd...\n");
+      D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
 
-      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
+      D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
       /* command payload */
       mfc_get(&cmd,  /* dest */
@@ -708,7 +703,7 @@ command_loop(void)
 
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
-         DEBUG_PRINTF("EXIT\n");
+         D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
@@ -725,7 +720,7 @@ command_loop(void)
 
    }
 
-   DEBUG_PRINTF("Exit command loop\n");
+   D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
 
    spu_dcache_report();
 }
diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
index eeec052655..25653dcdcd 100644
--- a/src/gallium/drivers/cell/spu/spu_debug.h
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -30,28 +30,19 @@
 #define SPU_DEBUG_H
 
 
-/* Set to 0 to disable all extraneous debugging code */
-#define DEBUG 1
-
 #if DEBUG
-extern boolean Debug;
-extern boolean force_fragment_ops_fallback;
 
 /* These debug macros use the unusual construction ", ##__VA_ARGS__"
  * which expands to the expected comma + args if variadic arguments
  * are supplied, but swallows the comma if there are no variadic
  * arguments (which avoids syntax errors that would otherwise occur).
  */
-#define DEBUG_PRINTF(format,...) \
-   if (Debug) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
 #define D_PRINTF(flag, format,...) \
    if (spu.init.debug_flags & (flag)) \
       printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
 
 #else
 
-#define DEBUG_PRINTF(...)
 #define D_PRINTF(...)
 
 #endif
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 4becd0f92a..c8bb251905 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -40,7 +40,6 @@
 #include "spu_per_fragment_op.h"
 #include "spu_texture.h"
 //#include "spu_test.h"
-#include "spu_debug.h"
 #include "cell/common.h"
 
 
@@ -53,12 +52,6 @@ helpful headers:
 struct spu_global spu;
 
 
-#if DEBUG
-boolean Debug = FALSE;
-boolean force_fragment_ops_fallback = TRUE;
-#endif
-
-
 static void
 one_time_init(void)
 {
@@ -102,7 +95,7 @@ main(main_param_t speid, main_param_t argp)
 
    one_time_init();
 
-   DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
    /* get initialization data */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index ca72baea8b..569b9e45d4 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -36,6 +36,19 @@
 #include "pipe/p_state.h"
 
 
+#if DEBUG
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#else
+#define D_PRINTF(...)
+#endif
+
 
 #define MAX_WIDTH 1024
 #define MAX_HEIGHT 1024
@@ -187,8 +200,6 @@ struct spu_global
 
 
 extern struct spu_global spu;
-extern boolean Debug;
-
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 82dbeb26b7..cfff19b6c0 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -177,7 +177,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    uint i, j;
 
 
-   if (Debug) {
+#if 0
       printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
              "inline_vert=%u\n",
              spu.init.id,
@@ -190,7 +190,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
-   }
+#endif
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
@@ -293,7 +293,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-   if (Debug)
+#if 0
       printf("SPU %u: RENDER done\n",
              spu.init.id);
+#endif
 }
-- 
cgit v1.2.3


From 0eb0b0a816764a323af7a8d2b5cb6792f886ce04 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:12:55 -0600
Subject: cell: remove some old, pre-batchbuffer stuff

---
 src/gallium/drivers/cell/common.h          | 14 --------------
 src/gallium/drivers/cell/ppu/cell_spu.c    |  5 +----
 src/gallium/drivers/cell/ppu/cell_spu.h    |  3 +--
 src/gallium/drivers/cell/spu/spu_command.c | 19 -------------------
 4 files changed, 2 insertions(+), 39 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d716a26175..600f1b37a2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -269,19 +269,6 @@ struct cell_command_texture
 };
 
 
-/** XXX unions don't seem to work */
-/* XXX this should go away; all commands should be placed in batch buffers */
-struct cell_command
-{
-#if 0
-   struct cell_command_framebuffer fb;
-   struct cell_command_clear_surface clear;
-   struct cell_command_render render;
-#endif
-   struct cell_command_vs vs;
-} ALIGN16_ATTRIB;
-
-
 #define MAX_SPU_FUNCTIONS 12
 /**
  * Used to tell the PPU about the address of particular functions in the
@@ -302,7 +289,6 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
-   struct cell_command *cmd;
 
    /** Buffers for command batches, vertex/index data */
    ubyte *buffers[CELL_NUM_BUFFERS];
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index df020c4146..90745da3d2 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -126,9 +126,6 @@ cell_start_spus(struct cell_context *cell)
 
    assert(cell->num_spus <= MAX_SPUS);
 
-   ASSERT_ALIGN16(&cell_global.command[0]);
-   ASSERT_ALIGN16(&cell_global.command[1]);
-
    ASSERT_ALIGN16(&cell_global.inits[0]);
    ASSERT_ALIGN16(&cell_global.inits[1]);
 
@@ -141,7 +138,7 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].debug_flags = cell->debug_flags;
-      cell_global.inits[i].cmd = &cell_global.command[i];
+
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
       }
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 137f26612e..3443331b01 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -50,10 +50,9 @@ struct cell_global_info
    pthread_t spe_threads[MAX_SPUS];
 
    /**
-    * Data sent to SPUs
+    * Data sent to SPUs at start-up
     */
    struct cell_init_info inits[MAX_SPUS];
-   struct cell_command command[MAX_SPUS];
 };
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ebbed3d1dc..4febd5385b 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -669,38 +669,19 @@ cmd_batch(uint opcode)
 void
 command_loop(void)
 {
-   struct cell_command cmd;
    int exitFlag = 0;
 
    D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
-   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
-   ASSERT_ALIGN16(&cmd);
-
    while (!exitFlag) {
       unsigned opcode;
-      int tag = 0;
 
       D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
-
       D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
-      /* command payload */
-      mfc_get(&cmd,  /* dest */
-              (unsigned int) spu.init.cmd, /* src */
-              sizeof(struct cell_command), /* bytes */
-              tag,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask( 1 << tag );
-
-      /*
-       * NOTE: most commands should be contained in a batch buffer
-       */
-
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
-- 
cgit v1.2.3


From ec7d6c656178babdf143faa242f7a3df9d0bc22c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:39:16 -0600
Subject: cell: send rasterizer state to SPUs in proper way, remove
 front_winding hack

---
 src/gallium/drivers/cell/common.h              | 18 ++++++++++++++----
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  7 +++++++
 src/gallium/drivers/cell/ppu/cell_vbuf.c       |  1 -
 src/gallium/drivers/cell/spu/spu_command.c     |  8 ++++++++
 src/gallium/drivers/cell/spu/spu_main.h        |  1 +
 src/gallium/drivers/cell/spu/spu_render.c      |  2 +-
 src/gallium/drivers/cell/spu/spu_tri.c         |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.h         |  2 +-
 8 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 1f6f2d494b..0ff2c491fb 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -99,8 +99,9 @@
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
 #define CELL_CMD_STATE_FS_CONSTANTS  21
-#define CELL_CMD_VS_EXECUTE          22
-#define CELL_CMD_FLUSH_BUFFER_RANGE  23
+#define CELL_CMD_STATE_RASTERIZER    22
+#define CELL_CMD_VS_EXECUTE          23
+#define CELL_CMD_FLUSH_BUFFER_RANGE  24
 
 /** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
@@ -156,13 +157,23 @@ struct cell_command_fragment_program
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
+   uint64_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
 };
 
 
+/**
+ * Tell SPUs about rasterizer state.
+ */
+struct cell_command_rasterizer
+{
+   uint64_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
+   struct pipe_rasterizer_state rasterizer;
+};
+
+
 /**
  * Clear framebuffer to the given value/color.
  */
@@ -238,7 +249,6 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
-   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index d2427584ba..e6387382f2 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -147,6 +147,13 @@ cell_emit_state(struct cell_context *cell)
 #endif
    }
 
+   if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+      struct cell_command_rasterizer *rast =
+         cell_batch_alloc(cell, sizeof(*rast));
+      rast->opcode = CELL_CMD_STATE_RASTERIZER;
+      rast->rasterizer = *cell->rasterizer;
+   }
+
    if (cell->dirty & (CELL_NEW_FS)) {
       /* Send new fragment program to SPUs */
       struct cell_command_fragment_program *fp
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index 578ddf62dc..aa63435b93 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,7 +214,6 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
-      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 4febd5385b..d2c282a022 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -583,6 +583,14 @@ cmd_batch(uint opcode)
       case CELL_CMD_STATE_FS_CONSTANTS:
          pos = cmd_state_fs_constants(buffer, pos);
          break;
+      case CELL_CMD_STATE_RASTERIZER:
+         {
+            struct cell_command_rasterizer *rast =
+               (struct cell_command_rasterizer *) &buffer[pos];
+            spu.rasterizer = rast->rasterizer;
+            pos += sizeof(*rast) / 8;
+         }
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index f87495b72d..4099e52699 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -153,6 +153,7 @@ struct spu_global
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_rasterizer_state rasterizer;
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
    struct vertex_info vertex_info;
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index cfff19b6c0..75a7f75abc 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
+         drawn += tri_draw(v0, v1, v2, tx, ty);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 2417db8960..1519b8cd7e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -775,7 +775,7 @@ determinant(const float *v0, const float *v1, const float *v2)
  */
 boolean
 tri_draw(const float *v0, const float *v1, const float *v2,
-         uint tx, uint ty, uint front_winding)
+         uint tx, uint ty)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -790,7 +790,7 @@ tri_draw(const float *v0, const float *v1, const float *v2,
     * which will be needed for front/back-face stencil application
     */
    float det = determinant(v0, v1, v2);
-   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+   setup.facing = (det > 0.0) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index abc3d35160..aa694dd7c9 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
 
 
 #endif /* SPU_TRI_H */
-- 
cgit v1.2.3


From 0116ee1d1c341726b6ed23c2dddc4515e8a34385 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 20:46:43 -0600
Subject: cell: start some performance measurements

Use the spu_write_decrementer() and spu_read_decrementer() functions to
measure time.  Convert to milliseconds according to the system timebase value.
---
 src/gallium/drivers/cell/common.h          |  1 +
 src/gallium/drivers/cell/ppu/cell_spu.c    | 31 ++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_command.c | 15 +++++++++++++++
 src/gallium/drivers/cell/spu/spu_render.c  |  9 ++++++++-
 4 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 0ff2c491fb..469d56cda8 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -299,6 +299,7 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
+   float inv_timebase;    /**< 1.0/timebase, for perf measurement */
 
    /** Buffers for command batches, vertex/index data */
    ubyte *buffers[CELL_NUM_BUFFERS];
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index a6e268b362..28e5e6d706 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -52,6 +52,35 @@ helpful headers:
 struct cell_global_info cell_global;
 
 
+/**
+ * Scan /proc/cpuinfo to determine the timebase for the system.
+ * This is used by the SPUs to convert 'decrementer' ticks to seconds.
+ * There may be a better way to get this value...
+ */
+static unsigned
+get_timebase(void)
+{
+   FILE *f = fopen("/proc/cpuinfo", "r");
+   unsigned timebase;
+
+   assert(f);
+   while (!feof(f)) {
+      char line[80];
+      fgets(line, sizeof(line), f);
+      if (strncmp(line, "timebase", 8) == 0) {
+         char *colon = strchr(line, ':');
+         if (colon) {
+            timebase = atoi(colon + 2);
+            break;
+         }
+      }
+   }
+   fclose(f);
+
+   return timebase;
+}
+
+
 /**
  * Write a 1-word message to the given SPE mailbox.
  */
@@ -115,6 +144,7 @@ cell_start_spus(struct cell_context *cell)
 {
    static boolean one_time_init = FALSE;
    uint i, j;
+   uint timebase = get_timebase();
 
    if (one_time_init) {
       fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
@@ -138,6 +168,7 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].debug_flags = cell->debug_flags;
+      cell_global.inits[i].inv_timebase = 1000.0f / timebase;
 
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d2c282a022..57d265fef7 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -670,6 +670,8 @@ cmd_batch(uint opcode)
 }
 
 
+#define PERF 0
+
 
 /**
  * Main loop for SPEs: Get a command, execute it, repeat.
@@ -678,6 +680,7 @@ void
 command_loop(void)
 {
    int exitFlag = 0;
+   uint t0, t1;
 
    D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
@@ -686,10 +689,16 @@ command_loop(void)
 
       D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
+      if (PERF)
+         spu_write_decrementer(~0);
+
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
       D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
+      if (PERF)
+         t0 = spu_read_decrementer();
+
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
@@ -707,6 +716,12 @@ command_loop(void)
          printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
       }
 
+      if (PERF) {
+         t1 = spu_read_decrementer();
+         printf("wait mbox time: %gms   batch time: %gms\n",
+                (~0u - t0) * spu.init.inv_timebase,
+                (t0 - t1) * spu.init.inv_timebase);
+      }
    }
 
    D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 802455bf79..5515bb55c9 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -175,6 +175,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
+   uint num_tiles;
 
    D_PRINTF(CELL_DEBUG_CMD,
             "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
@@ -242,6 +243,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
 
 
+   num_tiles = 0;
+
    /**
     ** loop over tiles, rendering tris
     **/
@@ -255,6 +258,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
+      num_tiles++;
+
       spu.cur_ctile_status = spu.ctile_status[ty][tx];
       spu.cur_ztile_status = spu.ztile_status[ty][tx];
 
@@ -284,5 +289,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-   D_PRINTF(CELL_DEBUG_CMD, "RENDER done\n");
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER done (%u tiles hit)\n",
+            num_tiles);
 }
-- 
cgit v1.2.3


From 926b8dbb3e86360e5968882df94785ae84d0ad43 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 09:00:05 -0600
Subject: cell: clean up various texture-related things

Distinguish among texture targets in codegen.
progs/demos/cubemap.c runs correctly now too.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 29 ++++++++++++++---
 src/gallium/drivers/cell/spu/spu_command.c | 24 ++++++--------
 src/gallium/drivers/cell/spu/spu_funcs.c   | 34 +++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_main.h    | 16 +++++-----
 src/gallium/drivers/cell/spu/spu_texture.c | 50 ++++++++++++++----------------
 src/gallium/drivers/cell/spu/spu_texture.h | 34 +++++++++-----------
 6 files changed, 107 insertions(+), 80 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3dfd5f673d..2b34cf1e23 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1337,16 +1337,33 @@ emit_function_call(struct codegen *gen,
 
 
 static boolean
-emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   const uint addr = lookup_function(gen->cell, "spu_txp");
+   const uint target = inst->InstructionExtTexture.Texture;
    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   uint addr;
    int ch;
    int coord_regs[4], d_regs[4];
 
+   switch (target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_2D:
+      addr = lookup_function(gen->cell, "spu_tex_2d");
+      break;
+   case TGSI_TEXTURE_3D:
+      addr = lookup_function(gen->cell, "spu_tex_3d");
+      break;
+   case TGSI_TEXTURE_CUBE:
+      addr = lookup_function(gen->cell, "spu_tex_cube");
+      break;
+   default:
+      ASSERT(0 && "unsupported texture target");
+      return FALSE;
+   }
+
    assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
 
-   spe_comment(gen->f, -4, "CALL txp:");
+   spe_comment(gen->f, -4, "CALL tex:");
 
    /* get src/dst reg info */
    for (ch = 0; ch < 4; ch++) {
@@ -1368,7 +1385,7 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
          spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
       }
 
-      /* setup function arguments */
+      /* setup function arguments (XXX depends on target) */
       for (i = 0; i < 4; i++) {
          spe_move(gen->f, 3 + i, coord_regs[i]);
       }
@@ -1674,8 +1691,10 @@ emit_instruction(struct codegen *gen,
       /* fall-through for now */
    case TGSI_OPCODE_TXB:
       /* fall-through for now */
+   case TGSI_OPCODE_TXL:
+      /* fall-through for now */
    case TGSI_OPCODE_TXP:
-      return emit_TXP(gen, inst);
+      return emit_TEX(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 57d265fef7..ff4a52d79a 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -310,8 +310,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
  */
 static void
 update_tex_masks(struct spu_texture *texture,
-                 const struct pipe_sampler_state *sampler,
-                 uint unit)
+                 const struct pipe_sampler_state *sampler)
 {
    uint i;
 
@@ -338,11 +337,6 @@ update_tex_masks(struct spu_texture *texture,
          texture->level[i].scale_t = spu_splats(1.0f);
       }
    }
-
-   /* XXX temporary hack */
-   if (texture->target == PIPE_TEXTURE_CUBE) {
-      spu.sample_texture4[unit] = sample_texture4_cube;
-   }
 }
 
 
@@ -357,12 +351,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    switch (spu.sampler[unit].min_img_filter) {
    case PIPE_TEX_FILTER_LINEAR:
-      spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
    case PIPE_TEX_FILTER_ANISO:
       /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
-      spu.min_sample_texture4[unit] = sample_texture4_nearest;
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
    default:
       ASSERT(0);
@@ -370,12 +364,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    switch (spu.sampler[sampler->unit].mag_img_filter) {
    case PIPE_TEX_FILTER_LINEAR:
-      spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
    case PIPE_TEX_FILTER_ANISO:
       /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
-      spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
    default:
       ASSERT(0);
@@ -384,16 +378,16 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    switch (spu.sampler[sampler->unit].min_mip_filter) {
    case PIPE_TEX_MIPFILTER_NEAREST:
    case PIPE_TEX_MIPFILTER_LINEAR:
-      spu.sample_texture4[unit] = sample_texture4_lod;
+      spu.sample_texture_2d[unit] = sample_texture_2d_lod;
       break;
    case PIPE_TEX_MIPFILTER_NONE:
-      spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+      spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit];
       break;
    default:
       ASSERT(0);
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
@@ -434,7 +428,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
          spu.texture[unit].max_level = i;
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 5c3ee305d4..3534b35000 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -43,6 +43,7 @@
 #include "cell/common.h"
 #include "spu_main.h"
 #include "spu_funcs.h"
+#include "spu_texture.h"
 
 
 /** For "return"-ing four vectors */
@@ -102,11 +103,34 @@ spu_log2(vector float x)
 
 
 static struct vec_4x4
-spu_txp(vector float s, vector float t, vector float r, vector float q,
-        unsigned unit)
+spu_tex_2d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
+   (void) r;
+   (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_3d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) r;
+   (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_cube(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) q;
+   sample_texture_cube(s, t, r, unit, colors.v);
    return colors;
 }
 
@@ -147,7 +171,9 @@ return_function_info(void)
    export_func(&funcs, "spu_pow", &spu_pow);
    export_func(&funcs, "spu_exp2", &spu_exp2);
    export_func(&funcs, "spu_log2", &spu_log2);
-   export_func(&funcs, "spu_txp", &spu_txp);
+   export_func(&funcs, "spu_tex_2d", &spu_tex_2d);
+   export_func(&funcs, "spu_tex_3d", &spu_tex_3d);
+   export_func(&funcs, "spu_tex_cube", &spu_tex_cube);
 
    /* Send the function info back to the PPU / main memory */
    mfc_put((void *) &funcs,  /* src in local store */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 4099e52699..80e9c696f8 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -70,12 +70,10 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef void (*spu_sample_texture4_func)(vector float s,
-                                         vector float t,
-                                         vector float r,
-                                         vector float q,
-                                         uint unit, uint level, uint face,
-                                         vector float colors[4]);
+typedef void (*spu_sample_texture_2d_func)(vector float s,
+                                           vector float t,
+                                           uint unit, uint level, uint face,
+                                           vector float colors[4]);
 
 
 /** Function for performing per-fragment ops */
@@ -183,9 +181,9 @@ struct spu_global
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
-   spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
-   spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 42eb06a362..04202a7657 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -126,10 +126,9 @@ spu_clamp(vector signed int vec, vector signed int max)
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
-sample_texture4_nearest(vector float s, vector float t,
-                        vector float r, vector float q,
-                        uint unit, uint level, uint face,
-                        vector float colors[4])
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    vector float ss = spu_mul(s, tlevel->scale_s);
@@ -158,10 +157,9 @@ sample_texture4_nearest(vector float s, vector float t,
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
-sample_texture4_bilinear(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, uint level, uint face,
-                         vector float colors[4])
+sample_texture_2d_bilinear(vector float s, vector float t,
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -308,10 +306,9 @@ transpose(vector unsigned int *mOut0,
  * Bilinear filtering, using int intead of float arithmetic
  */
 void
-sample_texture4_bilinear_2(vector float s, vector float t,
-                           vector float r, vector float q,
-                           uint unit, uint level, uint face,
-                           vector float colors[4])
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -444,10 +441,9 @@ compute_lambda(uint unit, vector float s, vector float t)
  * Texture sampling with level of detail selection.
  */
 void
-sample_texture4_lod(vector float s, vector float t,
-                    vector float r, vector float q,
-                    uint unit, uint level_ignored, uint face,
-                    vector float colors[4])
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level_ignored, uint face,
+                      vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -455,6 +451,9 @@ sample_texture4_lod(vector float s, vector float t,
     */
    float lambda = compute_lambda(unit, s, t);
 
+   (void) face;
+   (void) level_ignored;
+
    /* apply lod bias */
    lambda += spu.sampler[unit].lod_bias;
 
@@ -466,14 +465,14 @@ sample_texture4_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
+      spu.mag_sample_texture_2d[unit](s, t, unit, 0, 0, colors);
    }
    else {
       /* minify */
       int level = (int) (lambda + 0.5f);
       if (level > (int) spu.texture[unit].max_level)
          level = spu.texture[unit].max_level;
-      spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
+      spu.min_sample_texture_2d[unit](s, t, unit, level, 0, colors);
       /* XXX to do: mipmap level interpolation */
    }
 }
@@ -552,13 +551,10 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
 
 
 void
-sample_texture4_cube(vector float s, vector float t,
-                     vector float r, vector float q,
-                     uint unit, uint level, uint face_ignored,
-                     vector float colors[4])
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4])
 {
-   static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
-   uint p, faces[4];
+   uint p, faces[4], level = 0;
    float newS[4], newT[4];
 
    /* Compute cube face referenced by the four sets of texcoords.
@@ -577,15 +573,15 @@ sample_texture4_cube(vector float s, vector float t,
       /* GOOD!  All four texcoords refer to the same cube face */
       s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
       t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
-      sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+      sample_texture_2d_nearest(s, t, unit, level, faces[0], colors);
    }
    else {
       /* BAD!  The four texcoords refer to different faces */
       for (p = 0; p < 4; p++) {      
          vector float c[4];
 
-         sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
-                                 zero, zero, unit, level, faces[p], c);
+         sample_texture_2d_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+                                   unit, level, faces[p], c);
 
          float red = spu_extract(c[0], p);
          float green = spu_extract(c[1], p);
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 387484c3ad..7b75b007b5 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -37,37 +37,31 @@ invalidate_tex_cache(void);
 
 
 extern void
-sample_texture4_nearest(vector float s, vector float t,
-                        vector float r, vector float q,
-                        uint unit, uint level, uint face,
-                        vector float colors[4]);
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4]);
 
 
 extern void
-sample_texture4_bilinear(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, uint level, uint face,
-                         vector float colors[4]);
-
-extern void
-sample_texture4_bilinear_2(vector float s, vector float t,
-                           vector float r, vector float q,
+sample_texture_2d_bilinear(vector float s, vector float t,
                            uint unit, uint level, uint face,
                            vector float colors[4]);
 
+extern void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4]);
+
 
 extern void
-sample_texture4_lod(vector float s, vector float t,
-                    vector float r, vector float q,
-                    uint unit, uint level, uint face,
-                    vector float colors[4]);
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level, uint face,
+                      vector float colors[4]);
 
 
 extern void
-sample_texture4_cube(vector float s, vector float t,
-                     vector float r, vector float q,
-                     uint unit, uint level_ignored, uint face_ignored,
-                     vector float colors[4]);
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 9fa8671c73fa44a95e2ea7fed6047bddb042796f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 20:25:28 -0600
Subject: cell: add new debug flag (cache) to report texture cache stats on
 exit

---
 src/gallium/drivers/cell/common.h           | 1 +
 src/gallium/drivers/cell/ppu/cell_context.c | 1 +
 src/gallium/drivers/cell/spu/spu_command.c  | 3 ++-
 src/gallium/drivers/cell/spu/spu_dcache.c   | 4 +++-
 4 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 469d56cda8..9ca4e9d67e 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -117,6 +117,7 @@
 #define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
 #define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
 #define CELL_DEBUG_CMD                  (1 << 5)
+#define CELL_DEBUG_CACHE                (1 << 6)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 4dad490ce1..7a2d93ecb4 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -94,6 +94,7 @@ static const struct debug_named_value cell_debug_flags[] = {
    {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
    {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
    {"cmd", CELL_DEBUG_CMD},       /**< SPUs dump command buffer info */
+   {"cache", CELL_DEBUG_CACHE},   /**< report texture cache stats on exit */
    {NULL, 0}
 };
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ff4a52d79a..9c853c0961 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -720,5 +720,6 @@ command_loop(void)
 
    D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
 
-   spu_dcache_report();
+   if (spu.init.debug_flags & CELL_DEBUG_CACHE)
+      spu_dcache_report();
 }
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 167404cdc5..a6d67634fd 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -36,7 +36,9 @@
 #define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
 #define CACHE_LOG2NNWAY       2
 #define CACHE_LOG2NSETS       6
-/*#define CACHE_STATS           1*/
+#ifdef DEBUG
+#define CACHE_STATS           1
+#endif
 #include <cache-api.h>
 
 /* Yes folks, this is ugly.
-- 
cgit v1.2.3


From 70dd4379d2cd54f229c3940312537912470218d3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 10:34:13 -0600
Subject: cell: implement fencing for texture buffers

If we delete a texture, we need to keep the underlying tiled data buffer
around until any rendering that references it has completed.
Keep a list of buffers referenced by a rendering batch.  Unref/free them when
the associated batch's fence is executed/signalled.
---
 src/gallium/drivers/cell/common.h              |  25 ++++
 src/gallium/drivers/cell/ppu/Makefile          |   1 +
 src/gallium/drivers/cell/ppu/cell_batch.c      |  32 +++++
 src/gallium/drivers/cell/ppu/cell_context.c    |   6 +
 src/gallium/drivers/cell/ppu/cell_context.h    |  21 ++++
 src/gallium/drivers/cell/ppu/cell_fence.c      | 158 +++++++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_fence.h      |  57 +++++++++
 src/gallium/drivers/cell/ppu/cell_state_emit.c |   2 +-
 src/gallium/drivers/cell/ppu/cell_texture.c    |  33 ++++--
 src/gallium/drivers/cell/ppu/cell_texture.h    |   5 +-
 src/gallium/drivers/cell/ppu/cell_vbuf.c       |   6 +
 src/gallium/drivers/cell/spu/spu_command.c     |  38 +++++-
 src/gallium/drivers/cell/spu/spu_main.h        |   2 +-
 13 files changed, 367 insertions(+), 19 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.h

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 9ca4e9d67e..23fb0b0831 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -102,6 +102,8 @@
 #define CELL_CMD_STATE_RASTERIZER    22
 #define CELL_CMD_VS_EXECUTE          23
 #define CELL_CMD_FLUSH_BUFFER_RANGE  24
+#define CELL_CMD_FENCE               25
+
 
 /** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
@@ -123,6 +125,29 @@
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
 
 
+
+#define CELL_FENCE_IDLE      0
+#define CELL_FENCE_EMITTED   1
+#define CELL_FENCE_SIGNALLED 2
+
+struct cell_fence
+{
+   /** There's a 16-byte status qword per SPU */
+   volatile uint status[CELL_MAX_SPUS][4];
+};
+
+
+/**
+ * Fence command sent to SPUs.  In response, the SPUs will write
+ * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory.
+ */
+struct cell_command_fence
+{
+   uint64_t opcode;      /**< CELL_CMD_FENCE */
+   struct cell_fence *fence;
+};
+
+
 /**
  * Command to specify per-fragment operations state and generated code.
  * Note that the dsa, blend, blend_color fields are really only needed
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b28f4c5c31..9358a47284 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -24,6 +24,7 @@ SOURCES = \
 	cell_clear.c \
 	cell_context.c \
 	cell_draw_arrays.c \
+	cell_fence.c \
 	cell_flush.c \
 	cell_gen_fragment.c \
 	cell_gen_fp.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 01254aed60..448b723d85 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -28,6 +28,7 @@
 
 #include "cell_context.h"
 #include "cell_batch.h"
+#include "cell_fence.h"
 #include "cell_spu.h"
 
 
@@ -63,6 +64,10 @@ cell_get_empty_buffer(struct cell_context *cell)
                printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries);
                */
                prev_buffer = buf;
+
+               /* release tex buffer associated w/ prev use of this batch buf */
+               cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]);
+
                return buf;
             }
          }
@@ -84,6 +89,26 @@ cell_get_empty_buffer(struct cell_context *cell)
 }
 
 
+/**
+ * Append a fence command to the current batch buffer.
+ * Note that we're sure there's always room for this because of the
+ * adjusted size check in cell_batch_free_space().
+ */
+static void
+emit_fence(struct cell_context *cell)
+{
+   const uint batch = cell->cur_batch;
+   const uint size = cell->buffer_size[batch];
+   struct cell_command_fence *fence_cmd;
+
+   ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
+
+   fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
+   fence_cmd->opcode = CELL_CMD_FENCE;
+   fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+}
+
+
 /**
  * Flush the current batch buffer to the SPUs.
  * An empty buffer will be found and set as the new current batch buffer
@@ -102,6 +127,12 @@ cell_batch_flush(struct cell_context *cell)
    if (size == 0)
       return;
 
+   /* Before we use this batch buffer, make sure any fenced texture buffers
+    * are released.
+    */
+   if (cell->fenced_buffers[batch].head)
+      emit_fence(cell);
+
    flushing = TRUE;
 
    assert(batch < CELL_NUM_BUFFERS);
@@ -142,6 +173,7 @@ uint
 cell_batch_free_space(const struct cell_context *cell)
 {
    uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
+   free -= sizeof(struct cell_command_fence);
    return free;
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 7a2d93ecb4..22d552d8e3 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -47,6 +47,7 @@
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_state.h"
 #include "cell_surface.h"
@@ -104,6 +105,7 @@ cell_create_context(struct pipe_screen *screen,
                     struct cell_winsys *cws)
 {
    struct cell_context *cell;
+   uint i;
 
    /* some fields need to be 16-byte aligned, so align the whole object */
    cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
@@ -151,6 +153,10 @@ cell_create_context(struct pipe_screen *screen,
                                               cell_debug_flags, 
                                               0 );
 
+   for (i = 0; i < CELL_NUM_BUFFERS; i++)
+      cell_fence_init(&cell->fenced_buffers[i].fence);
+
+
    /*
     * SPU stuff
     */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index ad1f4829a4..4491ae8cdf 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -81,6 +81,19 @@ struct cell_fragment_ops_key
 };
 
 
+struct cell_buffer_node;
+
+/**
+ * Fenced buffer list.  List of buffers which can be unreferenced after
+ * the fence has been executed/signalled.
+ */
+struct cell_buffer_list
+{
+   struct cell_fence fence;
+   struct cell_buffer_node *head;
+};
+
+
 /**
  * Per-context state, subclass of pipe_context.
  */
@@ -154,6 +167,14 @@ struct cell_context
    uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
 
+   /** Associated with each command/batch buffer is a list of pipe_buffers
+    * that are fenced.  When the last command in a buffer is executed, the
+    * fence will be signalled, indicating that any pipe_buffers preceeding
+    * that fence can be unreferenced (and probably freed).
+    */
+   struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS];
+
+
    struct spe_function attrib_fetch;
    unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
 
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
new file mode 100644
index 0000000000..ffb3bea12b
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -0,0 +1,158 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <unistd.h>
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_fence.h"
+#include "cell_texture.h"
+
+
+void
+cell_fence_init(struct cell_fence *fence)
+{
+   uint i;
+   for (i = 0; i < CELL_MAX_SPUS; i++) {
+      fence->status[i][0] = CELL_FENCE_IDLE;
+   }
+}
+
+
+boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence)
+{
+   uint i;
+   for (i = 0; i < cell->num_spus; i++) {
+      //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
+      if (fence->status[i][0] == CELL_FENCE_EMITTED)
+         return FALSE;
+   }
+   return TRUE;
+}
+
+
+void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence)
+{
+   while (!cell_fence_signalled(cell, fence)) {
+      usleep(10);
+   }
+}
+
+
+
+
+struct cell_buffer_node
+{
+   struct pipe_buffer *buffer;
+   struct cell_buffer_node *next;
+};
+
+
+static void
+cell_add_buffer_to_list(struct cell_context *cell,
+                        struct cell_buffer_list *list,
+                        struct pipe_buffer *buffer)
+{
+   struct pipe_screen *ps = cell->pipe.screen;
+   struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node);
+   /* create new list node which references the buffer, insert at head */
+   if (node) {
+      pipe_buffer_reference(ps, &node->buffer, buffer);
+      node->next = list->head;
+      list->head = node;
+   }
+}
+
+
+/**
+ * Wait for completion of the given fence, then unreference any buffers
+ * on the list.
+ * This typically unrefs/frees texture buffers after any rendering which uses
+ * them has completed.
+ */
+void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list)
+{
+   if (list->head) {
+      struct pipe_screen *ps = cell->pipe.screen;
+      struct cell_buffer_node *node;
+
+      cell_fence_finish(cell, &list->fence);
+
+      /* traverse the list, unreferencing buffers, freeing nodes */
+      node = list->head;
+      while (node) {
+         struct cell_buffer_node *next = node->next;
+         assert(node->buffer);
+         pipe_buffer_unmap(ps, node->buffer);
+#if 0
+         printf("Unref buffer %p\n", node->buffer);
+         if (node->buffer->refcount == 1)
+            printf("   Delete!\n");
+#endif
+         pipe_buffer_reference(ps, &node->buffer, NULL);
+         FREE(node);
+         node = next;
+      }
+      list->head = NULL;
+   }
+}
+
+
+/**
+ * This should be called for each render command.
+ * Any texture buffers that are current bound will be added to a fenced
+ * list to be freed later when the fence is executed/signalled.
+ */
+void
+cell_add_fenced_textures(struct cell_context *cell)
+{
+   struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch];
+   uint i;
+
+   for (i = 0; i < cell->num_textures; i++) {
+      struct cell_texture *ct = cell->texture[i];
+      if (ct) {
+         uint level;
+         for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+            if (ct->tiled_buffer[level]) {
+#if 0
+               printf("Adding texture %p buffer %p to list\n",
+                      ct, ct->tiled_buffer[level]);
+#endif
+               cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]);
+            }
+         }
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h
new file mode 100644
index 0000000000..536b4ba411
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_FENCE_H
+#define CELL_FENCE_H
+
+
+extern void
+cell_fence_init(struct cell_fence *fence);
+
+
+extern boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence);
+
+
+extern void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence);
+
+
+
+extern void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list);
+
+
+extern void
+cell_add_fenced_textures(struct cell_context *cell);
+
+
+#endif /* CELL_FENCE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index effcd2a1e1..dd2d7f7d1e 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -225,7 +225,7 @@ cell_emit_state(struct cell_context *cell)
             if (cell->texture[i]) {
                uint level;
                for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
-                  texture->start[level] = cell->texture[i]->tiled_data[level];
+                  texture->start[level] = cell->texture[i]->tiled_mapped[level];
                   texture->width[level] = cell->texture[i]->base.width[level];
                   texture->height[level] = cell->texture[i]->base.height[level];
                   texture->depth[level] = cell->texture[i]->base.depth[level];
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9c6741f1bc..9ac2f3bbb9 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -136,6 +136,9 @@ cell_texture_release(struct pipe_screen *screen,
        __FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
    */
    if (--(*pt)->refcount <= 0) {
+      /* Delete this texture now.
+       * But note that the underlying pipe_buffer may linger...
+       */
       struct cell_texture *ct = cell_texture(*pt);
       uint i;
 
@@ -146,14 +149,12 @@ cell_texture_release(struct pipe_screen *screen,
       pipe_buffer_reference(screen, &ct->buffer, NULL);
 
       for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
-         if (ct->tiled_data[i]) {
-            /* XXX need to use a fenced buffer for tiled data so that
-             * it's properly freed after rendering has completed.
-             * Disabling this free() allows glDrawPixels to work for now.
-             */
-#if 0
-            align_free(ct->tiled_data[i]);
-#endif
+         /* Unreference the tiled image buffer.
+          * It may not actually be deleted until a fence is hit.
+          */
+         if (ct->tiled_buffer[i]) {
+            ct->tiled_mapped[i] = NULL;
+            winsys_buffer_reference(screen->winsys, &ct->tiled_buffer[i], NULL);
          }
       }
 
@@ -234,12 +235,18 @@ cell_twiddle_texture(struct pipe_screen *screen,
          int offset = bufWidth * bufHeight * 4 * surface->face;
          uint *dst;
 
-         if (!ct->tiled_data[level]) {
-            ct->tiled_data[level] =
-               align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+         if (!ct->tiled_buffer[level]) {
+            /* allocate buffer for tiled data now */
+            struct pipe_winsys *ws = screen->winsys;
+            uint bytes = bufWidth * bufHeight * 4 * numFaces;
+            ct->tiled_buffer[level] = ws->buffer_create(ws, 16,
+                                                        PIPE_BUFFER_USAGE_PIXEL,
+                                                        bytes);
+            /* and map it */
+            ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level],
+                                                     PIPE_BUFFER_USAGE_GPU_READ);
          }
-
-         dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+         dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset);
 
          twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
                             surface->stride, src);
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index a0757091b0..2f5fe0dd1b 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -48,7 +48,10 @@ struct cell_texture
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data[CELL_MAX_TEXTURE_LEVELS];  /* XXX this may be temporary */ /*ALIGN16*/
+   /** Texture data in tiled layout is held here */
+   struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
+   /** Mapped, tiled texture data */
+   void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..65ba51b6bb 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -38,6 +38,7 @@
 
 #include "cell_batch.h"
 #include "cell_context.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_spu.h"
 #include "cell_vbuf.h"
@@ -108,6 +109,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
           __FUNCTION__, cvbr->vertex_buf, vertices_used);
    */
 
+   /* Make sure texture buffers aren't released until we're done rendering
+    * with them.
+    */
+   cell_add_fenced_textures(cell);
+
    /* Tell SPUs they can release the vert buf */
    if (cvbr->vertex_buf != ~0U) {
       struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 9c853c0961..a6ed29ea63 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -76,9 +76,10 @@ static void
 release_buffer(uint buffer)
 {
    /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
+   static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE};
    const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
    uint *dst = spu.init.buffer_status + index;
 
@@ -93,6 +94,29 @@ release_buffer(uint buffer)
 }
 
 
+/**
+ * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory.
+ * There's a qword of status per SPU.
+ */
+static void
+cmd_fence(struct cell_command_fence *fence_cmd)
+{
+   static const vector unsigned int status = {CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED};
+   uint *dst = (uint *) fence_cmd->fence;
+   dst += 4 * spu.init.id;  /* main store/memory address, not local store */
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_FENCE,           /* tag */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
@@ -637,6 +661,14 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
+      case CELL_CMD_FENCE:
+         {
+            struct cell_command_fence *fence_cmd =
+               (struct cell_command_fence *) &buffer[pos];
+            cmd_fence(fence_cmd);
+            pos += sizeof(*fence_cmd) / 8;
+         }
+         break;
       case CELL_CMD_RELEASE_VERTS:
          {
             struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 95ef4c9244..668af10be2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -210,7 +210,7 @@ extern struct spu_global spu;
 #define TAG_DCACHE1           21
 #define TAG_DCACHE2           22
 #define TAG_DCACHE3           23
-
+#define TAG_FENCE             24
 
 
 static INLINE void
-- 
cgit v1.2.3


From db680ac0e3697ecc2c2dbd5f22c4c2fdb136b62c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 28 Oct 2008 14:03:51 -0600
Subject: cell: fix a number of fence issues

Plus add assertions to check status, alignment, etc.
---
 src/gallium/drivers/cell/ppu/cell_batch.c   | 19 ++++++++++++++++---
 src/gallium/drivers/cell/ppu/cell_context.h |  2 +-
 src/gallium/drivers/cell/ppu/cell_fence.c   | 14 ++++++++++++--
 src/gallium/drivers/cell/spu/spu_command.c  |  2 +-
 4 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 448b723d85..962775cd33 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -100,12 +100,23 @@ emit_fence(struct cell_context *cell)
    const uint batch = cell->cur_batch;
    const uint size = cell->buffer_size[batch];
    struct cell_command_fence *fence_cmd;
+   struct cell_fence *fence = &cell->fenced_buffers[batch].fence;
+   uint i;
+
+   /* set fence status to emitted, not yet signalled */
+   for (i = 0; i < cell->num_spus; i++) {
+      fence->status[i][0] = CELL_FENCE_EMITTED;
+   }
 
    ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
 
    fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
    fence_cmd->opcode = CELL_CMD_FENCE;
-   fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+   fence_cmd->fence = fence;
+
+   /* update batch buffer size */
+   cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
+   assert(sizeof(struct cell_command_fence) % 8 == 0);
 }
 
 
@@ -119,7 +130,7 @@ cell_batch_flush(struct cell_context *cell)
 {
    static boolean flushing = FALSE;
    uint batch = cell->cur_batch;
-   const uint size = cell->buffer_size[batch];
+   uint size = cell->buffer_size[batch];
    uint spu, cmd_word;
 
    assert(!flushing);
@@ -130,8 +141,10 @@ cell_batch_flush(struct cell_context *cell)
    /* Before we use this batch buffer, make sure any fenced texture buffers
     * are released.
     */
-   if (cell->fenced_buffers[batch].head)
+   if (cell->fenced_buffers[batch].head) {
       emit_fence(cell);
+      size = cell->buffer_size[batch];
+   }
 
    flushing = TRUE;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 4491ae8cdf..eb1397bb3f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
  */
 struct cell_buffer_list
 {
-   struct cell_fence fence;
+   struct cell_fence fence ALIGN16_ATTRIB;
    struct cell_buffer_node *head;
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
index ffb3bea12b..867b5dcaa0 100644
--- a/src/gallium/drivers/cell/ppu/cell_fence.c
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -38,6 +38,7 @@ void
 cell_fence_init(struct cell_fence *fence)
 {
    uint i;
+   ASSERT_ALIGN16(fence->status);
    for (i = 0; i < CELL_MAX_SPUS; i++) {
       fence->status[i][0] = CELL_FENCE_IDLE;
    }
@@ -50,9 +51,9 @@ cell_fence_signalled(const struct cell_context *cell,
 {
    uint i;
    for (i = 0; i < cell->num_spus; i++) {
-      //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
-      if (fence->status[i][0] == CELL_FENCE_EMITTED)
+      if (fence->status[i][0] != CELL_FENCE_SIGNALLED)
          return FALSE;
+      /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/
    }
    return TRUE;
 }
@@ -65,6 +66,15 @@ cell_fence_finish(const struct cell_context *cell,
    while (!cell_fence_signalled(cell, fence)) {
       usleep(10);
    }
+
+#ifdef DEBUG
+   {
+      uint i;
+      for (i = 0; i < cell->num_spus; i++) {
+         assert(fence->status[i][0] == CELL_FENCE_SIGNALLED);
+      }
+   }
+#endif
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a6ed29ea63..63818d4c46 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -107,7 +107,7 @@ cmd_fence(struct cell_command_fence *fence_cmd)
                                               CELL_FENCE_SIGNALLED};
    uint *dst = (uint *) fence_cmd->fence;
    dst += 4 * spu.init.id;  /* main store/memory address, not local store */
-
+   ASSERT_ALIGN16(dst);
    mfc_put((void *) &status,    /* src in local memory */
            (unsigned int) dst,  /* dst in main memory */
            sizeof(status),      /* size */
-- 
cgit v1.2.3


From 711f8a1dd94e2e1e715615d947e03015ef972326 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 30 Oct 2008 15:24:23 -0600
Subject: CELL: stencil bug fixes

Two definitive bugs in stenciling were fixed.

The first, reversed registers in the generated Select Bytes (selb)
instruction, caused the stenciling INCR and DECR operations to
fail dramatically, putting new values in where old values were
supposed to be and vice versa.

The second caused stencil tiles to not be read and written from
main memory by the SPUs.  A per-spu flag, spu.read_depth, was used
to indicate whether the SPU should be reading depth tiles, and was set
only when depth was enabled.  A second flag, spu.read_stencil, was
set when stenciling was enabled, but never referenced.

As stenciling and depth are in the same tiles on the Cell, and there
is no corresponding TAG_WRITE_TILE_STENCIL to complement
TAG_WRITE_TILE_COLOR and TAG_WRITE_TILE_Z, I fixed this by
eliminating the unused "spu.read_stencil", renaming "spu.read_depth"
to "spu.read_depth_stencil", and setting it if either stenciling or
depth is enabled.

I also added an optimization to the fragment ops generation code,
that avoids calculating stencil values and/or stencil writemask
when the stencil operations are all KEEP.
---
 progs/trivial/tri-stencil.c                      | 13 ++++++++++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 25 ++++++++++++++++++------
 src/gallium/drivers/cell/spu/spu_command.c       |  3 +--
 src/gallium/drivers/cell/spu/spu_main.h          |  3 +--
 src/gallium/drivers/cell/spu/spu_render.c        |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.c           |  2 +-
 6 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/progs/trivial/tri-stencil.c b/progs/trivial/tri-stencil.c
index 5edbef26ce..7686e16aef 100644
--- a/progs/trivial/tri-stencil.c
+++ b/progs/trivial/tri-stencil.c
@@ -49,7 +49,15 @@ static void Key(unsigned char key, int x, int y)
 
     switch (key) {
       case 27:
+        printf("Exiting...\n");
 	exit(1);
+      case 'r':
+        printf("Redisplaying...\n");
+        glutPostRedisplay();
+        break;
+      default:
+        printf("No such key '%c'...\n", key);
+        break;
     }
 }
 
@@ -89,7 +97,7 @@ static void Draw(void)
    glEnd();
 #endif
 
-#if 0
+#if 1
    glStencilFunc(GL_EQUAL, 1, 1);
    glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);
 
@@ -130,7 +138,8 @@ int main(int argc, char **argv)
 	exit(1);
     }
 
-    glutInitWindowPosition(0, 0); glutInitWindowSize( 300, 300);
+    glutInitWindowPosition(0, 0); 
+    glutInitWindowSize( 300, 300);
 
     type = GLUT_RGB | GLUT_SINGLE | GLUT_DEPTH | GLUT_STENCIL;
     glutInitDisplayMode(type);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 4e1e53ecdc..8e4dd82404 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1282,7 +1282,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
       /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
       spe_ai(f, newS_reg, fbS_reg, 1);
       /* Select from the current value or the new value based on the equality test */
-      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
 
       spe_release_register(f, equals_reg);
       break;
@@ -1295,7 +1295,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
       /* Add Word Immediate with a (-1) value works */
       spe_ai(f, newS_reg, fbS_reg, -1);
       /* Select from the current value or the new value based on the equality test */
-      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
 
       spe_release_register(f, equals_reg);
       break;
@@ -1534,15 +1534,28 @@ gen_stencil_depth_test(struct spe_function *f,
     * meaning that we have to calculate the stencil values but do not
     * need to mask them), we can avoid generating code.  Don't forget
     * that we need to consider backfacing stencil, if enabled.
+    *
+    * Note that if the backface stencil is *not* enabled, the backface
+    * stencil will have the same values as the frontface stencil.
     */
-   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
-      /* Trivial: don't need to calculate stencil values, and don't need to 
-       * write them back to the framebuffer.
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+       /* No changes to any stencil values */
+       need_to_calculate_stencil_values = false;
+       need_to_writemask_stencil_values = false;
+    }
+    else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+      /* All changes are writemasked out, so no need to calculate
+       * what those changes might be, and no need to write anything back.
        */
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+   else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 63818d4c46..d726622d94 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -244,8 +244,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       }
    }
 
-   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
-   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 668af10be2..692790c9f3 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -160,8 +160,7 @@ struct spu_global
    tile_t ztile ALIGN16_ATTRIB;
 
    /** Read depth/stencil tiles? */
-   boolean read_depth;
-   boolean read_stencil;
+   boolean read_depth_stencil;
 
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5515bb55c9..7c225e2f27 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
 static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
          //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
 wait_put_cz_tiles(void)
 {
    wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       wait_on_mask(1 << TAG_WRITE_TILE_Z);
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 4caf7d6b61..5f908159bb 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -369,7 +369,7 @@ flush_spans(void)
    }
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
-- 
cgit v1.2.3


From 90027f85786406133a5180998a75fb612b6a221e Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Tue, 11 Nov 2008 13:57:10 -0700
Subject: CELL: two-sided stencil fixes

With these changes, the tests/stencil_twoside test now works.

- Eliminate blending from the stencil_twoside test, as it produces an
  unneeded dependency on having blending working

- The spe_splat() function will now work if the register being splatted
  and the destination register are the same

- Separate fragment code generated for front-facing and back-facing
  fragments.  Often these are the same; if two-sided stenciling is on,
  they can be different.  This is easier and faster than generating
  code that does both tests and merges the results.

- Fixed a cut/paste bug where if the back Z-pass stencil operation
  were different from all the other operations, the back Z-fail
  results were incorrect.
---
 progs/tests/stencil_twoside.c                      |   2 -
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        |   7 +-
 src/gallium/drivers/cell/common.h                  |   6 +-
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 239 ++++++---------------
 src/gallium/drivers/cell/ppu/cell_gen_fragment.h   |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  19 +-
 src/gallium/drivers/cell/spu/spu_command.c         |   6 +-
 src/gallium/drivers/cell/spu/spu_main.c            |   6 +-
 src/gallium/drivers/cell/spu/spu_main.h            |  10 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |   3 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   3 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  20 +-
 12 files changed, 115 insertions(+), 208 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/progs/tests/stencil_twoside.c b/progs/tests/stencil_twoside.c
index be9d9a776a..8826c46fc2 100644
--- a/progs/tests/stencil_twoside.c
+++ b/progs/tests/stencil_twoside.c
@@ -115,7 +115,6 @@ static void Display( void )
    glVertex2f(-1,  1);
    glEnd();
 
-
    if (use20syntax) {
       stencil_func_separate(GL_FRONT, GL_ALWAYS, 0, ~0);
       stencil_func_separate(GL_BACK, GL_ALWAYS, 0, ~0);
@@ -279,7 +278,6 @@ static void Init( void )
    stencil_op_separate = glutGetProcAddress( "glStencilOpSeparate" );
 
    printf("\nAll 5 squares should be the same color.\n");
-   glEnable( GL_BLEND );
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index f8568f690b..1bd9f1c8dd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -958,9 +958,12 @@ spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsig
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
+   /* Use a temporary, just in case rT == rA */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
    /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
-   spe_ila(p, rT, 0x00010203);
-   spe_shufb(p, rT, rA, rA, rT);
+   spe_ila(p, tmp_reg, 0x00010203);
+   spe_shufb(p, rT, rA, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
 }
 
 
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 87488ea2d7..a670ed3c6e 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -130,6 +130,9 @@
 #define CELL_FENCE_EMITTED   1
 #define CELL_FENCE_SIGNALLED 2
 
+#define CELL_FACING_FRONT    0
+#define CELL_FACING_BACK     1
+
 struct cell_fence
 {
    /** There's a 16-byte status qword per SPU */
@@ -160,7 +163,8 @@ struct cell_command_fragment_ops
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
-   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS];
+   unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index d9c3ff3f4d..6e425eafaa 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1412,144 +1412,72 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
  * and released by the corresponding spe_release_register_set() call.
  */
 static void
-gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *stencil,
+                       const unsigned int depth_enabled,
                        unsigned int fbS_reg, 
                        unsigned int *fail_reg, unsigned int *zfail_reg, 
-                       unsigned int *zpass_reg, unsigned int *back_fail_reg, 
-                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+                       unsigned int *zpass_reg)
 {
-   unsigned zfail_op, back_zfail_op;
+   unsigned zfail_op;
 
    /* Stenciling had better be enabled here */
-   ASSERT(dsa->stencil[0].enabled);
+   ASSERT(stencil->enabled);
 
    /* If the depth test is not enabled, it is treated as though it always
-    * passes.  In particular, that means that the "zfail_op" (and the backfacing
-    * counterpart, if active) are not considered - a failing stencil test will
-    * trigger the "fail_op", and a passing stencil test will trigger the
-    * "zpass_op".
+    * passes, which means that the zfail_op is not considered - a
+    * failing stencil test triggers the fail_op, and a passing one
+    * triggers the zpass_op
     *
-    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
-    * we keep them from being calculated.
+    * As an optimization, override calculation of the zfail_op values
+    * if they aren't going to be used.  By setting the value of
+    * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
+    * to match the incoming stencil values, and no calculation will
+    * be done.
     */
-   if (dsa->depth.enabled) {
-      zfail_op = dsa->stencil[0].zfail_op;
-      back_zfail_op = dsa->stencil[1].zfail_op;
+   if (depth_enabled) {
+      zfail_op = stencil->zfail_op;
    }
    else {
       zfail_op = PIPE_STENCIL_OP_KEEP;
-      back_zfail_op = PIPE_STENCIL_OP_KEEP;
    }
 
    /* One-sided or front-facing stencil */
-   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
       *fail_reg = fbS_reg;
    }
    else {
       *fail_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->fail_op, stencil->ref_value, 
          0xff, fbS_reg, *fail_reg);
    }
 
+   /* Check the possibly overridden value, not the structure value */
    if (zfail_op == PIPE_STENCIL_OP_KEEP) {
       *zfail_reg = fbS_reg;
    }
-   else if (zfail_op == dsa->stencil[0].fail_op) {
+   else if (zfail_op == stencil->fail_op) {
       *zfail_reg = *fail_reg;
    }
    else {
       *zfail_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->zfail_op, stencil->ref_value, 
          0xff, fbS_reg, *zfail_reg);
    }
 
-   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+   if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
       *zpass_reg = fbS_reg;
    }
-   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+   else if (stencil->zpass_op == stencil->fail_op) {
       *zpass_reg = *fail_reg;
    }
-   else if (dsa->stencil[0].zpass_op == zfail_op) {
+   else if (stencil->zpass_op == zfail_op) {
       *zpass_reg = *zfail_reg;
    }
    else {
       *zpass_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->zpass_op, stencil->ref_value, 
          0xff, fbS_reg, *zpass_reg);
    }
-
-   /* If two-sided stencil is enabled, we have more work to do. */
-   if (!dsa->stencil[1].enabled) {
-      /* This just flags that the registers need not be deallocated later */
-      *back_fail_reg = fbS_reg;
-      *back_zfail_reg = fbS_reg;
-      *back_zpass_reg = fbS_reg;
-   }
-   else {
-      /* Same calculations as above, but for the back stencil */
-      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
-         *back_fail_reg = fbS_reg;
-      }
-      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
-         *back_fail_reg = *fail_reg;
-      }
-      else if (dsa->stencil[1].fail_op == zfail_op) {
-         *back_fail_reg = *zfail_reg;
-      }
-      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
-         *back_fail_reg = *zpass_reg;
-      }
-      else {
-         *back_fail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_fail_reg);
-      }
-
-      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
-         *back_zfail_reg = fbS_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[0].fail_op) {
-         *back_zfail_reg = *fail_reg;
-      }
-      else if (back_zfail_op == zfail_op) {
-         *back_zfail_reg = *zfail_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
-         *back_zfail_reg = *zpass_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[1].fail_op) {
-         *back_zfail_reg = *back_fail_reg;
-      }
-      else {
-         *back_zfail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_zfail_reg);
-      }
-
-      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
-         *back_zpass_reg = fbS_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
-         *back_zpass_reg = *fail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == zfail_op) {
-         *back_zpass_reg = *zfail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
-         *back_zpass_reg = *zpass_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
-         *back_zpass_reg = *back_fail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
-         *back_zpass_reg = *back_zfail_reg;
-      }
-      else {
-         *back_zfail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_zpass_reg);
-      }
-   } /* End of calculations for back-facing stencil */
 }
 
 /* Note that fbZ_reg may *not* be set on entry, if in fact
@@ -1559,7 +1487,7 @@ gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_a
 static boolean
 gen_stencil_depth_test(struct spe_function *f, 
                        const struct pipe_depth_stencil_alpha_state *dsa, 
-                       const int const facing_reg,
+                       const uint facing,
                        const int mask_reg, const int fragZ_reg, 
                        const int fbZ_reg, const int fbS_reg)
 {
@@ -1571,6 +1499,8 @@ gen_stencil_depth_test(struct spe_function *f,
    boolean need_to_calculate_stencil_values;
    boolean need_to_writemask_stencil_values;
 
+   struct pipe_stencil_state *stencil;
+
    /* Registers.  We may or may not actually allocate these, depending
     * on whether the state values indicate that we need them.
     */
@@ -1598,6 +1528,20 @@ gen_stencil_depth_test(struct spe_function *f,
    spe_comment(f, 0, "Allocating stencil register set");
    spe_allocate_register_set(f);
 
+   /* The facing we're given is the fragment facing; it doesn't
+    * exactly match the stencil facing.  If stencil is enabled,
+    * but two-sided stencil is *not* enabled, we use the same
+    * stencil settings for both front- and back-facing fragments.
+    * We only use the "back-facing" stencil for backfacing fragments
+    * if two-sided stenciling is enabled.
+    */
+   if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
+      stencil = &dsa->stencil[1];
+   }
+   else {
+      stencil = &dsa->stencil[0];
+   }
+
    /* Calculate the writemask.  If the writemask is trivial (either
     * all 0s, meaning that we don't need to calculate any stencil values
     * because they're not going to change the stencil anyway, or all 1s,
@@ -1608,24 +1552,20 @@ gen_stencil_depth_test(struct spe_function *f,
     * Note that if the backface stencil is *not* enabled, the backface
     * stencil will have the same values as the frontface stencil.
     */
-   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
-       /* No changes to any stencil values */
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
        need_to_calculate_stencil_values = false;
        need_to_writemask_stencil_values = false;
     }
-    else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+    else if (stencil->write_mask == 0x0) {
       /* All changes are writemasked out, so no need to calculate
        * what those changes might be, and no need to write anything back.
        */
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
+   else if (stencil->write_mask == 0xff) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
@@ -1645,14 +1585,7 @@ gen_stencil_depth_test(struct spe_function *f,
        */
       spe_comment(f, 0, "Computing stencil writemask");
       stencil_writemask_reg = spe_allocate_available_register(f);
-      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
-      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
-         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
-         spe_comment(f, 0, "Resolving two-sided stencil writemask");
-         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
-         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
-         spe_release_register(f, back_write_mask_reg);
-      }
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].write_mask);
    }
 
    /* At least one-sided stenciling must be on.  Generate code that
@@ -1666,19 +1599,7 @@ gen_stencil_depth_test(struct spe_function *f,
     */
    spe_comment(f, 0, "Running basic stencil test");
    stencil_pass_reg = spe_allocate_available_register(f);
-   gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg);
-
-   /* If two-sided stenciling is on, generate code to run the stencil
-    * test on the backfacing stencil as well, and combine the two results
-    * into the one correct result based on facing.
-    */
-   if (dsa->stencil[1].enabled) {
-      unsigned int temp_reg = spe_allocate_available_register(f);
-      spe_comment(f, 0, "Running backface stencil test");
-      gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg);
-      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
-      spe_release_register(f, temp_reg);
-   }
+   gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
 
    /* Generate code that, given the mask of valid fragments and the
     * mask of valid fragments that passed the stencil test, computes
@@ -1698,9 +1619,6 @@ gen_stencil_depth_test(struct spe_function *f,
 
    /* We may not need to calculate stencil values, if the writemask is off */
    if (need_to_calculate_stencil_values) {
-      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
-      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
-
       /* Generate code that calculates exactly which stencil values we need,
        * without calculating the same value twice (say, if two different
        * stencil ops have the same value).  This code will work for one-sided
@@ -1715,51 +1633,11 @@ gen_stencil_depth_test(struct spe_function *f,
        * This function will allocate a variant number of registers that
        * will be released as part of the register set.
        */
-      spe_comment(f, 0, "Computing stencil values");
-      gen_get_stencil_values(f, dsa, fbS_reg, 
-         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
-         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
-         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
-
-      /* Tricky, tricky, tricky - the things we do to create optimal
-       * code...
-       *
-       * The various stencil values registers may overlap with each other
-       * and with fbS_reg arbitrarily (as any particular operation is
-       * only calculated once and stored in one register, no matter
-       * how many times it is used).  So we can't change the values 
-       * within those registers directly - if we change a value in a
-       * register that's being referenced by two different calculations,
-       * we've just unwittingly changed the second value as well...
-       *
-       * Avoid this by allocating new registers to hold the results
-       * (there may be 2, if the depth test is off, or 3, if it is on).
-       * These will be released as part of the register set.
-       */
-      if (!dsa->stencil[1].enabled) {
-         /* The easy case: if two-sided stenciling is *not* enabled, we
-          * just use the front-sided values.
-          */
-         stencil_fail_values = front_stencil_fail_values;
-         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
-         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
-      }
-      else { /* two-sided stencil enabled */
-         spe_comment(f, 0, "Resolving backface stencil values");
-         /* Allocate new registers for the needed merged values */
-         stencil_fail_values = spe_allocate_available_register(f);
-         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
-         if (dsa->depth.enabled) {
-            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
-            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
-         }
-         else {
-            stencil_pass_depth_fail_values = fbS_reg;
-         }
-         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
-         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
-      }
-   }
+      spe_comment(f, 0, facing == CELL_FACING_FRONT ? "Computing front-facing stencil values" : "Computing back-facing stencil values");
+      gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, 
+         &stencil_fail_values, &stencil_pass_depth_fail_values, 
+         &stencil_pass_depth_pass_values);
+   }  
 
    /* We now have all the stencil values we need.  We also need 
     * the results of the depth test to figure out which
@@ -1896,10 +1774,12 @@ gen_stencil_depth_test(struct spe_function *f,
  * should be much faster.
  *
  * \param cell  the rendering context (in)
+ * \param facing whether the generated code is for front-facing or 
+ *              back-facing fragments
  * \param f     the generated function (out)
  */
 void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f)
 {
    const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
    const struct pipe_blend_state *blend = cell->blend;
@@ -1917,7 +1797,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
-   const int facing_reg = 13; /* uint */
+
+   ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1945,7 +1826,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
-   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1969,6 +1849,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
+   /* Generate the alpha test, if needed. */
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
@@ -2095,7 +1976,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
           * gen_stencil_depth_test() function must ignore the
           * fbZ_reg register if depth is not enabled.
           */
-         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
index b59de198dc..2fabfdfb08 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -31,7 +31,7 @@
 
 
 extern void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f);
 
 
 #endif /* CELL_GEN_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index dd2d7f7d1e..031b27f11f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -75,23 +75,29 @@ lookup_fragment_ops(struct cell_context *cell)
     * If not found, create/save new fragment ops command.
     */
    if (!ops) {
-      struct spe_function spe_code;
+      struct spe_function spe_code_front, spe_code_back;
 
       if (0)
          debug_printf("**** Create New Fragment Ops\n");
 
       /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
 
-      /* generate new code */
-      cell_gen_fragment_function(cell, &spe_code);
+      /* generate new code.  Always generate new code for both front-facing
+       * and back-facing fragments, even if it's the same code in both
+       * cases.
+       */
+      cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
+      cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
 
       /* alloc new fragment ops command */
       ops = CALLOC_STRUCT(cell_command_fragment_ops);
 
       /* populate the new cell_command_fragment_ops object */
       ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(ops->code, spe_code.store, spe_code_size(&spe_code));
+      memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front));
+      memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back));
       ops->dsa = *cell->depth_stencil;
       ops->blend = *cell->blend;
 
@@ -99,7 +105,8 @@ lookup_fragment_ops(struct cell_context *cell)
       util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
 
       /* release rtasm buffer */
-      spe_release_func(&spe_code);
+      spe_release_func(&spe_code_front);
+      spe_release_func(&spe_code_back);
    }
    else {
       if (0)
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d726622d94..d5faf4e3aa 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -214,7 +214,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
@@ -234,7 +235,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
     * raw state records that the fallback code requires.
     */
    if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+      spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front;
+      spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back;
    }
    else {
       /* otherwise, the default fallback code remains in place */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index c8bb251905..7033f6037d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -63,7 +63,8 @@ one_time_init(void)
     * This will normally be overriden by a code-gen'd function
     * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
     */
-   spu.fragment_ops = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
 }
 
 
@@ -90,7 +91,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0);
    ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 692790c9f3..24cf7d77ce 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -85,8 +85,7 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask,
-                                      uint facing);
+                                      vector unsigned int mask);
 
 /** Function for running fragment program */
 typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
@@ -170,9 +169,10 @@ struct spu_global
    ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-   /** Current fragment ops function */
-   spu_fragment_ops_func fragment_ops;
+   uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
+   spu_fragment_ops_func fragment_ops[2];
 
    /** Current fragment program machine code, at 8-byte boundary */
    uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f8ffc70492..683664e8a4 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -75,8 +75,7 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask,
-                          uint facing)
+                          vector unsigned int mask)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index a61689c83a..f817abf046 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,8 +38,7 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask,
-                          uint facing);
+                          vector unsigned int mask);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 5f908159bb..22e51a86ae 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -275,15 +275,20 @@ emit_quad( int x, int y, mask_t mask)
 
          /* Execute per-fragment/quad operations, including:
           * alpha test, z test, stencil test, blend and framebuffer writing.
+          * Note that there are two different fragment operations functions
+          * that can be called, one for front-facing fragments, and one
+          * for back-facing fragments.  (Often the two are the same;
+          * but in some cases, like two-sided stenciling, they can be
+          * very different.)  So choose the correct function depending
+          * on the calculated facing.
           */
-         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                           fragZ,
                           outputs[0*4+0],
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask,
-                          setup.facing);
+                          mask);
       }
    }
 }
@@ -519,7 +524,14 @@ setup_sort_vertices(const struct vertex_header *v0,
 
    setup.oneOverArea = 1.0f / area;
 
-   /* The product of area * sign indicates front/back orientation (0/1) */
+   /* The product of area * sign indicates front/back orientation (0/1).
+    * Just in case someone gets the bright idea of switching the front
+    * and back constants without noticing that we're assuming their
+    * values in this operation, also assert that the values are
+    * what we think they are.
+    */
+   ASSERT(CELL_FACING_FRONT == 0);
+   ASSERT(CELL_FACING_BACK == 1);
    setup.facing = (area * sign > 0.0f)
       ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 
-- 
cgit v1.2.3


From 11fc390f6478526d4f0bdb4b7e628284da31b3b9 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 21 Nov 2008 11:42:14 -0700
Subject: CELL: use variant-length fragment ops programs

This is a set of changes that optimizes the memory use of fragment
operation programs (by using and transmitting only as much memory as is
needed for the fragment ops programs, instead of maximal sizes), as well
as eliminate the dependency on hard-coded maximal program sizes.  State
that is not dependent on fragment facing (i.e. that isn't using
two-sided stenciling) will only save and transmit a single
fragment operation program, instead of two identical programs.

- Added the ability to emit a LNOP (No Operation (Load)) instruction.
  This is used to pad the generated fragment operations programs to
  a multiple of 8 bytes, which is necessary for proper operation of
  the dual instruction pipeline, and also required for proper SPU-side
  decoding.

- Added the ability to allocate and manage a variant-length
  struct cell_command_fragment_ops.  This structure now puts the
  generated function field at the end, where it can be as large
  as necessary.

- On the PPU side, we now combine the generated front-facing and
  back-facing code into a single variant-length buffer (and only use one
  if the two sets of code are identical) for transmission to the SPU.

- On the SPU side, we pull the correct sizes out of the buffer,
  allocate a new code buffer if the one we have isn't large enough,
  and save the code to that buffer.  The buffer is deallocated when
  the SPU exits.

- Commented out the emit_fetch() static function, which was not being used.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |   7 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  11 ++-
 src/gallium/auxiliary/util/u_memory.h            |   2 +
 src/gallium/drivers/cell/common.h                |  31 +++++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c |   7 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |  77 ++++++++++++++--
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |   3 +
 src/gallium/drivers/cell/spu/spu_command.c       | 111 +++++++++++++++++------
 src/gallium/drivers/cell/spu/spu_command.h       |  32 ++++++-
 src/gallium/drivers/cell/spu/spu_main.c          |  15 +--
 src/gallium/drivers/cell/spu/spu_main.h          |   4 +-
 11 files changed, 232 insertions(+), 68 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 1bd9f1c8dd..b9a75ae559 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -341,7 +341,11 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-
+#define EMIT(_name, _op) \
+void _name (struct spe_function *p) \
+{ \
+   emit_RR(p, _op, 0, 0, 0, __FUNCTION__); \
+}
 
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
@@ -713,7 +717,6 @@ hbrr;
 #if 0
 stop;
 EMIT_RR  (spe_stopd, 0x140);
-EMIT_    (spe_lnop,  0x001);
 EMIT_    (spe_nop,   0x201);
 sync;
 EMIT_    (spe_dsync, 0x003);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7c211ffc51..f9ad2acacd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -99,7 +99,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 #endif /* RTASM_PPC_SPE_H */
 
-#ifndef EMIT_
+#ifndef EMIT
+#define EMIT(_name, _op) \
+    extern void _name (struct spe_function *p);
 #define EMIT_(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT);
 #define EMIT_R(_name, _op) \
@@ -129,7 +131,7 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT_I16(_name, _op) \
     extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
-#endif /* EMIT_ */
+#endif /* EMIT */
 
 
 /* Memory load / store instructions
@@ -294,6 +296,10 @@ EMIT_RI16(spe_brz,       0x040)
 EMIT_RI16(spe_brhnz,     0x046)
 EMIT_RI16(spe_brhz,      0x044)
 
+/* Control instructions
+ */
+EMIT     (spe_lnop,      0x001)
+
 extern void
 spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
 
@@ -418,6 +424,7 @@ EMIT_R   (spe_wrch,       0x10d)
 
 
 #ifdef UNDEF_EMIT_MACROS
+#undef EMIT
 #undef EMIT_
 #undef EMIT_R
 #undef EMIT_RR
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index 857102719d..1a6b596421 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -151,6 +151,8 @@ REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
 
 #define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
 
+#define CALLOC_VARIANT_LENGTH_STRUCT(T,more_size)   ((struct T *) CALLOC(1, sizeof(struct T) + more_size))
+
 
 /**
  * Return memory on given byte alignment
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index a670ed3c6e..98554d7f52 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -121,11 +121,6 @@
 #define CELL_DEBUG_CMD                  (1 << 5)
 #define CELL_DEBUG_CACHE                (1 << 6)
 
-/** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 128
-
-
-
 #define CELL_FENCE_IDLE      0
 #define CELL_FENCE_EMITTED   1
 #define CELL_FENCE_SIGNALLED 2
@@ -153,18 +148,36 @@ struct cell_command_fence
 
 /**
  * Command to specify per-fragment operations state and generated code.
- * Note that the dsa, blend, blend_color fields are really only needed
+ * Note that this is a variant-length structure, allocated with as 
+ * much memory as needed to hold the generated code; the "code"
+ * field *must* be the last field in the structure.  Also, the entire
+ * length of the structure (including the variant code field) must be
+ * a multiple of 8 bytes; we require that this structure itself be
+ * a multiple of 8 bytes, and that the generated code also be a multiple
+ * of 8 bytes.
+ *
+ * Also note that the dsa, blend, blend_color fields are really only needed
  * for the fallback/C per-pixel code.  They're not used when we generate
- * dynamic SPU fragment code (which is the normal case).
+ * dynamic SPU fragment code (which is the normal case), and will eventually
+ * be removed from this structure.
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+
+   /* Fields for the fallback case */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
-   unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS];
-   unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS];
+
+   /* Fields for the generated SPU code */
+   unsigned total_code_size;
+   unsigned front_code_index;
+   unsigned back_code_index;
+   /* this field has variant length, and must be the last field in 
+    * the structure
+    */
+   unsigned code[0];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 82336d6635..2c64eb1bcc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1776,7 +1776,10 @@ gen_stencil_depth_test(struct spe_function *f,
  * \param cell  the rendering context (in)
  * \param facing whether the generated code is for front-facing or 
  *              back-facing fragments
- * \param f     the generated function (out)
+ * \param f     the generated function (in/out); on input, the function
+ *              must already have been initialized.  On exit, whatever
+ *              instructions within the generated function have had
+ *              the fragment ops appended.
  */
 void
 cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f)
@@ -1808,8 +1811,6 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
-   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-
    if (cell->debug_flags & CELL_DEBUG_ASM) {
       spe_print_code(f, true);
       spe_indent(f, 8);
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 031b27f11f..0a0af81f53 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -76,30 +76,86 @@ lookup_fragment_ops(struct cell_context *cell)
     */
    if (!ops) {
       struct spe_function spe_code_front, spe_code_back;
+      unsigned int facing_dependent, total_code_size;
 
       if (0)
          debug_printf("**** Create New Fragment Ops\n");
 
-      /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-      spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      /* Prepare the buffer that will hold the generated code.  The
+       * "0" passed in for the size means that the SPE code will
+       * use a default size.
+       */
+      spe_init_func(&spe_code_front, 0);
+      spe_init_func(&spe_code_back, 0);
 
-      /* generate new code.  Always generate new code for both front-facing
+      /* Generate new code.  Always generate new code for both front-facing
        * and back-facing fragments, even if it's the same code in both
        * cases.
        */
       cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
       cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
 
-      /* alloc new fragment ops command */
-      ops = CALLOC_STRUCT(cell_command_fragment_ops);
+      /* Make sure the code is a multiple of 8 bytes long; this is
+       * required to ensure that the dual pipe instruction alignment
+       * is correct.  It's also important for the SPU unpacking,
+       * which assumes 8-byte boundaries.
+       */
+      unsigned int front_code_size = spe_code_size(&spe_code_front);
+      while (front_code_size % 8 != 0) {
+         spe_lnop(&spe_code_front);
+         front_code_size = spe_code_size(&spe_code_front);
+      }
+      unsigned int back_code_size = spe_code_size(&spe_code_back);
+      while (back_code_size % 8 != 0) {
+         spe_lnop(&spe_code_back);
+         back_code_size = spe_code_size(&spe_code_back);
+      }
 
+      /* Determine whether the code we generated is facing-dependent, by
+       * determining whether the generated code is different for the front-
+       * and back-facing fragments.
+       */
+      if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) {
+         /* Code is identical; only need one copy. */
+         facing_dependent = 0;
+         total_code_size = front_code_size;
+      }
+      else {
+         /* Code is different for front-facing and back-facing fragments.
+          * Need to send both copies.
+          */
+         facing_dependent = 1;
+         total_code_size = front_code_size + back_code_size;
+      }
+
+      /* alloc new fragment ops command.  Note that this structure
+       * has variant length based on the total code size required.
+       */
+      ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
       /* populate the new cell_command_fragment_ops object */
       ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front));
-      memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back));
+      ops->total_code_size = total_code_size;
+      ops->front_code_index = 0;
+      memcpy(ops->code, spe_code_front.store, front_code_size);
+      if (facing_dependent) {
+        /* We have separate front- and back-facing code.  Append the
+         * back-facing code to the buffer.  Be careful because the code
+         * size is in bytes, but the buffer is of unsigned elements.
+         */
+        ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]);
+        memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size);
+      }
+      else {
+        /* Use the same code for front- and back-facing fragments */
+        ops->back_code_index = ops->front_code_index;
+      }
+
+      /* Set the fields for the fallback case.  Note that these fields
+       * (and the whole fallback case) will eventually go away.
+       */
       ops->dsa = *cell->depth_stencil;
       ops->blend = *cell->blend;
+      ops->blend_color = cell->blend_color;
 
       /* insert cell_command_fragment_ops object into keymap/cache */
       util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
@@ -200,9 +256,10 @@ cell_emit_state(struct cell_context *cell)
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
       struct cell_command_fragment_ops *fops, *fops_cmd;
-      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+      /* Note that cell_command_fragment_ops is a variant-sized record */
       fops = lookup_fragment_ops(cell);
-      memcpy(fops_cmd, fops, sizeof(*fops));
+      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+      memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 18969005b0..9cba537d9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -145,6 +145,8 @@ emit_matrix_transpose(struct spe_function *p,
 }
 
 
+#if 0
+/* This appears to not be used currently */
 static void
 emit_fetch(struct spe_function *p,
 	   unsigned in_ptr, unsigned *offset,
@@ -256,6 +258,7 @@ emit_fetch(struct spe_function *p,
       spe_release_register(p, float_one);
    }
 }
+#endif
 
 
 void cell_update_vertex_fetch(struct draw_context *draw)
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d5faf4e3aa..8500d19754 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -210,45 +210,72 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
-   static int warned = 0;
-
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info (for fallback case only) */
+
+   /* Copy state info (for fallback case only - this will eventually
+    * go away when the fallback case goes away)
+    */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
    memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
 
-   /* Parity twist!  For now, always use the fallback code by default,
-    * only switching to codegen when specifically requested.  This
-    * allows us to develop freely without risking taking down the
-    * branch.
-    *
-    * Later, the parity of this check will be reversed, so that
-    * codegen is *always* used, unless we specifically indicate that
-    * we don't want it.
-    *
-    * Eventually, the option will be removed completely, because in
-    * final code we'll always use codegen and won't even provide the
-    * raw state records that the fallback code requires.
+   /* Make sure the SPU knows which buffers it's expected to read when
+    * it's told to pull tiles.
     */
-   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front;
-      spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back;
-   }
-   else {
-      /* otherwise, the default fallback code remains in place */
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
+
+   /* If we're forcing the fallback code to be used (for debug purposes),
+    * install that.  Otherwise install the incoming SPU code.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) {
+      static unsigned int warned = 0;
       if (!warned) {
          fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
          warned = 1;
       }
+      /* The following two lines aren't really necessary if you
+       * know the debug flags won't change during a run, and if you
+       * know that the function pointers are initialized correctly.
+       * We set them here to allow a person to change the debug
+       * flags during a run (from inside a debugger).
+       */
+      spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+      spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+      return;
    }
 
-   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
-}
+   /* Make sure the SPU code buffer is large enough to hold the incoming code.
+    * Note that we *don't* use align_malloc() and align_free(), because
+    * those utility functions are *not* available in SPU code.
+    * */
+   if (spu.fragment_ops_code_size < fops->total_code_size) {
+      if (spu.fragment_ops_code != NULL) {
+         free(spu.fragment_ops_code);
+      }
+      spu.fragment_ops_code_size = fops->total_code_size;
+      spu.fragment_ops_code = malloc(fops->total_code_size);
+      if (spu.fragment_ops_code == NULL) {
+         /* Whoops. */
+         fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size);
+         spu.fragment_ops_code = NULL;
+         spu.fragment_ops_code_size = 0;
+         spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+         spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+         return;
+      }
+   }
 
+   /* Copy the SPU code from the command buffer to the spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size);
+
+   /* Set the pointers for the front-facing and back-facing fragments
+    * to the specified offsets within the code.  Note that if the
+    * front-facing and back-facing code are the same, they'll have
+    * the same offset.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index];
+   spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index];
+}
 
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
@@ -588,7 +615,8 @@ cmd_batch(uint opcode)
             struct cell_command_fragment_ops *fops
                = (struct cell_command_fragment_ops *) &buffer[pos];
             cmd_state_fragment_ops(fops);
-            pos += sizeof(*fops) / 8;
+            /* This is a variant-sized command */
+            pos += (sizeof(*fops) + fops->total_code_size)/ 8;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_PROGRAM:
@@ -756,3 +784,32 @@ command_loop(void)
    if (spu.init.debug_flags & CELL_DEBUG_CACHE)
       spu_dcache_report();
 }
+
+/* Initialize this module; we manage the fragment ops buffer here. */
+void
+spu_command_init(void)
+{
+   /* Install default/fallback fragment processing function.
+    * This will normally be overriden by a code-gen'd function
+    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+
+   /* Set up the basic empty buffer for code-gen'ed fragment ops */
+   spu.fragment_ops_code = NULL;
+   spu.fragment_ops_code_size = 0;
+}
+
+void
+spu_command_close(void)
+{
+   /* Deallocate the code-gen buffer for fragment ops, and reset the
+    * fragment ops functions to their initial setting (just to leave
+    * things in a good state).
+    */
+   if (spu.fragment_ops_code != NULL) {
+      free(spu.fragment_ops_code);
+   }
+   spu_command_init();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
index 853e9aa549..83dcdade28 100644
--- a/src/gallium/drivers/cell/spu/spu_command.h
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -1,7 +1,35 @@
-
-
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 extern void
 command_loop(void);
 
+extern void
+spu_command_init(void);
 
+extern void
+spu_command_close(void);
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 7033f6037d..97c86d194d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -58,17 +58,8 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
-
-   /* Install default/fallback fragment processing function.
-    * This will normally be overriden by a code-gen'd function
-    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
-    */
-   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
-   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
 }
 
-
-
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -91,11 +82,11 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0);
+   ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0);
    ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
+   spu_command_init();
 
    D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
@@ -120,5 +111,7 @@ main(main_param_t speid, main_param_t argp)
 
    command_loop();
 
+   spu_command_close();
+
    return 0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 24cf7d77ce..33767e7c51 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -169,8 +169,8 @@ struct spu_global
    ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
-   uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-   uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   uint *fragment_ops_code;
+   uint fragment_ops_code_size;
    /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
    spu_fragment_ops_func fragment_ops[2];
 
-- 
cgit v1.2.3


From 402e6752b53d04af0bbfc5391547c2d127bce859 Mon Sep 17 00:00:00 2001
From: Jonathan Adamczewski <jadamcze@utas.edu.au>
Date: Mon, 12 Jan 2009 16:24:49 -0700
Subject: cell: allocate batch buffers w/ 16-byte alignment

Replace cell_batch{align,alloc)*() with cell_batch_alloc16(), allocating
multiples of 16 bytes that are 16 byte aligned.

Opcodes are stored in preferred slot of SPU machine word.

Various structures are explicitly padded to 16 byte multiples.

Added STATIC_ASSERT().
---
 src/gallium/drivers/cell/common.h              | 43 +++++++++++----
 src/gallium/drivers/cell/ppu/cell_batch.c      | 73 ++++----------------------
 src/gallium/drivers/cell/ppu/cell_batch.h      |  9 +---
 src/gallium/drivers/cell/ppu/cell_clear.c      |  5 +-
 src/gallium/drivers/cell/ppu/cell_flush.c      | 13 ++---
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 43 ++++++++-------
 src/gallium/drivers/cell/ppu/cell_vbuf.c       | 16 +++---
 src/gallium/drivers/cell/spu/spu_command.c     | 52 +++++++++---------
 8 files changed, 111 insertions(+), 143 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_command.c')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 98554d7f52..1f6860da11 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -49,6 +49,15 @@
    }
 
 
+
+#define JOIN(x, y) JOIN_AGAIN(x, y)
+#define JOIN_AGAIN(x, y) x ## y
+
+#define STATIC_ASSERT(e) \
+{typedef char JOIN(assertion_failed_at_line_, __LINE__) [(e) ? 1 : -1];}
+
+
+
 /** for sanity checking */
 #define ASSERT_ALIGN16(ptr) \
   ASSERT((((unsigned long) (ptr)) & 0xf) == 0);
@@ -134,6 +143,11 @@ struct cell_fence
    volatile uint status[CELL_MAX_SPUS][4];
 };
 
+#ifdef __SPU__
+typedef vector unsigned int opcode_t;
+#else
+typedef unsigned int opcode_t[4];
+#endif
 
 /**
  * Fence command sent to SPUs.  In response, the SPUs will write
@@ -141,8 +155,9 @@ struct cell_fence
  */
 struct cell_command_fence
 {
-   uint64_t opcode;      /**< CELL_CMD_FENCE */
+   opcode_t opcode;      /**< CELL_CMD_FENCE */
    struct cell_fence *fence;
+   uint32_t pad_[3];
 };
 
 
@@ -163,7 +178,7 @@ struct cell_command_fence
  */
 struct cell_command_fragment_ops
 {
-   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
 
    /* Fields for the fallback case */
    struct pipe_depth_stencil_alpha_state dsa;
@@ -189,8 +204,9 @@ struct cell_command_fragment_ops
  */
 struct cell_command_fragment_program
 {
-   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
+   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
    uint num_inst;        /**< Number of instructions */
+   uint32_t pad[3];
    unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
 };
 
@@ -200,10 +216,11 @@ struct cell_command_fragment_program
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
+   opcode_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
+   uint32_t pad_[2];
 };
 
 
@@ -212,7 +229,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_rasterizer
 {
-   uint64_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
+   opcode_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
    struct pipe_rasterizer_state rasterizer;
 };
 
@@ -222,9 +239,10 @@ struct cell_command_rasterizer
  */
 struct cell_command_clear_surface
 {
-   uint64_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
+   opcode_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
+   uint32_t pad[2];
 };
 
 
@@ -271,7 +289,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
-   uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
+   opcode_t opcode;       /**< CELL_CMD_VS_EXECUTE */
    uint64_t vOut[SPU_VERTS_PER_BATCH];
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -283,7 +301,7 @@ struct cell_command_vs
 
 struct cell_command_render
 {
-   uint64_t opcode;   /**< CELL_CMD_RENDER */
+   opcode_t opcode;   /**< CELL_CMD_RENDER */
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
@@ -292,27 +310,30 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint32_t pad_[1];
 };
 
 
 struct cell_command_release_verts
 {
-   uint64_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   opcode_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
    uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
+   uint32_t pad_[3];
 };
 
 
 struct cell_command_sampler
 {
-   uint64_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
+   opcode_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
    uint unit;
    struct pipe_sampler_state state;
+   uint32_t pad_[1];
 };
 
 
 struct cell_command_texture
 {
-   uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   opcode_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
    uint target;         /**< PIPE_TEXTURE_x */
    uint unit;
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 962775cd33..fe144f8b84 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -108,15 +108,16 @@ emit_fence(struct cell_context *cell)
       fence->status[i][0] = CELL_FENCE_EMITTED;
    }
 
+   STATIC_ASSERT(sizeof(struct cell_command_fence) % 16 == 0);
+   ASSERT(size % 16 == 0);
    ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
 
    fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
-   fence_cmd->opcode = CELL_CMD_FENCE;
+   fence_cmd->opcode[0] = CELL_CMD_FENCE;
    fence_cmd->fence = fence;
 
    /* update batch buffer size */
    cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
-   assert(sizeof(struct cell_command_fence) % 8 == 0);
 }
 
 
@@ -191,70 +192,19 @@ cell_batch_free_space(const struct cell_context *cell)
 }
 
 
-/**
- * Append data to the current batch buffer.
- * \param data  address of block of bytes to append
- * \param bytes  size of block of bytes
- */
-void
-cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
-{
-   uint size;
-
-   ASSERT(bytes % 8 == 0);
-   ASSERT(bytes <= CELL_BUFFER_SIZE);
-   ASSERT(cell->cur_batch >= 0);
-
-#ifdef ASSERT
-   {
-      uint spu;
-      for (spu = 0; spu < cell->num_spus; spu++) {
-         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
-                 == CELL_BUFFER_STATUS_USED);
-      }
-   }
-#endif
-
-   size = cell->buffer_size[cell->cur_batch];
-
-   if (bytes > cell_batch_free_space(cell)) {
-      cell_batch_flush(cell);
-      size = 0;
-   }
-
-   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
-
-   memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
-
-   cell->buffer_size[cell->cur_batch] = size + bytes;
-}
-
-
 /**
  * Allocate space in the current batch buffer for 'bytes' space.
+ * Bytes must be a multiple of 16 bytes.  Allocation will be 16 byte aligned.
  * \return address in batch buffer to put data
  */
 void *
-cell_batch_alloc(struct cell_context *cell, uint bytes)
-{
-   return cell_batch_alloc_aligned(cell, bytes, 1);
-}
-
-
-/**
- * Same as \sa cell_batch_alloc, but return an address at a particular
- * alignment.
- */
-void *
-cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
-                         uint alignment)
+cell_batch_alloc16(struct cell_context *cell, uint bytes)
 {
    void *pos;
-   uint size, padbytes;
+   uint size;
 
-   ASSERT(bytes % 8 == 0);
+   ASSERT(bytes % 16 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
-   ASSERT(alignment > 0);
    ASSERT(cell->cur_batch >= 0);
 
 #ifdef ASSERT
@@ -269,17 +219,12 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
 
    size = cell->buffer_size[cell->cur_batch];
 
-   padbytes = (alignment - (size % alignment)) % alignment;
-
-   if (padbytes + bytes > cell_batch_free_space(cell)) {
+   if (bytes > cell_batch_free_space(cell)) {
       cell_batch_flush(cell);
       size = 0;
    }
-   else {
-      size += padbytes;
-   }
 
-   ASSERT(size % alignment == 0);
+   ASSERT(size % 16 == 0);
    ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
    pos = (void *) (cell->buffer[cell->cur_batch] + size);
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h
index f74dd60079..290136031a 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.h
+++ b/src/gallium/drivers/cell/ppu/cell_batch.h
@@ -44,15 +44,8 @@ cell_batch_flush(struct cell_context *cell);
 extern uint
 cell_batch_free_space(const struct cell_context *cell);
 
-extern void
-cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
-
-extern void *
-cell_batch_alloc(struct cell_context *cell, uint bytes);
-
 extern void *
-cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
-                         uint alignment);
+cell_batch_alloc16(struct cell_context *cell, uint bytes);
 
 extern void
 cell_init_batch_buffers(struct cell_context *cell);
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index 037635e466..c2e276988c 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -99,10 +99,11 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
 
    /* Build a CLEAR command and place it in the current batch buffer */
    {
+      STATIC_ASSERT(sizeof(struct cell_command_clear_surface) % 16 == 0);
       struct cell_command_clear_surface *clr
          = (struct cell_command_clear_surface *)
-         cell_batch_alloc(cell, sizeof(*clr));
-      clr->opcode = CELL_CMD_CLEAR_SURFACE;
+         cell_batch_alloc16(cell, sizeof(*clr));
+      clr->opcode[0] = CELL_CMD_CLEAR_SURFACE;
       clr->surface = surfIndex;
       clr->value = clearValue;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index a64967b4b9..8275c9dc9c 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -72,8 +72,9 @@ cell_flush_int(struct cell_context *cell, unsigned flags)
    flushing = TRUE;
 
    if (flags & CELL_FLUSH_WAIT) {
-      uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t));
-      *cmd = CELL_CMD_FINISH;
+      STATIC_ASSERT(sizeof(opcode_t) % 16 == 0);
+      opcode_t *cmd = (opcode_t*) cell_batch_alloc16(cell, sizeof(opcode_t));
+      *cmd[0] = CELL_CMD_FINISH;
    }
 
    cell_batch_flush(cell);
@@ -101,11 +102,11 @@ void
 cell_flush_buffer_range(struct cell_context *cell, void *ptr,
 			unsigned size)
 {
-   uint64_t batch[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)];
-   struct cell_buffer_range *br = (struct cell_buffer_range *) & batch[1];
-
+   STATIC_ASSERT((sizeof(opcode_t) + sizeof(struct cell_buffer_range)) % 16 == 0);
+   uint32_t *batch = (uint32_t*)cell_batch_alloc16(cell, 
+      sizeof(opcode_t) + sizeof(struct cell_buffer_range));
+   struct cell_buffer_range *br = (struct cell_buffer_range *) &batch[4];
    batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE;
    br->base = (uintptr_t) ptr;
    br->size = size;
-   cell_batch_append(cell, batch, sizeof(batch));
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 0a0af81f53..39b85faeb8 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -133,7 +133,7 @@ lookup_fragment_ops(struct cell_context *cell)
        */
       ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
       /* populate the new cell_command_fragment_ops object */
-      ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      ops->opcode[0] = CELL_CMD_STATE_FRAGMENT_OPS;
       ops->total_code_size = total_code_size;
       ops->front_code_index = 0;
       memcpy(ops->code, spe_code_front.store, front_code_size);
@@ -178,10 +178,10 @@ static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
 {
-   uint64_t *dst = (uint64_t *) 
-       cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size));
+   uint32_t *dst = (uint32_t *) 
+       cell_batch_alloc16(cell, ROUNDUP16(sizeof(opcode_t) + state_size));
    *dst = cmd;
-   memcpy(dst + 1, state, state_size);
+   memcpy(dst + 4, state, state_size);
 }
 
 
@@ -195,9 +195,10 @@ cell_emit_state(struct cell_context *cell)
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
+      STATIC_ASSERT(sizeof(struct cell_command_framebuffer) % 16 == 0);
       struct cell_command_framebuffer *fb
-         = cell_batch_alloc(cell, sizeof(*fb));
-      fb->opcode = CELL_CMD_STATE_FRAMEBUFFER;
+         = cell_batch_alloc16(cell, sizeof(*fb));
+      fb->opcode[0] = CELL_CMD_STATE_FRAMEBUFFER;
       fb->color_start = cell->cbuf_map[0];
       fb->color_format = cbuf->format;
       fb->depth_start = cell->zsbuf_map;
@@ -211,17 +212,19 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+      STATIC_ASSERT(sizeof(struct cell_command_rasterizer) % 16 == 0);
       struct cell_command_rasterizer *rast =
-         cell_batch_alloc(cell, sizeof(*rast));
-      rast->opcode = CELL_CMD_STATE_RASTERIZER;
+         cell_batch_alloc16(cell, sizeof(*rast));
+      rast->opcode[0] = CELL_CMD_STATE_RASTERIZER;
       rast->rasterizer = *cell->rasterizer;
    }
 
    if (cell->dirty & (CELL_NEW_FS)) {
       /* Send new fragment program to SPUs */
+      STATIC_ASSERT(sizeof(struct cell_command_fragment_program) % 16 == 0);
       struct cell_command_fragment_program *fp
-            = cell_batch_alloc(cell, sizeof(*fp));
-      fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM;
+            = cell_batch_alloc16(cell, sizeof(*fp));
+      fp->opcode[0] = CELL_CMD_STATE_FRAGMENT_PROGRAM;
       fp->num_inst = cell->fs->code.num_inst;
       memcpy(&fp->code, cell->fs->code.store,
              SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
@@ -238,14 +241,14 @@ cell_emit_state(struct cell_context *cell)
       const uint shader = PIPE_SHADER_FRAGMENT;
       const uint num_const = cell->constants[shader].size / sizeof(float);
       uint i, j;
-      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
-      uint64_t *ibuf = (uint64_t *) buf;
+      float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float)));
+      uint32_t *ibuf = (uint32_t *) buf;
       const float *constants = pipe_buffer_map(cell->pipe.screen,
                                                cell->constants[shader].buffer,
                                                PIPE_BUFFER_USAGE_CPU_READ);
       ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
-      ibuf[1] = num_const;
-      j = 4;
+      ibuf[4] = num_const;
+      j = 8;
       for (i = 0; i < num_const; i++) {
          buf[j++] = constants[i];
       }
@@ -258,7 +261,7 @@ cell_emit_state(struct cell_context *cell)
       struct cell_command_fragment_ops *fops, *fops_cmd;
       /* Note that cell_command_fragment_ops is a variant-sized record */
       fops = lookup_fragment_ops(cell);
-      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+      fops_cmd = cell_batch_alloc16(cell, ROUNDUP16(sizeof(*fops_cmd) + fops->total_code_size));
       memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
    }
 
@@ -267,9 +270,10 @@ cell_emit_state(struct cell_context *cell)
       for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
          if (cell->dirty_samplers & (1 << i)) {
             if (cell->sampler[i]) {
+               STATIC_ASSERT(sizeof(struct cell_command_sampler) % 16 == 0);
                struct cell_command_sampler *sampler
-                  = cell_batch_alloc(cell, sizeof(*sampler));
-               sampler->opcode = CELL_CMD_STATE_SAMPLER;
+                  = cell_batch_alloc16(cell, sizeof(*sampler));
+               sampler->opcode[0] = CELL_CMD_STATE_SAMPLER;
                sampler->unit = i;
                sampler->state = *cell->sampler[i];
             }
@@ -282,9 +286,10 @@ cell_emit_state(struct cell_context *cell)
       uint i;
       for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
          if (cell->dirty_textures & (1 << i)) {
+            STATIC_ASSERT(sizeof(struct cell_command_texture) % 16 == 0);
             struct cell_command_texture *texture
-               =  cell_batch_alloc(cell, sizeof(*texture));
-            texture->opcode = CELL_CMD_STATE_TEXTURE;
+               =  (struct cell_command_texture *)cell_batch_alloc16(cell, sizeof(*texture));
+            texture->opcode[0] = CELL_CMD_STATE_TEXTURE;
             texture->unit = i;
             if (cell->texture[i]) {
                uint level;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index 65ba51b6bb..ab54e79689 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -116,10 +116,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
 
    /* Tell SPUs they can release the vert buf */
    if (cvbr->vertex_buf != ~0U) {
+      STATIC_ASSERT(sizeof(struct cell_command_release_verts) % 16 == 0);
       struct cell_command_release_verts *release
          = (struct cell_command_release_verts *)
-         cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
-      release->opcode = CELL_CMD_RELEASE_VERTS;
+         cell_batch_alloc16(cell, sizeof(struct cell_command_release_verts));
+      release->opcode[0] = CELL_CMD_RELEASE_VERTS;
       release->vertex_buf = cvbr->vertex_buf;
    }
 
@@ -210,15 +211,16 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
    /* build/insert batch RENDER command */
    {
-      const uint index_bytes = ROUNDUP8(nr_indices * 2);
-      const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
+      const uint index_bytes = ROUNDUP16(nr_indices * 2);
+      const uint vertex_bytes = ROUNDUP16(nr_vertices * 4 * cell->vertex_info.size);
+      STATIC_ASSERT(sizeof(struct cell_command_render) % 16 == 0);
       const uint batch_size = sizeof(struct cell_command_render) + index_bytes;
 
       struct cell_command_render *render
          = (struct cell_command_render *)
-         cell_batch_alloc(cell, batch_size);
+         cell_batch_alloc16(cell, batch_size);
 
-      render->opcode = CELL_CMD_RENDER;
+      render->opcode[0] = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
@@ -236,7 +238,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
           min_index == 0 &&
           vertex_bytes + 16 <= cell_batch_free_space(cell)) {
          /* vertex data inlined, after indices, at 16-byte boundary */
-         void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16);
+         void *dst = cell_batch_alloc16(cell, vertex_bytes);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
          render->vertex_buf = ~0;
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 8500d19754..5c0179d954 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -292,10 +292,10 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 
 
 static uint
-cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+cmd_state_fs_constants(const qword *buffer, uint pos)
 {
-   const uint num_const = buffer[pos + 1];
-   const float *constants = (const float *) &buffer[pos + 2];
+   const uint num_const = spu_extract((vector unsigned int)buffer[pos+1], 0);
+   const float *constants = (const float *) &buffer[pos+2];
    uint i;
 
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
@@ -306,8 +306,8 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
       spu.constants[i] = spu_splats(constants[i]);
    }
 
-   /* return new buffer pos (in 8-byte words) */
-   return pos + 2 + num_const / 2;
+   /* return new buffer pos (in 16-byte words) */
+   return pos + 2 + (ROUNDUP16(num_const * sizeof(float)) / 16);
 }
 
 
@@ -547,8 +547,8 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
-   const unsigned usize = size / sizeof(buffer[0]);
+   qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB;
+   const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]);
    uint pos;
 
    D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
@@ -578,7 +578,7 @@ cmd_batch(uint opcode)
     * Loop over commands in the batch buffer
     */
    for (pos = 0; pos < usize; /* no incr */) {
-      switch (buffer[pos]) {
+      switch (si_to_uint(buffer[pos])) {
       /*
        * rendering commands
        */
@@ -587,7 +587,7 @@ cmd_batch(uint opcode)
             struct cell_command_clear_surface *clr
                = (struct cell_command_clear_surface *) &buffer[pos];
             cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 8;
+            pos += sizeof(*clr) / 16;
          }
          break;
       case CELL_CMD_RENDER:
@@ -596,7 +596,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += pos_incr;
+            pos += ((pos_incr+1)&~1) / 2; // should 'fix' cmd_render return
          }
          break;
       /*
@@ -607,7 +607,7 @@ cmd_batch(uint opcode)
             struct cell_command_framebuffer *fb
                = (struct cell_command_framebuffer *) &buffer[pos];
             cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
+            pos += sizeof(*fb) / 16;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_OPS:
@@ -616,7 +616,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_fragment_ops *) &buffer[pos];
             cmd_state_fragment_ops(fops);
             /* This is a variant-sized command */
-            pos += (sizeof(*fops) + fops->total_code_size)/ 8;
+            pos += ROUNDUP16(sizeof(*fops) + fops->total_code_size) / 16;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_PROGRAM:
@@ -624,7 +624,7 @@ cmd_batch(uint opcode)
             struct cell_command_fragment_program *fp
                = (struct cell_command_fragment_program *) &buffer[pos];
             cmd_state_fragment_program(fp);
-            pos += sizeof(*fp) / 8;
+            pos += sizeof(*fp) / 16;
          }
          break;
       case CELL_CMD_STATE_FS_CONSTANTS:
@@ -635,7 +635,7 @@ cmd_batch(uint opcode)
             struct cell_command_rasterizer *rast =
                (struct cell_command_rasterizer *) &buffer[pos];
             spu.rasterizer = rast->rasterizer;
-            pos += sizeof(*rast) / 8;
+            pos += sizeof(*rast) / 16;
          }
          break;
       case CELL_CMD_STATE_SAMPLER:
@@ -643,7 +643,7 @@ cmd_batch(uint opcode)
             struct cell_command_sampler *sampler
                = (struct cell_command_sampler *) &buffer[pos];
             cmd_state_sampler(sampler);
-            pos += sizeof(*sampler) / 8;
+            pos += sizeof(*sampler) / 16;
          }
          break;
       case CELL_CMD_STATE_TEXTURE:
@@ -651,37 +651,37 @@ cmd_batch(uint opcode)
             struct cell_command_texture *texture
                = (struct cell_command_texture *) &buffer[pos];
             cmd_state_texture(texture);
-            pos += sizeof(*texture) / 8;
+            pos += sizeof(*texture) / 16;
          }
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct vertex_info)) / 16;
          break;
       case CELL_CMD_STATE_VIEWPORT:
          (void) memcpy(& draw.viewport, &buffer[pos+1],
                        sizeof(struct pipe_viewport_state));
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct pipe_viewport_state)) / 16;
          break;
       case CELL_CMD_STATE_UNIFORMS:
-         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+         draw.constants = (const float (*)[4]) (uintptr_t)spu_extract((vector unsigned int)buffer[pos+1],0);
          pos += 2;
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_array_info)) / 16;
          break;
       case CELL_CMD_STATE_BIND_VS:
 #if 0
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
 #endif
-         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_shader_info)) / 16;
          break;
       case CELL_CMD_STATE_ATTRIB_FETCH:
          cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_attribute_fetch_code)) / 16;
          break;
       /*
        * misc commands
@@ -695,7 +695,7 @@ cmd_batch(uint opcode)
             struct cell_command_fence *fence_cmd =
                (struct cell_command_fence *) &buffer[pos];
             cmd_fence(fence_cmd);
-            pos += sizeof(*fence_cmd) / 8;
+            pos += sizeof(*fence_cmd) / 16;
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -703,7 +703,7 @@ cmd_batch(uint opcode)
             struct cell_command_release_verts *release
                = (struct cell_command_release_verts *) &buffer[pos];
             cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
+            pos += sizeof(*release) / 16;
          }
          break;
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
@@ -711,11 +711,11 @@ cmd_batch(uint opcode)
 	     &buffer[pos+1];
 
 	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_buffer_range)) / 16;
 	 break;
       }
       default:
-         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, si_to_uint(buffer[pos]));
          ASSERT(0);
          break;
       }
-- 
cgit v1.2.3