71 files changed, 17154 insertions, 0 deletions
diff --git a/src/gallium/drivers/cell/Makefile b/src/gallium/drivers/cell/Makefile
new file mode 100644
index 0000000000..47aef7b05f
--- /dev/null
+++ b/src/gallium/drivers/cell/Makefile
@@ -0,0 +1,12 @@
+# Cell Gallium driver Makefile
+
+
+default:
+	( cd spu ; make )
+	( cd ppu ; make )
+
+
+
+clean:
+	( cd spu ; make clean )
+	( cd ppu ; make clean )
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
new file mode 100644
index 0000000000..5dc756023f
--- /dev/null
+++ b/src/gallium/drivers/cell/common.h
@@ -0,0 +1,302 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Types and tokens which are common to the SPU and PPU code.
+ */
+
+
+#ifndef CELL_COMMON_H
+#define CELL_COMMON_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+
+
+/** The standard assert macro doesn't seem to work reliably */
+#define ASSERT(x) \
+   if (!(x)) { \
+      ubyte *p = NULL; \
+      fprintf(stderr, "%s:%d: %s(): assertion %s failed.\n", \
+              __FILE__, __LINE__, __FUNCTION__, #x);             \
+      *p = 0; \
+      exit(1); \
+   }
+
+
+/** for sanity checking */
+#define ASSERT_ALIGN16(ptr) \
+  ASSERT((((unsigned long) (ptr)) & 0xf) == 0);
+
+
+/** round up value to next multiple of 4 */
+#define ROUNDUP4(k)  (((k) + 0x3) & ~0x3)
+
+/** round up value to next multiple of 8 */
+#define ROUNDUP8(k)  (((k) + 0x7) & ~0x7)
+
+/** round up value to next multiple of 16 */
+#define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
+
+
+#define CELL_MAX_SPUS 6
+
+#define CELL_MAX_SAMPLERS 4
+
+#define TILE_SIZE 32
+
+
+/**
+ * The low byte of a mailbox word contains the command opcode.
+ * Remaining higher bytes are command specific.
+ */
+#define CELL_CMD_OPCODE_MASK 0xff
+
+#define CELL_CMD_EXIT                 1
+#define CELL_CMD_CLEAR_SURFACE        2
+#define CELL_CMD_FINISH               3
+#define CELL_CMD_RENDER               4
+#define CELL_CMD_BATCH                5
+#define CELL_CMD_RELEASE_VERTS        6
+#define CELL_CMD_STATE_FRAMEBUFFER   10
+#define CELL_CMD_STATE_FRAGMENT_OPS  11
+#define CELL_CMD_STATE_SAMPLER       12
+#define CELL_CMD_STATE_TEXTURE       13
+#define CELL_CMD_STATE_VERTEX_INFO   14
+#define CELL_CMD_STATE_VIEWPORT      15
+#define CELL_CMD_STATE_UNIFORMS      16
+#define CELL_CMD_STATE_VS_ARRAY_INFO 17
+#define CELL_CMD_STATE_BIND_VS       18
+#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
+#define CELL_CMD_STATE_ATTRIB_FETCH  20
+#define CELL_CMD_STATE_FS_CONSTANTS  21
+#define CELL_CMD_VS_EXECUTE          22
+#define CELL_CMD_FLUSH_BUFFER_RANGE  23
+
+
+#define CELL_NUM_BUFFERS 4
+#define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */
+
+#define CELL_BUFFER_STATUS_FREE 10
+#define CELL_BUFFER_STATUS_USED 20
+
+#define CELL_DEBUG_CHECKER              (1 << 0)
+#define CELL_DEBUG_ASM                  (1 << 1)
+#define CELL_DEBUG_SYNC                 (1 << 2)
+#define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
+#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
+
+/** Max instructions for doing per-fragment operations */
+#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+
+
+/**
+ * Command to specify per-fragment operations state and generated code.
+ */
+struct cell_command_fragment_ops
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend;
+   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
+};
+
+
+/** Max instructions for fragment programs */
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512
+
+/**
+ * Command to send a fragment program to SPUs.
+ */
+struct cell_command_fragment_program
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
+   uint num_inst;        /**< Number of instructions */
+   unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+};
+
+
+/**
+ * Tell SPUs about the framebuffer size, location
+ */
+struct cell_command_framebuffer
+{
+   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
+   int width, height;
+   void *color_start, *depth_start;
+   enum pipe_format color_format, depth_format;
+};
+
+
+/**
+ * Clear framebuffer to the given value/color.
+ */
+struct cell_command_clear_surface
+{
+   uint64_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
+   uint surface; /**< Temporary: 0=color, 1=Z */
+   uint value;
+};
+
+
+/**
+ * Array info used by the vertex shader's vertex puller.
+ */
+struct cell_array_info
+{
+   uint64_t base;      /**< Base address of the 0th element. */
+   uint attr;          /**< Attribute that this state is for. */
+   uint pitch;         /**< Byte pitch from one entry to the next. */
+   uint size;
+   uint function_offset;
+};
+
+
+struct cell_attribute_fetch_code
+{
+   uint64_t base;
+   uint size;
+};
+
+
+struct cell_buffer_range
+{
+   uint64_t base;
+   unsigned size;
+};
+
+
+struct cell_shader_info
+{
+   uint64_t declarations;
+   uint64_t instructions;
+   uint64_t  immediates;
+
+   unsigned num_outputs;
+   unsigned num_declarations;
+   unsigned num_instructions;
+   unsigned num_immediates;
+};
+
+
+#define SPU_VERTS_PER_BATCH 64
+struct cell_command_vs
+{
+   uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
+   uint64_t vOut[SPU_VERTS_PER_BATCH];
+   unsigned num_elts;
+   unsigned elts[SPU_VERTS_PER_BATCH];
+   float plane[12][4];
+   unsigned nr_planes;
+   unsigned nr_attrs;
+};
+
+
+struct cell_command_render
+{
+   uint64_t opcode;   /**< CELL_CMD_RENDER */
+   uint prim_type;    /**< PIPE_PRIM_x */
+   uint num_verts;
+   uint vertex_size;  /**< bytes per vertex */
+   uint num_indexes;
+   uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
+   float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
+   uint min_index;
+   boolean inline_verts;
+   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
+};
+
+
+struct cell_command_release_verts
+{
+   uint64_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
+};
+
+
+struct cell_command_sampler
+{
+   uint64_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
+   uint unit;
+   struct pipe_sampler_state state;
+};
+
+
+struct cell_command_texture
+{
+   uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint unit;
+   void *start;         /**< Address in main memory */
+   ushort width, height;
+};
+
+
+/** XXX unions don't seem to work */
+/* XXX this should go away; all commands should be placed in batch buffers */
+struct cell_command
+{
+#if 0
+   struct cell_command_framebuffer fb;
+   struct cell_command_clear_surface clear;
+   struct cell_command_render render;
+#endif
+   struct cell_command_vs vs;
+} ALIGN16_ATTRIB;
+
+
+#define MAX_SPU_FUNCTIONS 12
+/**
+ * Used to tell the PPU about the address of particular functions in the
+ * SPU's address space.
+ */
+struct cell_spu_function_info
+{
+   uint num;
+   char names[MAX_SPU_FUNCTIONS][16];
+   uint addrs[MAX_SPU_FUNCTIONS];
+   char pad[12];   /**< Pad struct to multiple of 16 bytes (256 currently) */
+};
+
+
+/** This is the object passed to spe_create_thread() */
+struct cell_init_info
+{
+   unsigned id;
+   unsigned num_spus;
+   unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
+   struct cell_command *cmd;
+
+   /** Buffers for command batches, vertex/index data */
+   ubyte *buffers[CELL_NUM_BUFFERS];
+   uint *buffer_status;  /**< points at cell_context->buffer_status */
+
+   struct cell_spu_function_info *spu_functions;
+} ALIGN16_ATTRIB;
+
+
+#endif /* CELL_COMMON_H */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
new file mode 100644
index 0000000000..b28f4c5c31
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -0,0 +1,83 @@
+# Gallium3D Cell driver: PPU code
+
+# This makefile builds the libcell.a library which gets pulled into
+# the main libGL.so library
+
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+
+# This is the "top-level" cell PPU driver code, will get pulled into libGL.so
+# by the winsys Makefile.
+CELL_LIB = ../libcell.a
+
+
+# This is the SPU code.  We'd like to be able to put this into the libcell.a
+# archive with the PPU code, but nesting .a libs doesn't seem to work.
+# So, it's pulled into libGL.so in gallium/winsys/xlib/Makefile
+SPU_CODE_MODULE = ../spu/g3d_spu.a
+
+
+SOURCES = \
+	cell_batch.c \
+	cell_clear.c \
+	cell_context.c \
+	cell_draw_arrays.c \
+	cell_flush.c \
+	cell_gen_fragment.c \
+	cell_gen_fp.c \
+	cell_state_derived.c \
+	cell_state_emit.c \
+	cell_state_shader.c \
+	cell_pipe_state.c \
+	cell_screen.c \
+	cell_state_vertex.c \
+	cell_spu.c \
+	cell_surface.c \
+	cell_texture.c \
+	cell_vbuf.c \
+	cell_vertex_fetch.c \
+	cell_vertex_shader.c \
+	cell_winsys.c
+
+
+OBJECTS = $(SOURCES:.c=.o) \
+
+INCLUDE_DIRS = \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/gallium/drivers
+
+.c.o:
+	$(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@
+
+
+
+default: $(CELL_LIB)
+
+
+$(CELL_LIB): $(OBJECTS) $(SPU_CODE_MODULE)
+#	ar -ru $(CELL_LIB) $(OBJECTS) $(SPU_CODE_MODULE) # doesn't work
+	ar -ru $(CELL_LIB) $(OBJECTS)
+
+#$(PROG): $(PPU_OBJECTS)
+#	$(CC) -o $(PROG) $(PPU_OBJECTS) $(SPU_CODE_MODULE) $(PPU_LFLAGS)
+
+
+
+clean:
+	rm -f *.o *~ $(CELL_LIB)
+
+
+
+depend: $(SOURCES)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null
+
+include depend
+
+
+
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
new file mode 100644
index 0000000000..16882c0129
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -0,0 +1,267 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_spu.h"
+
+
+
+/**
+ * Search the buffer pool for an empty/free buffer and return its index.
+ * Buffers are used for storing vertex data, state and commands which
+ * will be sent to the SPUs.
+ * If no empty buffers are available, wait for one.
+ * \return buffer index in [0, CELL_NUM_BUFFERS-1]
+ */
+uint
+cell_get_empty_buffer(struct cell_context *cell)
+{
+   uint buf = 0, tries = 0;
+
+   /* Find a buffer that's marked as free by all SPUs */
+   while (1) {
+      uint spu, num_free = 0;
+
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) {
+            num_free++;
+
+            if (num_free == cell->num_spus) {
+               /* found a free buffer, now mark status as used */
+               for (spu = 0; spu < cell->num_spus; spu++) {
+                  cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
+               }
+               /*
+               printf("PPU: ALLOC BUFFER %u\n", buf);
+               */
+               return buf;
+            }
+         }
+         else {
+            break;
+         }
+      }
+
+      /* try next buf */
+      buf = (buf + 1) % CELL_NUM_BUFFERS;
+
+      tries++;
+      if (tries == 100) {
+         /*
+         printf("PPU WAITING for buffer...\n");
+         */
+      }
+   }
+}
+
+
+/**
+ * Flush the current batch buffer to the SPUs.
+ * An empty buffer will be found and set as the new current batch buffer
+ * for subsequent commands/data.
+ */
+void
+cell_batch_flush(struct cell_context *cell)
+{
+   static boolean flushing = FALSE;
+   uint batch = cell->cur_batch;
+   const uint size = cell->buffer_size[batch];
+   uint spu, cmd_word;
+
+   assert(!flushing);
+
+   if (size == 0)
+      return;
+
+   flushing = TRUE;
+
+   assert(batch < CELL_NUM_BUFFERS);
+
+   /*
+   printf("cell_batch_dispatch: buf %u at %p, size %u\n",
+          batch, &cell->buffer[batch][0], size);
+   */
+     
+   /*
+    * Build "BATCH" command and send to all SPUs.
+    */
+   cmd_word = CELL_CMD_BATCH | (batch << 8) | (size << 16);
+
+   for (spu = 0; spu < cell->num_spus; spu++) {
+      assert(cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_USED);
+      send_mbox_message(cell_global.spe_contexts[spu], cmd_word);
+   }
+
+   /* When the SPUs are done copying the buffer into their locals stores
+    * they'll write a BUFFER_STATUS_FREE message into the buffer_status[]
+    * array indicating that the PPU can re-use the buffer.
+    */
+
+   batch = cell_get_empty_buffer(cell);
+
+   cell->buffer_size[batch] = 0;  /* empty */
+   cell->cur_batch = batch;
+
+   flushing = FALSE;
+}
+
+
+/**
+ * Return the number of bytes free in the current batch buffer.
+ */
+uint
+cell_batch_free_space(const struct cell_context *cell)
+{
+   uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
+   return free;
+}
+
+
+/**
+ * Append data to the current batch buffer.
+ * \param data  address of block of bytes to append
+ * \param bytes  size of block of bytes
+ */
+void
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
+{
+   uint size;
+
+   ASSERT(bytes % 8 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
+   ASSERT(cell->cur_batch >= 0);
+
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
+
+   size = cell->buffer_size[cell->cur_batch];
+
+   if (size + bytes > CELL_BUFFER_SIZE) {
+      cell_batch_flush(cell);
+      size = 0;
+   }
+
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
+
+   memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
+
+   cell->buffer_size[cell->cur_batch] = size + bytes;
+}
+
+
+/**
+ * Allocate space in the current batch buffer for 'bytes' space.
+ * \return address in batch buffer to put data
+ */
+void *
+cell_batch_alloc(struct cell_context *cell, uint bytes)
+{
+   return cell_batch_alloc_aligned(cell, bytes, 1);
+}
+
+
+/**
+ * Same as \sa cell_batch_alloc, but return an address at a particular
+ * alignment.
+ */
+void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment)
+{
+   void *pos;
+   uint size, padbytes;
+
+   ASSERT(bytes % 8 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
+   ASSERT(alignment > 0);
+   ASSERT(cell->cur_batch >= 0);
+
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
+
+   size = cell->buffer_size[cell->cur_batch];
+
+   padbytes = (alignment - (size % alignment)) % alignment;
+
+   if (padbytes + size + bytes > CELL_BUFFER_SIZE) {
+      cell_batch_flush(cell);
+      size = 0;
+   }
+   else {
+      size += padbytes;
+   }
+
+   ASSERT(size % alignment == 0);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
+
+   pos = (void *) (cell->buffer[cell->cur_batch] + size);
+
+   cell->buffer_size[cell->cur_batch] = size + bytes;
+
+   return pos;
+}
+
+
+/**
+ * One-time init of batch buffers.
+ */
+void
+cell_init_batch_buffers(struct cell_context *cell)
+{
+   uint spu, buf;
+
+   /* init command, vertex/index buffer info */
+   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) {
+      cell->buffer_size[buf] = 0;
+
+      /* init batch buffer status values,
+       * mark 0th buffer as used, rest as free.
+       */
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         if (buf == 0)
+            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
+         else
+            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_FREE;
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h
new file mode 100644
index 0000000000..f74dd60079
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_batch.h
@@ -0,0 +1,61 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_BATCH_H
+#define CELL_BATCH_H
+
+#include "pipe/p_compiler.h"
+
+
+struct cell_context;
+
+
+extern uint
+cell_get_empty_buffer(struct cell_context *cell);
+
+extern void
+cell_batch_flush(struct cell_context *cell);
+
+extern uint
+cell_batch_free_space(const struct cell_context *cell);
+
+extern void
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
+
+extern void *
+cell_batch_alloc(struct cell_context *cell, uint bytes);
+
+extern void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment);
+
+extern void
+cell_init_batch_buffers(struct cell_context *cell);
+
+
+#endif /* CELL_BATCH_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
new file mode 100644
index 0000000000..c9c0c721bb
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -0,0 +1,109 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Authors
+ *  Brian Paul
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <stdint.h>
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+#include "cell/common.h"
+#include "cell_clear.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_flush.h"
+#include "cell_spu.h"
+#include "cell_state.h"
+
+
+/**
+ * Convert packed pixel from one format to another.
+ */
+static unsigned
+convert_color(enum pipe_format srcFormat, unsigned srcColor,
+              enum pipe_format dstFormat)
+{
+   ubyte r, g, b, a;
+   unsigned dstColor;
+
+   util_unpack_color_ub(srcFormat, &srcColor, &r, &g, &b, &a);
+   util_pack_color_ub(r, g, b, a, dstFormat, &dstColor);
+
+   return dstColor;
+}
+
+
+
+/**
+ * Called via pipe->clear()
+ */
+void
+cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
+                   unsigned clearValue)
+{
+   struct pipe_screen *screen = pipe->screen;
+   struct cell_context *cell = cell_context(pipe);
+   uint surfIndex;
+
+   if (cell->dirty)
+      cell_update_derived(cell);
+
+
+   if (!cell->cbuf_map[0])
+      cell->cbuf_map[0] = screen->surface_map(screen, ps,
+                                              PIPE_BUFFER_USAGE_GPU_WRITE);
+
+   if (ps == cell->framebuffer.zsbuf) {
+      /* clear z/stencil buffer */
+      surfIndex = 1;
+   }
+   else {
+      /* clear color buffer */
+      surfIndex = 0;
+
+      if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) {
+         clearValue = convert_color(PIPE_FORMAT_A8R8G8B8_UNORM, clearValue,
+                                    ps->format);
+      }
+   }
+
+
+   /* Build a CLEAR command and place it in the current batch buffer */
+   {
+      struct cell_command_clear_surface *clr
+         = (struct cell_command_clear_surface *)
+         cell_batch_alloc(cell, sizeof(*clr));
+      clr->opcode = CELL_CMD_CLEAR_SURFACE;
+      clr->surface = surfIndex;
+      clr->value = clearValue;
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.h b/src/gallium/drivers/cell/ppu/cell_clear.h
new file mode 100644
index 0000000000..ff47d43f4c
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_clear.h
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_CLEAR_H
+#define CELL_CLEAR_H
+
+
+struct pipe_context;
+struct pipe_surface;
+
+
+extern void
+cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
+                   unsigned clearValue);
+
+
+
+#endif /* CELL_CLEAR_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
new file mode 100644
index 0000000000..30ce6f9762
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -0,0 +1,166 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Authors
+ *  Brian Paul
+ */
+
+
+#include <stdio.h>
+
+#include "pipe/p_defines.h"
+#include "pipe/p_format.h"
+#include "util/u_memory.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_screen.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+
+#include "cell/common.h"
+#include "cell_batch.h"
+#include "cell_clear.h"
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_flush.h"
+#include "cell_state.h"
+#include "cell_surface.h"
+#include "cell_spu.h"
+#include "cell_pipe_state.h"
+#include "cell_texture.h"
+#include "cell_vbuf.h"
+
+
+
+static void
+cell_destroy_context( struct pipe_context *pipe )
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   util_delete_keymap(cell->fragment_ops_cache, NULL);
+
+   cell_spu_exit(cell);
+
+   align_free(cell);
+}
+
+
+static struct draw_context *
+cell_draw_create(struct cell_context *cell)
+{
+   struct draw_context *draw = draw_create();
+
+#if 0 /* broken */
+   if (getenv("GALLIUM_CELL_VS")) {
+      /* plug in SPU-based vertex transformation code */
+      draw->shader_queue_flush = cell_vertex_shader_queue_flush;
+      draw->driver_private = cell;
+   }
+#endif
+
+   return draw;
+}
+
+
+static const struct debug_named_value cell_debug_flags[] = {
+   {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+   {"asm", CELL_DEBUG_ASM},        /**< dump SPU asm code */
+   {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
+   {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
+   {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
+   {NULL, 0}
+};
+
+
+struct pipe_context *
+cell_create_context(struct pipe_screen *screen,
+                    struct cell_winsys *cws)
+{
+   struct cell_context *cell;
+
+   /* some fields need to be 16-byte aligned, so align the whole object */
+   cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
+   if (!cell)
+      return NULL;
+
+   memset(cell, 0, sizeof(*cell));
+
+   cell->winsys = cws;
+   cell->pipe.winsys = screen->winsys;
+   cell->pipe.screen = screen;
+   cell->pipe.destroy = cell_destroy_context;
+
+   cell->pipe.clear = cell_clear_surface;
+   cell->pipe.flush = cell_flush;
+
+#if 0
+   cell->pipe.begin_query = cell_begin_query;
+   cell->pipe.end_query = cell_end_query;
+   cell->pipe.wait_query = cell_wait_query;
+#endif
+
+   cell_init_draw_functions(cell);
+   cell_init_state_functions(cell);
+   cell_init_shader_functions(cell);
+   cell_init_surface_functions(cell);
+   cell_init_texture_functions(cell);
+   cell_init_vertex_functions(cell);
+
+   cell->draw = cell_draw_create(cell);
+
+   /* Create cache of fragment ops generated code */
+   cell->fragment_ops_cache =
+      util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL);
+
+   cell_init_vbuf(cell);
+
+   draw_set_rasterize_stage(cell->draw, cell->vbuf);
+
+   /* convert all points/lines to tris for the time being */
+   draw_wide_point_threshold(cell->draw, 0.0);
+   draw_wide_line_threshold(cell->draw, 0.0);
+
+   /* get env vars or read config file to get debug flags */
+   cell->debug_flags = debug_get_flags_option("CELL_DEBUG", 
+                                              cell_debug_flags, 
+                                              0 );
+
+   /*
+    * SPU stuff
+    */
+   cell->num_spus = 6;
+   /* XXX is this in SDK 3.0 only?
+   cell->num_spus = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);
+   */
+
+   cell_start_spus(cell);
+
+   cell_init_batch_buffers(cell);
+
+   return &cell->pipe;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
new file mode 100644
index 0000000000..80a9b3d7e1
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -0,0 +1,181 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_CONTEXT_H
+#define CELL_CONTEXT_H
+
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_vbuf.h"
+#include "cell_winsys.h"
+#include "cell/common.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "tgsi/tgsi_scan.h"
+#include "util/u_keymap.h"
+
+
+struct cell_vbuf_render;
+
+
+/**
+ * Cell vertex shader state, subclass of pipe_shader_state.
+ */
+struct cell_vertex_shader_state
+{
+   struct pipe_shader_state shader;
+   struct tgsi_shader_info info;
+   void *draw_data;
+};
+
+
+/**
+ * Cell fragment shader state, subclass of pipe_shader_state.
+ */
+struct cell_fragment_shader_state
+{
+   struct pipe_shader_state shader;
+   struct tgsi_shader_info info;
+   struct spe_function code;
+   void *data;
+};
+
+
+/**
+ * Key for mapping per-fragment state to cached SPU machine code.
+ *  keymap(cell_fragment_ops_key) => cell_command_fragment_ops
+ */
+struct cell_fragment_ops_key
+{
+   struct pipe_blend_state blend;
+   struct pipe_depth_stencil_alpha_state dsa;
+   enum pipe_format color_format;
+   enum pipe_format zs_format;
+};
+
+
+/**
+ * Per-context state, subclass of pipe_context.
+ */
+struct cell_context
+{
+   struct pipe_context pipe;
+
+   struct cell_winsys *winsys;
+
+   const struct pipe_blend_state *blend;
+   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   uint num_samplers;
+   const struct pipe_depth_stencil_alpha_state *depth_stencil;
+   const struct pipe_rasterizer_state *rasterizer;
+   const struct cell_vertex_shader_state *vs;
+   const struct cell_fragment_shader_state *fs;
+
+   struct spe_function logic_op;
+
+   struct pipe_blend_color blend_color;
+   struct pipe_clip_state clip;
+   struct pipe_constant_buffer constants[2];
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_poly_stipple poly_stipple;
+   struct pipe_scissor_state scissor;
+   struct cell_texture *texture[PIPE_MAX_SAMPLERS];
+   uint num_textures;
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   uint num_vertex_buffers;
+   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+   uint num_vertex_elements;
+
+   ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS];
+   ubyte *zsbuf_map;
+
+   struct pipe_surface *tex_surf;
+   uint *tex_map;
+
+   uint dirty;
+
+   /** Cache of code generated for per-fragment ops */
+   struct keymap *fragment_ops_cache;
+
+   /** The primitive drawing context */
+   struct draw_context *draw;
+   struct draw_stage *render_stage;
+
+   /** For post-transformed vertex buffering: */
+   struct cell_vbuf_render *vbuf_render;
+   struct draw_stage *vbuf;
+
+   struct vertex_info vertex_info;
+
+   /** Mapped constant buffers */
+   void *mapped_constants[PIPE_SHADER_TYPES];
+
+   struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
+
+   uint num_spus;
+
+   /** Buffers for command batches, vertex/index data */
+   uint buffer_size[CELL_NUM_BUFFERS];
+   ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+
+   int cur_batch;  /**< which buffer is being filled w/ commands */
+
+   /** [4] to ensure 16-byte alignment for each status word */
+   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
+
+
+   struct spe_function attrib_fetch;
+   unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
+
+   unsigned debug_flags;
+};
+
+
+
+
+static INLINE struct cell_context *
+cell_context(struct pipe_context *pipe)
+{
+   return (struct cell_context *) pipe;
+}
+
+
+extern struct pipe_context *
+cell_create_context(struct pipe_screen *screen, struct cell_winsys *cws);
+
+extern void
+cell_vertex_shader_queue_flush(struct draw_context *draw);
+
+
+/* XXX find a better home for this */
+extern void cell_update_vertex_fetch(struct draw_context *draw);
+
+
+#endif /* CELL_CONTEXT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
new file mode 100644
index 0000000000..880d535320
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -0,0 +1,191 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ *    Keith Whitwell
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_state.h"
+#include "cell_flush.h"
+
+#include "draw/draw_context.h"
+
+
+
+static void
+cell_map_constant_buffers(struct cell_context *sp)
+{
+   struct pipe_winsys *ws = sp->pipe.winsys;
+   uint i;
+   for (i = 0; i < 2; i++) {
+      if (sp->constants[i].size) {
+         sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer,
+                                                  PIPE_BUFFER_USAGE_CPU_READ);
+         cell_flush_buffer_range(sp, sp->mapped_constants[i], 
+                                 sp->constants[i].buffer->size);
+      }
+   }
+
+   draw_set_mapped_constant_buffer(sp->draw,
+                                   sp->mapped_constants[PIPE_SHADER_VERTEX],
+                                   sp->constants[PIPE_SHADER_VERTEX].size);
+}
+
+static void
+cell_unmap_constant_buffers(struct cell_context *sp)
+{
+   struct pipe_winsys *ws = sp->pipe.winsys;
+   uint i;
+   for (i = 0; i < 2; i++) {
+      if (sp->constants[i].size)
+         ws->buffer_unmap(ws, sp->constants[i].buffer);
+      sp->mapped_constants[i] = NULL;
+   }
+}
+
+
+
+/**
+ * Draw vertex arrays, with optional indexing.
+ * Basically, map the vertex buffers (and drawing surfaces), then hand off
+ * the drawing to the 'draw' module.
+ *
+ * XXX should the element buffer be specified/bound with a separate function?
+ */
+static boolean
+cell_draw_range_elements(struct pipe_context *pipe,
+                         struct pipe_buffer *indexBuffer,
+                         unsigned indexSize,
+                         unsigned min_index,
+                         unsigned max_index,
+                         unsigned mode, unsigned start, unsigned count)
+{
+   struct cell_context *sp = cell_context(pipe);
+   struct draw_context *draw = sp->draw;
+   unsigned i;
+
+   if (sp->dirty)
+      cell_update_derived( sp );
+
+#if 0
+   cell_map_surfaces(sp);
+#endif
+   cell_map_constant_buffers(sp);
+
+   /*
+    * Map vertex buffers
+    */
+   for (i = 0; i < sp->num_vertex_buffers; i++) {
+      void *buf = pipe_buffer_map(pipe->screen,
+                                           sp->vertex_buffer[i].buffer,
+                                           PIPE_BUFFER_USAGE_CPU_READ);
+      cell_flush_buffer_range(sp, buf, sp->vertex_buffer[i].buffer->size);
+      draw_set_mapped_vertex_buffer(draw, i, buf);
+   }
+   /* Map index buffer, if present */
+   if (indexBuffer) {
+      void *mapped_indexes = pipe_buffer_map(pipe->screen,
+                                                      indexBuffer,
+                                                      PIPE_BUFFER_USAGE_CPU_READ);
+      draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes);
+   }
+   else {
+      /* no index/element buffer */
+      draw_set_mapped_element_buffer(draw, 0, NULL);
+   }
+
+
+   /* draw! */
+   draw_arrays(draw, mode, start, count);
+
+   /*
+    * unmap vertex/index buffers - will cause draw module to flush
+    */
+   for (i = 0; i < sp->num_vertex_buffers; i++) {
+      draw_set_mapped_vertex_buffer(draw, i, NULL);
+      pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer);
+   }
+   if (indexBuffer) {
+      draw_set_mapped_element_buffer(draw, 0, NULL);
+      pipe_buffer_unmap(pipe->screen, indexBuffer);
+   }
+
+   /* Note: leave drawing surfaces mapped */
+   cell_unmap_constant_buffers(sp);
+
+   return TRUE;
+}
+
+
+static boolean
+cell_draw_elements(struct pipe_context *pipe,
+                   struct pipe_buffer *indexBuffer,
+                   unsigned indexSize,
+                   unsigned mode, unsigned start, unsigned count)
+{
+   return cell_draw_range_elements( pipe, indexBuffer,
+                                    indexSize,
+                                    0, 0xffffffff,
+                                    mode, start, count );
+}
+
+
+static boolean
+cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                     unsigned start, unsigned count)
+{
+   return cell_draw_elements(pipe, NULL, 0, mode, start, count);
+}
+
+
+static void
+cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags)
+{
+   struct cell_context *cell = cell_context(pipe);
+   draw_set_edgeflags(cell->draw, edgeflags);
+}
+
+
+
+void
+cell_init_draw_functions(struct cell_context *cell)
+{
+   cell->pipe.draw_arrays = cell_draw_arrays;
+   cell->pipe.draw_elements = cell_draw_elements;
+   cell->pipe.draw_range_elements = cell_draw_range_elements;
+   cell->pipe.set_edgeflags = cell_set_edgeflags;
+}
+
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
new file mode 100644
index 0000000000..148873aa67
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
@@ -0,0 +1,36 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_DRAW_ARRAYS_H
+#define CELL_DRAW_ARRAYS_H
+
+
+extern void
+cell_init_draw_functions(struct cell_context *cell);
+
+
+#endif /* CELL_DRAW_ARRAYS_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
new file mode 100644
index 0000000000..6596b72010
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -0,0 +1,111 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_flush.h"
+#include "cell_spu.h"
+#include "cell_render.h"
+#include "draw/draw_context.h"
+
+
+/**
+ * Called via pipe->flush()
+ */
+void
+cell_flush(struct pipe_context *pipe, unsigned flags,
+           struct pipe_fence_handle **fence)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   if (fence) {
+      *fence = NULL;
+      /* XXX: Implement real fencing */
+      flags |= CELL_FLUSH_WAIT;
+   }
+
+   if (flags & PIPE_FLUSH_SWAPBUFFERS)
+      flags |= CELL_FLUSH_WAIT;
+
+   draw_flush( cell->draw );
+   cell_flush_int(cell, flags);
+}
+
+
+/**
+ * Cell internal flush function.  Send the current batch buffer to all SPUs.
+ * If flags & CELL_FLUSH_WAIT, do not return until the SPUs are idle.
+ * \param flags  bitmask of flags CELL_FLUSH_WAIT, or zero
+ */
+void
+cell_flush_int(struct cell_context *cell, unsigned flags)
+{
+   static boolean flushing = FALSE;  /* recursion catcher */
+   uint i;
+
+   ASSERT(!flushing);
+   flushing = TRUE;
+
+   if (flags & CELL_FLUSH_WAIT) {
+      uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t));
+      *cmd = CELL_CMD_FINISH;
+   }
+
+   cell_batch_flush(cell);
+
+#if 0
+   /* Send CMD_FINISH to all SPUs */
+   for (i = 0; i < cell->num_spus; i++) {
+      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_FINISH);
+   }
+#endif
+
+   if (flags & CELL_FLUSH_WAIT) {
+      /* Wait for ack */
+      for (i = 0; i < cell->num_spus; i++) {
+         uint k = wait_mbox_message(cell_global.spe_contexts[i]);
+         assert(k == CELL_CMD_FINISH);
+      }
+   }
+
+   flushing = FALSE;
+}
+
+
+void
+cell_flush_buffer_range(struct cell_context *cell, void *ptr,
+			unsigned size)
+{
+   uint64_t batch[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)];
+   struct cell_buffer_range *br = (struct cell_buffer_range *) & batch[1];
+
+   batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE;
+   br->base = (uintptr_t) ptr;
+   br->size = size;
+   cell_batch_append(cell, batch, sizeof(batch));
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.h b/src/gallium/drivers/cell/ppu/cell_flush.h
new file mode 100644
index 0000000000..509ae6239a
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_flush.h
@@ -0,0 +1,45 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_FLUSH
+#define CELL_FLUSH
+
+#define CELL_FLUSH_WAIT 0x80000000
+
+extern void
+cell_flush(struct pipe_context *pipe, unsigned flags,
+           struct pipe_fence_handle **fence);
+
+extern void
+cell_flush_int(struct cell_context *cell, unsigned flags);
+
+extern void
+cell_flush_buffer_range(struct cell_context *cell, void *ptr,
+			unsigned size);
+
+#endif
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
new file mode 100644
index 0000000000..c8125a8a05
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -0,0 +1,1755 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU fragment program/shader code.
+ *
+ * Note that we generate SOA-style code here.  So each TGSI instruction
+ * operates on four pixels (and is translated into four SPU instructions,
+ * generally speaking).
+ *
+ * \author Brian Paul
+ */
+
+#include <math.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "util/u_memory.h"
+#include "cell_context.h"
+#include "cell_gen_fp.h"
+
+
+#define MAX_TEMPS 16
+#define MAX_IMMED  8
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+/**
+ * Context needed during code generation.
+ */
+struct codegen
+{
+   struct cell_context *cell;
+   int inputs_reg;      /**< 1st function parameter */
+   int outputs_reg;     /**< 2nd function parameter */
+   int constants_reg;   /**< 3rd function parameter */
+   int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
+   int imm_regs[MAX_IMMED][4];  /**< maps TGSI immediates to SPE registers */
+
+   int num_imm;  /**< number of immediates */
+
+   int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */
+
+   /** Per-instruction temps / intermediate temps */
+   int num_itemps;
+   int itemps[10];
+
+   /** Current IF/ELSE/ENDIF nesting level */
+   int if_nesting;
+   /** Index of execution mask register */
+   int exec_mask_reg;
+
+   int frame_size;  /**< Stack frame size, in words */
+
+   struct spe_function *f;
+   boolean error;
+};
+
+
+/**
+ * Allocate an intermediate temporary register.
+ */
+static int
+get_itemp(struct codegen *gen)
+{
+   int t = spe_allocate_available_register(gen->f);
+   assert(gen->num_itemps < Elements(gen->itemps));
+   gen->itemps[gen->num_itemps++] = t;
+   return t;
+}
+
+/**
+ * Free all intermediate temporary registers.  To be called after each
+ * instruction has been emitted.
+ */
+static void
+free_itemps(struct codegen *gen)
+{
+   int i;
+   for (i = 0; i < gen->num_itemps; i++) {
+      spe_release_register(gen->f, gen->itemps[i]);
+   }
+   gen->num_itemps = 0;
+}
+
+
+/**
+ * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}.
+ * The register is allocated and initialized upon the first call.
+ */
+static int
+get_const_one_reg(struct codegen *gen)
+{
+   if (gen->one_reg <= 0) {
+      gen->one_reg = spe_allocate_available_register(gen->f);
+
+      spe_indent(gen->f, 4);
+      spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
+
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(gen->f, gen->one_reg, 1.0f);
+
+      spe_indent(gen->f, -4);
+   }
+
+   return gen->one_reg;
+}
+
+
+/**
+ * Return index of the pixel execution mask.
+ * The register is allocated an initialized upon the first call.
+ *
+ * The pixel execution mask controls which pixels in a quad are
+ * modified, according to surrounding conditionals, loops, etc.
+ */
+static int
+get_exec_mask_reg(struct codegen *gen)
+{
+   if (gen->exec_mask_reg <= 0) {
+      gen->exec_mask_reg = spe_allocate_available_register(gen->f);
+
+      spe_indent(gen->f, 4);
+      spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
+
+      /* exec_mask = {~0, ~0, ~0, ~0} */
+      spe_load_int(gen->f, gen->exec_mask_reg, ~0);
+
+      spe_indent(gen->f, -4);
+   }
+
+   return gen->exec_mask_reg;
+}
+
+
+/**
+ * Return the index of the SPU temporary containing the named TGSI
+ * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we
+ * just return the corresponding SPE register.  If the TGIS register
+ * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register
+ * and emit an SPE load instruction.
+ */
+static int
+get_src_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_src_register *src)
+{
+   int reg = -1;
+   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   boolean reg_is_itemp = FALSE;
+   uint sign_op;
+
+   assert(swizzle >= TGSI_SWIZZLE_X);
+   assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
+
+   if (swizzle == TGSI_EXTSWIZZLE_ONE) {
+      /* Load const one float and early out */
+      reg = get_const_one_reg(gen);
+   }
+   else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
+      /* Load const zero float and early out */
+      reg = get_itemp(gen);
+      spe_xor(gen->f, reg, reg, reg);
+   }
+   else {
+      assert(swizzle < 4);
+
+      switch (src->SrcRegister.File) {
+      case TGSI_FILE_TEMPORARY:
+         reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
+         break;
+      case TGSI_FILE_INPUT:
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
+         }
+         break;
+      case TGSI_FILE_SAMPLER:
+         {
+            reg = 3; /* XXX total hack */
+         }
+         break;
+      default:
+         assert(0);
+      }
+   }
+
+   /*
+    * Handle absolute value, negate or set-negative of src register.
+    */
+   sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+   if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+      /*
+       * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
+       */
+      const int bit31mask_reg = get_itemp(gen);
+      int result_reg;
+
+      if (reg_is_itemp) {
+         /* re-use 'reg' for the result */
+         result_reg = reg;
+      }
+      else {
+         /* alloc a new reg for the result */
+         result_reg = get_itemp(gen);
+      }
+
+      /* mask with bit 31 set, the rest cleared */
+      spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+
+      if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
+         spe_andc(gen->f, result_reg, reg, bit31mask_reg);
+      }
+      else if (sign_op == TGSI_UTIL_SIGN_SET) {
+         spe_and(gen->f, result_reg, reg, bit31mask_reg);
+      }
+      else {
+         assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
+         spe_xor(gen->f, result_reg, reg, bit31mask_reg);
+      }
+
+      reg = result_reg;
+   }
+
+   return reg;
+}
+
+
+/**
+ * Return the index of an SPE register to use for the given TGSI register.
+ * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the
+ * corresponding SPE register is returned.  If the TGSI register is
+ * TGSI_FILE_OUTPUT we allocate an intermediate temporary register.
+ * See store_dest_reg() below...
+ */
+static int
+get_dst_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_dst_register *dest)
+{
+   int reg = -1;
+
+   switch (dest->DstRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      if (gen->if_nesting > 0)
+         reg = get_itemp(gen);
+      else
+         reg = gen->temp_regs[dest->DstRegister.Index][channel];
+      break;
+   case TGSI_FILE_OUTPUT:
+      reg = get_itemp(gen);
+      break;
+   default:
+      assert(0);
+   }
+
+   return reg;
+}
+
+
+/**
+ * When a TGSI instruction is writing to an output register, this
+ * function emits the SPE store instruction to store the value_reg.
+ * \param value_reg  the SPE register containing the value to store.
+ *                   This would have been returned by get_dst_reg().
+ */
+static void
+store_dest_reg(struct codegen *gen,
+               int value_reg, int channel,
+               const struct tgsi_full_dst_register *dest)
+{
+   switch (dest->DstRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      if (gen->if_nesting > 0) {
+         int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
+         int exec_reg = get_exec_mask_reg(gen);
+         /* Mix d with new value according to exec mask:
+          * d[i] = mask_reg[i] ? value_reg : d_reg
+          */
+         spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
+      }
+      else {
+         /* we're not inside a condition or loop: do nothing special */
+
+      }
+      break;
+   case TGSI_FILE_OUTPUT:
+      {
+         /* offset is measured in quadwords, not bytes */
+         int offset = dest->DstRegister.Index * 4 + channel;
+         if (gen->if_nesting > 0) {
+            int exec_reg = get_exec_mask_reg(gen);
+            int curval_reg = get_itemp(gen);
+            /* First read the current value from memory:
+             * Load:  curval = memory[(machine_reg) + offset]
+             */
+            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
+            /* Mix curval with newvalue according to exec mask:
+             * d[i] = mask_reg[i] ? value_reg : d_reg
+             */
+            spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
+            /* Store: memory[(machine_reg) + offset] = curval */
+            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
+         }
+         else {
+            /* Store: memory[(machine_reg) + offset] = reg */
+            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+
+static void
+emit_prologue(struct codegen *gen)
+{
+   gen->frame_size = 256+128; /* XXX temporary */
+
+   spe_comment(gen->f, -4, "Function prologue:");
+
+   /* save $lr on stack     # stqd $lr,16($sp) */
+   spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   /* save stack pointer    # stqd $sp,-frameSize($sp) */
+   spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+
+   /* adjust stack pointer  # ai $sp,$sp,-frameSize */
+   spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+}
+
+
+static void
+emit_epilogue(struct codegen *gen)
+{
+   spe_comment(gen->f, -4, "Function epilogue:");
+
+   /* restore stack pointer    # ai $sp,$sp,frameSize */
+   spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+
+   /* restore $lr              # lqd $lr,16($sp) */
+   spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   /* return from function call */
+   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+}
+
+
+static boolean
+emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, src_reg[4], dst_reg[4];
+   spe_comment(gen->f, -4, "MOV:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* XXX we don't always need to actually emit a mov instruction here */
+         spe_move(gen->f, dst_reg[ch], src_reg[ch]);
+         store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit addition instructions.  Recall that a single TGSI_OPCODE_ADD
+ * becomes (up to) four SPU "fa" instructions because we're doing SOA
+ * processing.
+ */
+static boolean
+emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
+
+   spe_comment(gen->f, -4, "ADD:");
+   /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
+   for (ch = 0; ch < 4; ch++) {
+      /* If the dest R, G, B or A writemask is enabled... */
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* Emit actual SPE instruction: d = s1 + s2 */
+         spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+         /* Free any intermediate temps we allocated */
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit subtract.  See emit_ADD for comments.
+ */
+static boolean
+emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
+   spe_comment(gen->f, -4, "SUB:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* d = s1 - s2 */
+         spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit multiply add.  See emit_ADD for comments.
+ */
+static boolean
+emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4];
+   spe_comment(gen->f, -4, "MAD:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* d = s1 * s2 + s3 */
+         spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit linear interpolate.  See emit_ADD for comments.
+ */
+static boolean
+emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "LERP:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = s3 + s1(s2 - s3) */
+         spe_fs(gen->f, d_reg, s2_reg, s3_reg);
+         spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit multiply.  See emit_ADD for comments.
+ */
+static boolean
+emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
+   spe_comment(gen->f, -4, "MUL:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* d = s1 * s2 */
+         spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit reciprocal.  See emit_ADD for comments.
+ */
+static boolean
+emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "RCP:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = 1/s1 */
+         spe_frest(gen->f, d_reg, s1_reg);
+         spe_fi(gen->f, d_reg, s1_reg, d_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit reciprocal sqrt.  See emit_ADD for comments.
+ */
+static boolean
+emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "RSQ:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = 1/s1 */
+         spe_frsqest(gen->f, d_reg, s1_reg);
+         spe_fi(gen->f, d_reg, s1_reg, d_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit absolute value.  See emit_ADD for comments.
+ */
+static boolean
+emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "ABS:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         const int bit31mask_reg = get_itemp(gen);
+
+         /* mask with bit 31 set, the rest cleared */  
+         spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+
+         /* d = sign bit cleared in s1 */
+         spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit 3 component dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int s1x_reg, s1y_reg, s1z_reg;
+   int s2x_reg, s2y_reg, s2z_reg;
+   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
+   spe_comment(gen->f, -4, "DP3:");
+
+   s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+
+   /* t0 = x0 * x1 */
+   spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg);
+
+   /* t1 = y0 * y1 */
+   spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg);
+
+   /* t0 = z0 * z1 + t0 */
+   spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg);
+
+   /* t0 = t0 + t1 */
+   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, t0_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit 4 component dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "DP4:");
+
+   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   int tmp_reg = get_itemp(gen);
+
+   /* t = x0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* t = y0 * y1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = z0 * z1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+   /* t = w0 * w1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
+         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit homogeneous dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "DPH:");
+
+   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   int tmp_reg = get_itemp(gen);
+
+   /* t = x0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* t = y0 * y1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = z0 * z1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+   /* t = w1 + t */
+   spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
+         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit cross product.  See emit_ADD for comments.
+ */
+static boolean
+emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   spe_comment(gen->f, -4, "XPD:");
+
+   int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   int tmp_reg = get_itemp(gen);
+
+   /* t = z0 * y1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = y0 * z1 - t */
+   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) {
+      store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]);
+   }
+
+   s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = x0 * z1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   /* t = z0 * x1 - t */
+   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) {
+      store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]);
+   }
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   /* t = y0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* t = x0 * y1 - t */
+   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) {
+      store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]);
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit set-if-greater-than.
+ * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
+ * the result but OpenGL/TGSI needs 0.0 and 1.0 results.
+ * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND.
+ */
+static boolean
+emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SGT:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 > s2) */
+         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_less-then.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SLT:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 < s2) */
+         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_greater-then-or-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SGE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 >= s2) */
+         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = ~d & one_reg */
+         spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_less-then-or-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SLE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 <= s2) */
+         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = ~d & one_reg */
+         spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SEQ:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 == s2) */
+         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_not_equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SNE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 != s2) */
+         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+         spe_nor(gen->f, d_reg, d_reg, d_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit compare.  See emit_SGT for comments.
+ */
+static boolean
+emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "CMP:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int zero_reg = get_itemp(gen);
+   
+         spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+         /* d = (s1 < 0) ? s2 : s3 */
+         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit trunc.  
+ * Convert float to signed int
+ * Convert signed int to float
+ */
+static boolean
+emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "TRUNC:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, d_reg, s1_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, d_reg, 0);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit floor.  
+ * If negative int subtract one
+ * Convert float to signed int
+ * Convert signed int to float
+ */
+static boolean
+emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FLR:");
+
+   int zero_reg = get_itemp(gen);
+   spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+   
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
+
+         /* If negative, subtract 1.0 */
+         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
+         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, d_reg, d_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, d_reg, 0);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit frac.  
+ * Input - FLR(Input)
+ */
+static boolean
+emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FLR:");
+
+   int zero_reg = get_itemp(gen);
+   spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
+
+         /* If negative, subtract 1.0 */
+         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
+         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, d_reg, d_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, d_reg, 0);
+
+         /* d = s1 - FLR(s1) */
+         spe_fs(gen->f, d_reg, s1_reg, d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+#if 0
+static void
+print_functions(struct cell_context *cell)
+{
+   struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i;
+   for (i = 0; i < funcs->num; i++) {
+      printf("SPU func %u: %s at %u\n",
+             i, funcs->names[i], funcs->addrs[i]);
+   }
+}
+#endif
+
+
+static uint
+lookup_function(struct cell_context *cell, const char *funcname)
+{
+   const struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i, addr = 0;
+   for (i = 0; i < funcs->num; i++) {
+      if (strcmp(funcs->names[i], funcname) == 0) {
+         addr = funcs->addrs[i];
+      }
+   }
+   assert(addr && "spu function not found");
+   return addr / 4;  /* discard 2 least significant bits */
+}
+
+
+/**
+ * Emit code to call a SPU function.
+ * Used to implement instructions like SIN/COS/POW/TEX/etc.
+ */
+static boolean
+emit_function_call(struct codegen *gen,
+                   const struct tgsi_full_instruction *inst,
+                   char *funcname, uint num_args)
+{
+   const uint addr = lookup_function(gen->cell, funcname);
+   char comment[100];
+   int ch;
+
+   assert(num_args <= 3);
+
+   snprintf(comment, sizeof(comment), "CALL %s:", funcname);
+   spe_comment(gen->f, -4, comment);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s_regs[3], d_reg;
+         ubyte usedRegs[SPE_NUM_REGS];
+         uint a, i, numUsed;
+
+         for (a = 0; a < num_args; a++) {
+            s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+         }
+         d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         numUsed = spe_get_registers_used(gen->f, usedRegs);
+         assert(numUsed < gen->frame_size / 16 - 32);
+
+         /* save registers to stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            int offset = 2 + i;
+            spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
+
+         /* setup function arguments */
+         for (a = 0; a < num_args; a++) {
+            spe_move(gen->f, 3 + a, s_regs[a]);
+         }
+
+         /* branch to function, save return addr */
+         spe_brasl(gen->f, SPE_REG_RA, addr);
+
+         /* save function's return value */
+         spe_move(gen->f, d_reg, 3);
+
+         /* restore registers from stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            if (reg != d_reg) {
+               int offset = 2 + i;
+               spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+            }
+         }
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+static boolean
+emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const uint addr = lookup_function(gen->cell, "spu_txp");
+   int ch;
+   int coord_regs[4], d_regs[4];
+
+   spe_comment(gen->f, -4, "CALL txp:");
+
+   /* get src/dst reg info */
+   for (ch = 0; ch < 4; ch++) {
+      coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+   }
+
+   {
+      ubyte usedRegs[SPE_NUM_REGS];
+      uint i, numUsed;
+
+      numUsed = spe_get_registers_used(gen->f, usedRegs);
+      assert(numUsed < gen->frame_size / 16 - 32);
+
+      /* save registers to stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         int offset = 2 + i;
+         spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+      }
+
+      /* setup function arguments */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, 3 + i, coord_regs[i]);
+      }
+
+      /* branch to function, save return addr */
+      spe_brasl(gen->f, SPE_REG_RA, addr);
+
+      /* save function's return values (four pixel's colors) */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, d_regs[i], 3 + i);
+      }
+
+      /* restore registers from stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         if (reg != d_regs[0] &&
+             reg != d_regs[1] &&
+             reg != d_regs[2] &&
+             reg != d_regs[3]) {
+            int offset = 2 + i;
+            spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Emit max.  See emit_SGT for comments.
+ */
+static boolean
+emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "MAX:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
+
+         /* d = (s1 > s2) ? s1 : s2 */
+         spe_fcgt(gen->f, tmp_reg, s1_reg, s2_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit max.  See emit_SGT for comments.
+ */
+static boolean
+emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "MIN:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
+
+         /* d = (s2 > s1) ? s1 : s2 */
+         spe_fcgt(gen->f, tmp_reg, s2_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+static boolean
+emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int channel = 0;
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "IF:");
+
+   /* update execution mask with the predicate register */
+   int tmp_reg = get_itemp(gen);
+   int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
+
+   /* tmp = (s1_reg == 0) */
+   spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
+   /* tmp = !tmp */
+   spe_complement(gen->f, tmp_reg, tmp_reg);
+   /* exec_mask = exec_mask & tmp */
+   spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
+
+   gen->if_nesting++;
+
+   free_itemps(gen);
+
+   return true;
+}
+
+
+static boolean
+emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "ELSE:");
+
+   /* exec_mask = !exec_mask */
+   spe_complement(gen->f, exec_reg, exec_reg);
+
+   return true;
+}
+
+
+static boolean
+emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "ENDIF:");
+
+   /* XXX todo: pop execution mask */
+
+   spe_load_int(gen->f, exec_reg, ~0x0);
+
+   gen->if_nesting--;
+   return true;
+}
+
+
+static boolean
+emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
+             boolean ddx)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         int t1_reg = get_itemp(gen);
+         int t2_reg = get_itemp(gen);
+
+         spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
+         if (ddx) {
+            spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
+         }
+         else {
+            spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
+         }
+         spe_fs(gen->f, d_reg, t2_reg, t1_reg);
+
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+
+
+/**
+ * Emit END instruction.
+ * We just return from the shader function at this point.
+ *
+ * Note that there may be more code after this that would be
+ * called by TGSI_OPCODE_CALL.
+ */
+static boolean
+emit_END(struct codegen *gen)
+{
+   spe_comment(gen->f, -4, "END:");
+   emit_epilogue(gen);
+   return true;
+}
+
+
+/**
+ * Emit code for the given instruction.  Just a big switch stmt.
+ */
+static boolean
+emit_instruction(struct codegen *gen,
+                 const struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+      return emit_MOV(gen, inst);
+   case TGSI_OPCODE_MUL:
+      return emit_MUL(gen, inst);
+   case TGSI_OPCODE_ADD:
+      return emit_ADD(gen, inst);
+   case TGSI_OPCODE_SUB:
+      return emit_SUB(gen, inst);
+   case TGSI_OPCODE_MAD:
+      return emit_MAD(gen, inst);
+   case TGSI_OPCODE_LERP:
+      return emit_LERP(gen, inst);
+   case TGSI_OPCODE_DP3:
+      return emit_DP3(gen, inst);
+   case TGSI_OPCODE_DP4:
+      return emit_DP4(gen, inst);
+   case TGSI_OPCODE_DPH:
+      return emit_DPH(gen, inst);
+   case TGSI_OPCODE_XPD:
+      return emit_XPD(gen, inst);
+   case TGSI_OPCODE_RCP:
+      return emit_RCP(gen, inst);
+   case TGSI_OPCODE_RSQ:
+      return emit_RSQ(gen, inst);
+   case TGSI_OPCODE_ABS:
+      return emit_ABS(gen, inst);
+   case TGSI_OPCODE_SGT:
+      return emit_SGT(gen, inst);
+   case TGSI_OPCODE_SLT:
+      return emit_SLT(gen, inst);
+   case TGSI_OPCODE_SGE:
+      return emit_SGE(gen, inst);
+   case TGSI_OPCODE_SLE:
+      return emit_SLE(gen, inst);
+   case TGSI_OPCODE_SEQ:
+      return emit_SEQ(gen, inst);
+   case TGSI_OPCODE_SNE:
+      return emit_SNE(gen, inst);
+   case TGSI_OPCODE_CMP:
+      return emit_CMP(gen, inst);
+   case TGSI_OPCODE_MAX:
+      return emit_MAX(gen, inst);
+   case TGSI_OPCODE_MIN:
+      return emit_MIN(gen, inst);
+   case TGSI_OPCODE_TRUNC:
+      return emit_TRUNC(gen, inst);
+   case TGSI_OPCODE_FLR:
+      return emit_FLR(gen, inst);
+   case TGSI_OPCODE_FRC:
+      return emit_FRC(gen, inst);
+   case TGSI_OPCODE_END:
+      return emit_END(gen);
+
+   case TGSI_OPCODE_COS:
+      return emit_function_call(gen, inst, "spu_cos", 1);
+   case TGSI_OPCODE_SIN:
+      return emit_function_call(gen, inst, "spu_sin", 1);
+   case TGSI_OPCODE_POW:
+      return emit_function_call(gen, inst, "spu_pow", 2);
+   case TGSI_OPCODE_EXPBASE2:
+      return emit_function_call(gen, inst, "spu_exp2", 1);
+   case TGSI_OPCODE_LOGBASE2:
+      return emit_function_call(gen, inst, "spu_log2", 1);
+   case TGSI_OPCODE_TEX:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXD:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXP:
+      return emit_TXP(gen, inst);
+
+   case TGSI_OPCODE_IF:
+      return emit_IF(gen, inst);
+   case TGSI_OPCODE_ELSE:
+      return emit_ELSE(gen, inst);
+   case TGSI_OPCODE_ENDIF:
+      return emit_ENDIF(gen, inst);
+
+   case TGSI_OPCODE_DDX:
+      return emit_DDX_DDY(gen, inst, true);
+   case TGSI_OPCODE_DDY:
+      return emit_DDX_DDY(gen, inst, false);
+
+   /* XXX lots more cases to do... */
+
+   default:
+      fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
+              inst->Instruction.Opcode);
+      return false;
+   }
+
+   return true;
+}
+
+
+
+/**
+ * Emit code for a TGSI immediate value (vector of four floats).
+ * This involves register allocation and initialization.
+ * XXX the initialization should be done by a "prepare" stage, not
+ * per quad execution!
+ */
+static boolean
+emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
+{
+   int ch;
+
+   assert(gen->num_imm < MAX_TEMPS);
+
+   spe_comment(gen->f, -4, "IMMEDIATE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      float val = immed->u.ImmediateFloat32[ch].Float;
+
+      if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) {
+         /* re-use previous register */
+         gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1];
+      }
+      else {
+         int reg = spe_allocate_available_register(gen->f);
+
+         if (reg < 0)
+            return false;
+
+         /* update immediate map */
+         gen->imm_regs[gen->num_imm][ch] = reg;
+
+         /* emit initializer instruction */
+         spe_load_float(gen->f, reg, val);
+      }
+   }
+
+   gen->num_imm++;
+
+   return true;
+}
+
+
+
+/**
+ * Emit "code" for a TGSI declaration.
+ * We only care about TGSI TEMPORARY register declarations at this time.
+ * For each TGSI TEMPORARY we allocate four SPE registers.
+ */
+static boolean
+emit_declaration(struct cell_context *cell,
+                 struct codegen *gen, const struct tgsi_full_declaration *decl)
+{
+   int i, ch;
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_TEMPORARY:
+      for (i = decl->DeclarationRange.First;
+           i <= decl->DeclarationRange.Last;
+           i++) {
+         assert(i < MAX_TEMPS);
+         for (ch = 0; ch < 4; ch++) {
+            gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+            if (gen->temp_regs[i][ch] < 0)
+               return false; /* out of regs */
+         }
+
+         /* XXX if we run out of SPE registers, we need to spill
+          * to SPU memory.  someday...
+          */
+
+         {
+            char buf[100];
+            sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i,
+                    gen->temp_regs[i][0], gen->temp_regs[i][1],
+                    gen->temp_regs[i][2], gen->temp_regs[i][3]);
+            spe_comment(gen->f, -4, buf);
+         }
+      }
+      break;
+   default:
+      ; /* ignore */
+   }
+
+   return true;
+}
+
+
+
+/**
+ * Translate TGSI shader code to SPE instructions.  This is done when
+ * the state tracker gives us a new shader (via pipe->create_fs_state()).
+ *
+ * \param cell    the rendering context (in)
+ * \param tokens  the TGSI shader (in)
+ * \param f       the generated function (out)
+ */
+boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f)
+{
+   struct tgsi_parse_context parse;
+   struct codegen gen;
+
+   memset(&gen, 0, sizeof(gen));
+   gen.cell = cell;
+   gen.f = f;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   gen.inputs_reg = 3;     /* pointer to inputs array */
+   gen.outputs_reg = 4;    /* pointer to outputs array */
+   gen.constants_reg = 5;  /* pointer to constants array */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, gen.inputs_reg);
+   spe_allocate_register(f, gen.outputs_reg);
+   spe_allocate_register(f, gen.constants_reg);
+
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, true);
+      spe_indent(f, 8);
+      printf("Begin %s\n", __FUNCTION__);
+      tgsi_dump(tokens, 0);
+   }
+
+   tgsi_parse_init(&parse, tokens);
+
+   emit_prologue(&gen);
+
+   while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
+            gen.error = true;
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
+            gen.error = true;
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
+            gen.error = true;
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   if (gen.error) {
+      /* terminate the SPE code */
+      return emit_END(&gen);
+   }
+
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+      printf("End %s\n", __FUNCTION__);
+   }
+
+   tgsi_parse_free( &parse );
+
+   return !gen.error;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
new file mode 100644
index 0000000000..99faea7046
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef CELL_GEN_FP_H
+#define CELL_GEN_FP_H
+
+
+
+extern boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f);
+
+
+#endif /* CELL_GEN_FP_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
new file mode 100644
index 0000000000..de170d1036
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -0,0 +1,2102 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU per-fragment code (actually per-quad code).
+ * \author Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "cell_context.h"
+#include "cell_gen_fragment.h"
+
+
+
+/** Do extra optimizations? */
+#define OPTIMIZATIONS 1
+
+
+/**
+ * Generate SPE code to perform Z/depth testing.
+ *
+ * \param dsa         Gallium depth/stencil/alpha state to gen code for
+ * \param f           SPE function to append instruction onto.
+ * \param mask_reg    register containing quad/pixel "alive" mask (in/out)
+ * \param ifragZ_reg  register containing integer fragment Z values (in)
+ * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
+ * \param zmask_reg   register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
+ */
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
+               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
+{
+   /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
+    * quantities.  This only makes a difference for 32-bit Z values though.
+    */
+   ASSERT(dsa->depth.enabled);
+
+   switch (dsa->depth.func) {
+   case PIPE_FUNC_EQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* zmask = (ifragZ > ref) */
+      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* zmask = (ref > ifragZ) */
+      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* zmask = (ifragZ > ref) */
+      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* zmask = (ref > ifragZ) */
+      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */
+      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* mask unchanged */
+      spe_il(f, zmask_reg, ~0);  /* zmask = {~0,~0,~0,~0} */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+   if (dsa->depth.writemask) {
+      /*
+       * If (ztest passed) {
+       *    framebufferZ = fragmentZ;
+       * }
+       * OR,
+       * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
+       */
+      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+      return true;
+   }
+
+   return false;
+}
+
+
+/**
+ * Generate SPE code to perform alpha testing.
+ *
+ * \param dsa        Gallium depth/stencil/alpha state to gen code for
+ * \param f          SPE function to append instruction onto.
+ * \param mask_reg   register containing quad/pixel "alive" mask (in/out)
+ * \param fragA_reg  register containing four fragment alpha values (in)
+ */
+static void
+gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f, int mask_reg, int fragA_reg)
+{
+   int ref_reg = spe_allocate_available_register(f);
+   int amask_reg = spe_allocate_available_register(f);
+
+   ASSERT(dsa->alpha.enabled);
+
+   if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
+       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      /* load/splat the alpha reference float value */
+      spe_load_float(f, ref_reg, dsa->alpha.ref);
+   }
+
+   /* emit code to do the alpha comparison, updating 'mask' */
+   switch (dsa->alpha.func) {
+   case PIPE_FUNC_EQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = [0,0,0,0] */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* no-op, mask unchanged */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+#if OPTIMIZATIONS
+   /* if mask == {0,0,0,0} we're all done, return */
+   {
+      /* re-use amask reg here */
+      int tmp_reg = amask_reg;
+      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
+      spe_orx(f, tmp_reg, mask_reg);
+      /* if tmp[0] == 0 then return from function call */
+      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
+   }
+#endif
+
+   spe_release_register(f, ref_reg);
+   spe_release_register(f, amask_reg);
+}
+
+/* This pair of functions is used inline to allocate and deallocate
+ * optional constant registers.  Once a constant is discovered to be 
+ * needed, we will likely need it again, so we don't want to deallocate
+ * it and have to allocate and load it again unnecessarily.
+ */
+static inline void
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
+{
+   if (*is_already_set) return;
+   *r = spe_allocate_available_register(f);
+}
+
+static inline void
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    if (!*is_already_set) return;
+    spe_release_register(f, r);
+    *is_already_set = false;
+}
+
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   setup_optional_register(f, is_already_set, r);
+   spe_load_float(f, *r, value);
+}
+
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    release_optional_register(f, is_already_set, r);
+}
+
+/**
+ * Generate SPE code to implement the given blend mode for a quad of pixels.
+ * \param f          SPE function to append instruction onto.
+ * \param fragR_reg  register with fragment red values (float) (in/out)
+ * \param fragG_reg  register with fragment green values (float) (in/out)
+ * \param fragB_reg  register with fragment blue values (float) (in/out)
+ * \param fragA_reg  register with fragment alpha values (float) (in/out)
+ * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
+ */
+static void
+gen_blend(const struct pipe_blend_state *blend,
+          const struct pipe_blend_color *blend_color,
+          struct spe_function *f,
+          enum pipe_format color_format,
+          int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
+          int fbRGBA_reg)
+{
+   int term1R_reg = spe_allocate_available_register(f);
+   int term1G_reg = spe_allocate_available_register(f);
+   int term1B_reg = spe_allocate_available_register(f);
+   int term1A_reg = spe_allocate_available_register(f);
+
+   int term2R_reg = spe_allocate_available_register(f);
+   int term2G_reg = spe_allocate_available_register(f);
+   int term2B_reg = spe_allocate_available_register(f);
+   int term2A_reg = spe_allocate_available_register(f);
+
+   int fbR_reg = spe_allocate_available_register(f);
+   int fbG_reg = spe_allocate_available_register(f);
+   int fbB_reg = spe_allocate_available_register(f);
+   int fbA_reg = spe_allocate_available_register(f);
+
+   int tmp_reg = spe_allocate_available_register(f);
+
+   /* Optional constant registers we might or might not end up using;
+    * if we do use them, make sure we only allocate them once by
+    * keeping a flag on each one.
+    */
+   boolean one_reg_set = false;
+   unsigned int one_reg;
+   boolean constR_reg_set = false, constG_reg_set = false, 
+      constB_reg_set = false, constA_reg_set = false;
+   unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
+
+   ASSERT(blend->blend_enable);
+
+   /* Unpack/convert framebuffer colors from four 32-bit packed colors
+    * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
+    * Each 8-bit color component is expanded into a float in [0.0, 1.0].
+    */
+   {
+      int mask_reg = spe_allocate_available_register(f);
+
+      /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
+      spe_load_int(f, mask_reg, 0xff);
+
+      /* XXX there may be more clever ways to implement the following code */
+      switch (color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         /* fbB = fbB & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 8 */
+         spe_roti(f, fbG_reg, fbG_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 16 */
+         spe_roti(f, fbR_reg, fbR_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbA = fbRGBA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* fbA = fbA >> 24 */
+         spe_roti(f, fbA_reg, fbA_reg, -24);
+         break;
+
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         /* fbA = fbA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 8 */
+         spe_roti(f, fbR_reg, fbR_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 16 */
+         spe_roti(f, fbG_reg, fbG_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbB = fbRGBA & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* fbB = fbB >> 24 */
+         spe_roti(f, fbB_reg, fbB_reg, -24);
+         break;
+
+      default:
+         ASSERT(0);
+      }
+
+      /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
+      spe_cuflt(f, fbR_reg, fbR_reg, 8);
+      spe_cuflt(f, fbG_reg, fbG_reg, 8);
+      spe_cuflt(f, fbB_reg, fbB_reg, 8);
+      spe_cuflt(f, fbA_reg, fbA_reg, 8);
+
+      spe_release_register(f, mask_reg);
+   }
+
+   /*
+    * Compute Src RGB terms.  We're actually looking for the value
+    * of (the appropriate RGB factors) * (the incoming source RGB color),
+    * because in some cases (like PIPE_BLENDFACTOR_ONE and 
+    * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
+    */
+   switch (blend->rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (R,G,B) */
+      spe_move(f, term1R_reg, fragR_reg);
+      spe_move(f, term1G_reg, fragG_reg);
+      spe_move(f, term1B_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factors = (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term1R_reg, 0.0f);
+      spe_load_float(f, term1G_reg, 0.0f);
+      spe_load_float(f, term1B_reg, 0.0f);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
+      spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
+      spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) 
+       * or in other words term = (R-R*R, G-G*G, B-B*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+       * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+       * or term = (R-R*A,G-G*A,B-B*A)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) 
+       * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+      spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+      spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) 
+       * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* We'll need the optional {1,1,1,1} register */
+      setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
+      /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
+       * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+       * as long as a is positive), but then we'd have to do three
+       * spe_float_min() functions instead of one, so this is simpler.
+       */
+      /* tmp = 1 - Afb */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* tmp = min(A,tmp) */
+      spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
+      /* term = R*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Src Alpha term.  Like the above, we're looking for
+    * the full term A*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
+    */
+   switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term1A_reg, 0.0f);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
+   case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = A */
+      spe_move(f, term1A_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = A*A */
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = A*(1-A) = A-A*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = A*Afb */
+      spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = A*Ac */
+      spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest RGB term.  Like the above, we're looking for
+    * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
+    */
+   switch (blend->rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
+      spe_move(f, term2R_reg, fbR_reg);
+      spe_move(f, term2G_reg, fbG_reg);
+      spe_move(f, term2B_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor s= (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term2R_reg, 0.0f);
+      spe_load_float(f, term2G_reg, 0.0f);
+      spe_load_float(f, term2B_reg, 0.0f);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
+      spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) 
+       * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
+      spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+       * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) 
+       * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+      spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+      spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) 
+       * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest Alpha term.  Like the above, we're looking for
+    * the full term Afb*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
+    */
+   switch (blend->alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = Afb */
+      spe_move(f, term2A_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term2A_reg, 0.0f);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = Afb*A */
+      spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = Afb*Afb */
+      spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = Afb*Ac */
+      spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest RGB terms as per the blend equation.
+    */
+   switch (blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
+      spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
+      spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_MAX:
+      spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest A term
+    */
+   switch (blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_MAX:
+      spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   default:
+      ASSERT(0);
+   }
+
+   spe_release_register(f, term1R_reg);
+   spe_release_register(f, term1G_reg);
+   spe_release_register(f, term1B_reg);
+   spe_release_register(f, term1A_reg);
+
+   spe_release_register(f, term2R_reg);
+   spe_release_register(f, term2G_reg);
+   spe_release_register(f, term2B_reg);
+   spe_release_register(f, term2A_reg);
+
+   spe_release_register(f, fbR_reg);
+   spe_release_register(f, fbG_reg);
+   spe_release_register(f, fbB_reg);
+   spe_release_register(f, fbA_reg);
+
+   spe_release_register(f, tmp_reg);
+
+   /* Free any optional registers that actually got used */
+   release_const_register(f, &one_reg_set, one_reg);
+   release_const_register(f, &constR_reg_set, constR_reg);
+   release_const_register(f, &constG_reg_set, constG_reg);
+   release_const_register(f, &constB_reg_set, constB_reg);
+   release_const_register(f, &constA_reg_set, constA_reg);
+}
+
+
+static void
+gen_logicop(const struct pipe_blend_state *blend,
+            struct spe_function *f,
+            int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+   ASSERT(blend->logicop_enable);
+
+   switch(blend->logicop_func) {
+      case PIPE_LOGICOP_CLEAR: /* 0 */
+         spe_zero(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOR: /* ~(s | d) */
+         spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_INVERT: /* ~d */
+         /* Note that (A nor A) == ~(A|A) == ~A */
+         spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_XOR: /* s ^ d */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NAND: /* ~(s & d) */
+         spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND: /* s & d */
+         spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOOP: /* d */
+         spe_move(f, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY: /* s */
+         break;
+      case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR: /* s | d */
+         spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_SET: /* 1 */
+         spe_load_int(f, fragRGBA_reg, 0xffffffff);
+         break;
+      default:
+         ASSERT(0);
+   }
+}
+
+
+/**
+ * Generate code to pack a quad of float colors into four 32-bit integers.
+ *
+ * \param f             SPE function to append instruction onto.
+ * \param color_format  the dest color packing format
+ * \param r_reg         register containing four red values (in/clobbered)
+ * \param g_reg         register containing four green values (in/clobbered)
+ * \param b_reg         register containing four blue values (in/clobbered)
+ * \param a_reg         register containing four alpha values (in/clobbered)
+ * \param rgba_reg      register to store the packed RGBA colors (out)
+ */
+static void
+gen_pack_colors(struct spe_function *f,
+                enum pipe_format color_format,
+                int r_reg, int g_reg, int b_reg, int a_reg,
+                int rgba_reg)
+{
+   int rg_reg = spe_allocate_available_register(f);
+   int ba_reg = spe_allocate_available_register(f);
+
+   /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
+   spe_cfltu(f, r_reg, r_reg, 32);
+   spe_cfltu(f, g_reg, g_reg, 32);
+   spe_cfltu(f, b_reg, b_reg, 32);
+   spe_cfltu(f, a_reg, a_reg, 32);
+
+   /* Shift the most significant bytes to the least significant positions.
+    * I.e.: reg = reg >> 24
+    */
+   spe_rotmi(f, r_reg, r_reg, -24);
+   spe_rotmi(f, g_reg, g_reg, -24);
+   spe_rotmi(f, b_reg, b_reg, -24);
+   spe_rotmi(f, a_reg, a_reg, -24);
+
+   /* Shift the color bytes according to the surface format */
+   if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
+      spe_roti(f, g_reg, g_reg, 8);   /* green <<= 8 */
+      spe_roti(f, r_reg, r_reg, 16);  /* red <<= 16 */
+      spe_roti(f, a_reg, a_reg, 24);  /* alpha <<= 24 */
+   }
+   else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+      spe_roti(f, r_reg, r_reg, 8);   /* red <<= 8 */
+      spe_roti(f, g_reg, g_reg, 16);  /* green <<= 16 */
+      spe_roti(f, b_reg, b_reg, 24);  /* blue <<= 24 */
+   }
+   else {
+      ASSERT(0);
+   }
+
+   /* Merge red, green, blue, alpha registers to make packed RGBA colors.
+    * Eg: after shifting according to color_format we might have:
+    *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
+    *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
+    *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
+    *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
+    * OR-ing all those together gives us four packed colors:
+    *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
+    */
+   spe_or(f, rg_reg, r_reg, g_reg);
+   spe_or(f, ba_reg, a_reg, b_reg);
+   spe_or(f, rgba_reg, rg_reg, ba_reg);
+
+   spe_release_register(f, rg_reg);
+   spe_release_register(f, ba_reg);
+}
+
+static void
+gen_colormask(struct spe_function *f,
+              uint colormask,
+              enum pipe_format color_format,
+              int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.  Further, the pixels
+    * are packed according to the given color format, not
+    * necessarily RGBA...
+    */
+   unsigned int r_mask;
+   unsigned int g_mask;
+   unsigned int b_mask;
+   unsigned int a_mask;
+
+   /* Calculate exactly where the bits for any particular color
+    * end up, so we can mask them correctly.
+    */
+   switch(color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         /* ARGB */
+         a_mask = 0xff000000;
+         r_mask = 0x00ff0000;
+         g_mask = 0x0000ff00;
+         b_mask = 0x000000ff;
+         break;
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         /* BGRA */
+         b_mask = 0xff000000;
+         g_mask = 0x00ff0000;
+         r_mask = 0x0000ff00;
+         a_mask = 0x000000ff;
+         break;
+      default:
+         ASSERT(0);
+   }
+
+   /* For each R, G, B, and A component we're supposed to mask out, 
+    * clear its bits.   Then our mask operation later will work 
+    * as expected.
+    */
+   if (!(colormask & PIPE_MASK_R)) {
+      r_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_G)) {
+      g_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_B)) {
+      b_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_A)) {
+      a_mask = 0;
+   }
+
+   /* Get a temporary register to hold the mask that will be applied to the fragment */
+   int colormask_reg = spe_allocate_available_register(f);
+
+   /* The actual mask we're going to use is an OR of the remaining R, G, B, and A
+    * masks.  Load the result value into our temporary register.
+    */
+   spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
+
+   /* Use the mask register to select between the fragment color
+    * values and the frame buffer color values.  Wherever the
+    * mask has a 0 bit, the current frame buffer color should override
+    * the fragment color.  Wherever the mask has a 1 bit, the 
+    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM)
+    * instruction will select bits from its first operand rA wherever the
+    * the mask bits rM are 0, and from its second operand rB wherever the
+    * mask bits rM are 1.  That means that the frame buffer color is the
+    * first operand, and the fragment color the second.
+    */
+    spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
+
+    /* Release the temporary register and we're done */
+    spe_release_register(f, colormask_reg);
+}
+
+/* This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value.  As such, we have
+ * access to the Compare Immediate instructions where we don't in 
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test.  The bitmask of valid
+ * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, 
+                 unsigned int mask_reg, unsigned int fbS_reg, 
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+    * register, taking into account whether each fragment was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      /* stencil_pass = mask & (s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      /* stencil_fail = mask & ~stencil_pass */
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* stencil_pass = mask & ~(s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* stencil_pass = mask & (s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LESS: {
+      /* stencil_pass = mask & (reference > s) */
+      /* There's no convenient Compare Less Than Immediate instruction, so
+       * we'll have to do this one the harder way, by loading a register and 
+       * comparing directly.  Compare Logical Greater Than Word (clgt) 
+       * treats its operands as unsigned - no sign extension.
+       */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_LEQUAL:
+      /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL: {
+      /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
+      /* As above, we have to do this by loading a register */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = mask & 0 = 0 */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      spe_move(f, stencil_pass_reg, mask_reg);  /* zmask = mask */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = mask & 1 = mask */
+      spe_move(f, stencil_pass_reg, mask_reg);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply.  It does not
+ * apply any tests.  It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes the the stencil_max_value is of the form 
+    * 2^n-1 and can therefore be used as a mask for the valid bits in 
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a 
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits 
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values.  For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the 
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+                       unsigned int fbS_reg, 
+                       unsigned int *fail_reg, unsigned int *zfail_reg, 
+                       unsigned int *zpass_reg, unsigned int *back_fail_reg, 
+                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+   unsigned zfail_op, back_zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(dsa->stencil[0].enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes.  In particular, that means that the "zfail_op" (and the backfacing
+    * counterpart, if active) are not considered - a failing stencil test will
+    * trigger the "fail_op", and a passing stencil test will trigger the
+    * "zpass_op".
+    *
+    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+    * we keep them from being calculated.
+    */
+   if (dsa->depth.enabled) {
+      zfail_op = dsa->stencil[0].zfail_op;
+      back_zfail_op = dsa->stencil[1].zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+      back_zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == dsa->stencil[0].fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zpass_reg);
+   }
+
+   /* If two-sided stencil is enabled, we have more work to do. */
+   if (!dsa->stencil[1].enabled) {
+      /* This just flags that the registers need not be deallocated later */
+      *back_fail_reg = fbS_reg;
+      *back_zfail_reg = fbS_reg;
+      *back_zpass_reg = fbS_reg;
+   }
+   else {
+      /* Same calculations as above, but for the back stencil */
+      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_fail_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+         *back_fail_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == zfail_op) {
+         *back_fail_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+         *back_fail_reg = *zpass_reg;
+      }
+      else {
+         *back_fail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_fail_reg);
+      }
+
+      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zfail_reg = fbS_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].fail_op) {
+         *back_zfail_reg = *fail_reg;
+      }
+      else if (back_zfail_op == zfail_op) {
+         *back_zfail_reg = *zfail_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
+         *back_zfail_reg = *zpass_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[1].fail_op) {
+         *back_zfail_reg = *back_fail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zfail_reg);
+      }
+
+      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zpass_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+         *back_zpass_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == zfail_op) {
+         *back_zpass_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+         *back_zpass_reg = *zpass_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+         *back_zpass_reg = *back_fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+         *back_zpass_reg = *back_zfail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zpass_reg);
+      }
+   } /* End of calculations for back-facing stencil */
+}
+
+static boolean
+gen_stencil_depth_test(struct spe_function *f, 
+                       const struct pipe_depth_stencil_alpha_state *dsa, 
+                       const int const facing_reg,
+                       const int mask_reg, const int fragZ_reg, 
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = false;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   unsigned int stencil_pass_reg, stencil_fail_reg;
+   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   unsigned int stencil_writemask_reg;
+   unsigned int zmask_reg;
+   unsigned int newS_reg;
+
+   /* Stenciling is quite complex: up to six different configurable stencil 
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely 
+    * be identical, so there's good reason to try to avoid calculating 
+    * the same values more than once (which unfortunately makes the code less 
+    * straightforward).
+    *
+    * To make register management easier, we start a new 
+    * register set; we can release all the registers in the set at
+    * once, and avoid having to keep track of exactly which registers
+    * we allocate.  We can still allocate and free registers as 
+    * desired (if we know we no longer need a register), but we don't
+    * have to spend the complexity to track the more difficult variant
+    * register usage scenarios.
+    */
+   spe_allocate_register_set(f);
+
+   /* Calculate the writemask.  If the writemask is trivial (either
+    * all 0s, meaning that we don't need to calculate any stencil values
+    * because they're not going to change the stencil anyway, or all 1s,
+    * meaning that we have to calculate the stencil values but do not
+    * need to mask them), we can avoid generating code.  Don't forget
+    * that we need to consider backfacing stencil, if enabled.
+    */
+   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Trivial: don't need to calculate stencil values, and don't need to 
+       * write them back to the framebuffer.
+       */
+      need_to_calculate_stencil_values = false;
+      need_to_writemask_stencil_values = false;
+   }
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.
+       */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = false;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = true;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+         spe_release_register(f, back_write_mask_reg);
+      }
+   }
+
+   /* At least one-sided stenciling must be on.  Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* If two-sided stenciling is on, generate code to run the stencil
+    * test on the backfacing stencil as well, and combine the two results
+    * into the one correct result based on facing.
+    */
+   if (dsa->stencil[1].enabled) {
+      unsigned int temp_reg = spe_allocate_available_register(f);
+      gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+      spe_release_register(f, temp_reg);
+   }
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  We
+    * have to do this before we run a depth test (because the
+    * depth test should not be performed on fragments that failed the
+    * stencil test, and because the depth test will update the 
+    * mask of valid fragments based on the results of the depth test).
+    */
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This code will work for one-sided
+       * and two-sided stenciling (so that we take into account that operations
+       * may match between front and back stencils), and will also take into
+       * account whether the depth test is enabled (if the depth test is off,
+       * we don't need any of the zfail results, because the depth test always
+       * is considered to pass if it is disabled).  Any register value that
+       * does not need to be calculated will come back with the same value
+       * that's in fbS_reg.
+       *
+       * This function will allocate a variant number of registers that
+       * will be released as part of the register set.
+       */
+      gen_get_stencil_values(f, dsa, fbS_reg, 
+         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
+         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
+         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+      /* Tricky, tricky, tricky - the things we do to create optimal
+       * code...
+       *
+       * The various stencil values registers may overlap with each other
+       * and with fbS_reg arbitrarily (as any particular operation is
+       * only calculated once and stored in one register, no matter
+       * how many times it is used).  So we can't change the values 
+       * within those registers directly - if we change a value in a
+       * register that's being referenced by two different calculations,
+       * we've just unwittingly changed the second value as well...
+       *
+       * Avoid this by allocating new registers to hold the results
+       * (there may be 2, if the depth test is off, or 3, if it is on).
+       * These will be released as part of the register set.
+       */
+      if (!dsa->stencil[1].enabled) {
+         /* The easy case: if two-sided stenciling is *not* enabled, we
+          * just use the front-sided values.
+          */
+         stencil_fail_values = front_stencil_fail_values;
+         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+      }
+      else { /* two-sided stencil enabled */
+         /* Allocate new registers for the needed merged values */
+         stencil_fail_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+         if (dsa->depth.enabled) {
+            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+         }
+         else {
+            stencil_pass_depth_fail_values = fbS_reg;
+         }
+         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+      }
+   }
+
+   /* We now have all the stencil values we need.  We also need 
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+      /* If we need to writemask the stencil values before going into
+       * the stencil buffer, we'll have to use a new register to
+       * hold the new values.  If not, we can just keep using the
+       * current register.
+       */
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_move(f, newS_reg, fbS_reg);
+         modified_buffers = true;
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values */
+      if (stencil_fail_values != fbS_reg) {
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+      }
+
+      /* Same for the stencil pass/depth fail values.  If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+      }
+
+      /* Same for the stencil pass/depth pass mask */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+         spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+         spe_release_register(f, stencil_pass_depth_pass_mask);
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in the fbS_reg register passed in.  If we don't need
+       * to writemask, then the results are *already* in the fbS_reg,
+       * so there's nothing more to do.
+       */
+
+      if (need_to_writemask_stencil_values) {
+         /* The Select Bytes command makes a fine writemask.  Where
+          * the mask is 0, the first (original) values are retained,
+          * effectively masking out changes.  Where the mask is 1, the
+          * second (new) values are retained, incorporating changes.
+          */
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * We're all done, except that we've allocated a fair number
+    * of registers that we didn't bother tracking.  Release all
+    * those registers as part of the register set, and go home.
+    */
+   spe_release_register_set(f);
+
+   /* Return true if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
+
+
+/**
+ * Generate SPE code to implement the fragment operations (alpha test,
+ * depth test, stencil test, blending, colormask, and final
+ * framebuffer write) as specified by the current context state.
+ *
+ * Logically, this code will be called after running the fragment
+ * shader.  But under some circumstances we could run some of this
+ * code before the fragment shader to cull fragments/quads that are
+ * totally occluded/discarded.
+ *
+ * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
+ *
+ * See the spu_default_fragment_ops() function to see how the per-fragment
+ * operations would be done with ordinary C code.
+ * The code we generate here though has no branches, is SIMD, etc and
+ * should be much faster.
+ *
+ * \param cell  the rendering context (in)
+ * \param f     the generated function (out)
+ */
+void
+cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+{
+   const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
+   const struct pipe_blend_state *blend = cell->blend;
+   const struct pipe_blend_color *blend_color = &cell->blend_color;
+   const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   const int x_reg = 3;  /* uint */
+   const int y_reg = 4;  /* uint */
+   const int color_tile_reg = 5;  /* tile_t * */
+   const int depth_tile_reg = 6;  /* tile_t * */
+   const int fragZ_reg = 7;   /* vector float */
+   const int fragR_reg = 8;   /* vector float */
+   const int fragG_reg = 9;   /* vector float */
+   const int fragB_reg = 10;  /* vector float */
+   const int fragA_reg = 11;  /* vector float */
+   const int mask_reg = 12;   /* vector uint */
+   const int facing_reg = 13; /* uint */
+
+   /* offset of quad from start of tile
+    * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
+    */
+   int quad_offset_reg;
+
+   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
+   int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, true);
+      spe_indent(f, 8);
+      spe_comment(f, -4, "Begin per-fragment ops");
+   }
+
+   spe_allocate_register(f, x_reg);
+   spe_allocate_register(f, y_reg);
+   spe_allocate_register(f, color_tile_reg);
+   spe_allocate_register(f, depth_tile_reg);
+   spe_allocate_register(f, fragZ_reg);
+   spe_allocate_register(f, fragR_reg);
+   spe_allocate_register(f, fragG_reg);
+   spe_allocate_register(f, fragB_reg);
+   spe_allocate_register(f, fragA_reg);
+   spe_allocate_register(f, mask_reg);
+   spe_allocate_register(f, facing_reg);
+
+   quad_offset_reg = spe_allocate_available_register(f);
+   fbRGBA_reg = spe_allocate_available_register(f);
+   fbZS_reg = spe_allocate_available_register(f);
+
+   /* compute offset of quad from start of tile, in bytes */
+   {
+      int x2_reg = spe_allocate_available_register(f);
+      int y2_reg = spe_allocate_available_register(f);
+
+      ASSERT(TILE_SIZE == 32);
+
+      spe_comment(f, 0, "Compute quad offset within tile");
+      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
+      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
+      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
+      spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */
+      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */
+
+      spe_release_register(f, x2_reg);
+      spe_release_register(f, y2_reg);
+   }
+
+   if (dsa->alpha.enabled) {
+      gen_alpha_test(dsa, f, mask_reg, fragA_reg);
+   }
+
+   /* If we need the stencil buffers (because one- or two-sided stencil is
+    * enabled) or the depth buffer (because the depth test is enabled),
+    * go grab them.  Note that if either one- or two-sided stencil is
+    * enabled, dsa->stencil[0].enabled will be true.
+    */
+   if (dsa->depth.enabled || dsa->stencil[0].enabled) {
+      const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
+      boolean write_depth_stencil;
+
+      /* We may or may not need to allocate a register for Z or stencil values */
+      boolean fbS_reg_set = false, fbZ_reg_set = false;
+      unsigned int fbS_reg, fbZ_reg = 0;
+
+      spe_comment(f, 0, "Fetch quad's Z/stencil values from tile");
+
+      /* fetch quad of depth/stencil values from tile at (x,y) */
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      /* From the Z/stencil buffer format, pull out the bits we need for
+       * Z and/or stencil.  We'll also convert the incoming fragment Z
+       * value in fragZ_reg from a floating point value in [0.0..1.0] to
+       * an unsigned integer value with the appropriate resolution.
+       */
+      switch(zs_format) {
+
+         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+         case PIPE_FORMAT_X8Z24_UNORM:
+            if (dsa->depth.enabled) {
+               /* We need the Z part at least */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* four 24-bit Z values in the low-order bits */
+               spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* four 8-bit Z values in the high-order bits */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+            }
+            break;
+
+         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+         case PIPE_FORMAT_Z24X8_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* shift by 8 to get the upper 24-bit values */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* 8-bit stencil in the low-order bits - mask them out */
+               spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+            }
+            break;
+
+         case PIPE_FORMAT_Z32_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 32-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            }
+            /* No stencil, so can't do anything there */
+            break;
+
+         case PIPE_FORMAT_Z16_UNORM:
+            if (dsa->depth.enabled) {
+               /* XXX Not sure this is correct, but it was here before, so we're
+                * going with it for now
+                */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 16-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+            }
+            /* No stencil */
+            break;
+
+         default:
+            ASSERT(0); /* invalid format */
+      }
+
+      /* If stencil is enabled, use the stencil-specific code
+       * generator to generate both the stencil and depth (if needed)
+       * tests.  Otherwise, if only depth is enabled, generate
+       * a quick depth test.  The test generators themselves will
+       * report back whether the depth/stencil buffer has to be
+       * written back.
+       */
+      if (dsa->stencil[0].enabled) {
+         /* This will perform the stencil and depth tests, and update
+          * the mask_reg, fbZ_reg, and fbS_reg as required by the
+          * tests.
+          */
+         ASSERT(fbS_reg_set);
+         ASSERT(fbZ_reg_set);
+         spe_comment(f, 0, "Perform stencil test");
+
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
+      }
+      else if (dsa->depth.enabled) {
+         int zmask_reg = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Perform depth test");
+         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_release_register(f, zmask_reg);
+      }
+      else {
+         write_depth_stencil = false;
+      }
+
+      if (write_depth_stencil) {
+         /* Merge latest Z and Stencil values into fbZS_reg.
+          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
+          * fbS_reg has four 8-bit Z values in bits [7..0].
+          */
+         spe_comment(f, 0, "Store quad's depth/stencil values in tile");
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            if (fbS_reg_set) {
+               spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+            else {
+               spe_move(f, fbZS_reg, fbZ_reg);
+            }
+         }
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+            if (fbS_reg_set) {
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+         }
+         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_S8_UNORM) {
+            ASSERT(0);   /* XXX to do */
+         }
+         else {
+            ASSERT(0); /* bad zs_format */
+         }
+
+         /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
+         spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+      }
+
+      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+      release_optional_register(f, &fbS_reg_set, fbS_reg);
+   }
+
+   /* Get framebuffer quad/colors.  We'll need these for blending,
+    * color masking, and to obey the quad/pixel mask.
+    * Load: fbRGBA_reg = memory[color_tile + quad_offset]
+    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
+    * we could skip this load.
+    */
+   spe_comment(f, 0, "Fetch quad colors from tile");
+   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
+
+   if (blend->blend_enable) {
+      spe_comment(f, 0, "Perform blending");
+      gen_blend(blend, blend_color, f, color_format,
+                fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
+   }
+
+   /*
+    * Write fragment colors to framebuffer/tile.
+    * This involves converting the fragment colors from float[4] to the
+    * tile's specific format and obeying the quad/pixel mask.
+    */
+   {
+      int rgba_reg = spe_allocate_available_register(f);
+
+      /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
+      gen_pack_colors(f, color_format,
+                      fragR_reg, fragG_reg, fragB_reg, fragA_reg,
+                      rgba_reg);
+
+      if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
+         gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
+      }
+
+      if (blend->colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
+         gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
+      }
+
+      /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
+       * if (mask[i])
+       *    rgba[i] = rgba[i];
+       * else
+       *    rgba[i] = framebuffer[i];
+       */
+      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
+
+      /* Store updated quad in tile:
+       * memory[color_tile + quad_offset] = rgba_reg;
+       */
+      spe_comment(f, 0, "Store quad colors into color tile");
+      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
+
+      spe_release_register(f, rgba_reg);
+   }
+
+   //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
+
+   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
+
+   spe_release_register(f, fbRGBA_reg);
+   spe_release_register(f, fbZS_reg);
+   spe_release_register(f, quad_offset_reg);
+
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_comment(f, -4, "End per-fragment ops");
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
new file mode 100644
index 0000000000..b59de198dc
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_GEN_FRAGMENT_H
+#define CELL_GEN_FRAGMENT_H
+
+
+extern void
+cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+
+
+#endif /* CELL_GEN_FRAGMENT_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
new file mode 100644
index 0000000000..8c55b8e093
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -0,0 +1,352 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/* Authors:
+ *  Keith Whitwell <keith@tungstengraphics.com>
+ *  Brian Paul
+ */
+
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "draw/draw_context.h"
+#include "cell_context.h"
+#include "cell_flush.h"
+#include "cell_state.h"
+#include "cell_texture.h"
+
+
+
+static void *
+cell_create_blend_state(struct pipe_context *pipe,
+                        const struct pipe_blend_state *blend)
+{
+   return mem_dup(blend, sizeof(*blend));
+}
+
+
+static void
+cell_bind_blend_state(struct pipe_context *pipe, void *blend)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   draw_flush(cell->draw);
+
+   cell->blend = (struct pipe_blend_state *) blend;
+   cell->dirty |= CELL_NEW_BLEND;
+}
+
+
+static void
+cell_delete_blend_state(struct pipe_context *pipe, void *blend)
+{
+   FREE(blend);
+}
+
+
+static void
+cell_set_blend_color(struct pipe_context *pipe,
+                     const struct pipe_blend_color *blend_color)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   draw_flush(cell->draw);
+
+   cell->blend_color = *blend_color;
+
+   cell->dirty |= CELL_NEW_BLEND;
+}
+
+
+
+
+static void *
+cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
+                 const struct pipe_depth_stencil_alpha_state *dsa)
+{
+   return mem_dup(dsa, sizeof(*dsa));
+}
+
+
+static void
+cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
+                                    void *dsa)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   draw_flush(cell->draw);
+
+   cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa;
+   cell->dirty |= CELL_NEW_DEPTH_STENCIL;
+}
+
+
+static void
+cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa)
+{
+   FREE(dsa);
+}
+
+
+static void
+cell_set_clip_state(struct pipe_context *pipe,
+                    const struct pipe_clip_state *clip)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   /* pass the clip state to the draw module */
+   draw_set_clip_state(cell->draw, clip);
+}
+
+
+
+/* Called when driver state tracker notices changes to the viewport
+ * matrix:
+ */
+static void
+cell_set_viewport_state( struct pipe_context *pipe,
+                         const struct pipe_viewport_state *viewport )
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   cell->viewport = *viewport; /* struct copy */
+   cell->dirty |= CELL_NEW_VIEWPORT;
+
+   /* pass the viewport info to the draw module */
+   draw_set_viewport_state(cell->draw, viewport);
+
+   /* Using tnl/ and vf/ modules is temporary while getting started.
+    * Full pipe will have vertex shader, vertex fetch of its own.
+    */
+}
+
+
+static void
+cell_set_scissor_state( struct pipe_context *pipe,
+                        const struct pipe_scissor_state *scissor )
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   memcpy( &cell->scissor, scissor, sizeof(*scissor) );
+   cell->dirty |= CELL_NEW_SCISSOR;
+}
+
+
+static void
+cell_set_polygon_stipple( struct pipe_context *pipe,
+                          const struct pipe_poly_stipple *stipple )
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   memcpy( &cell->poly_stipple, stipple, sizeof(*stipple) );
+   cell->dirty |= CELL_NEW_STIPPLE;
+}
+
+
+
+static void *
+cell_create_rasterizer_state(struct pipe_context *pipe,
+                             const struct pipe_rasterizer_state *rasterizer)
+{
+   return mem_dup(rasterizer, sizeof(*rasterizer));
+}
+
+
+static void
+cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast)
+{
+   struct pipe_rasterizer_state *rasterizer =
+      (struct pipe_rasterizer_state *) rast;
+   struct cell_context *cell = cell_context(pipe);
+
+   /* pass-through to draw module */
+   draw_set_rasterizer_state(cell->draw, rasterizer);
+
+   cell->rasterizer = rasterizer;
+
+   cell->dirty |= CELL_NEW_RASTERIZER;
+}
+
+
+static void
+cell_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer)
+{
+   FREE(rasterizer);
+}
+
+
+
+static void *
+cell_create_sampler_state(struct pipe_context *pipe,
+                          const struct pipe_sampler_state *sampler)
+{
+   return mem_dup(sampler, sizeof(*sampler));
+}
+
+
+static void
+cell_bind_sampler_states(struct pipe_context *pipe,
+                         unsigned num, void **samplers)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   assert(num <= CELL_MAX_SAMPLERS);
+
+   draw_flush(cell->draw);
+
+   memcpy(cell->sampler, samplers, num * sizeof(void *));
+   memset(&cell->sampler[num], 0, (CELL_MAX_SAMPLERS - num) *
+          sizeof(void *));
+   cell->num_samplers = num;
+
+   cell->dirty |= CELL_NEW_SAMPLER;
+}
+
+
+static void
+cell_delete_sampler_state(struct pipe_context *pipe,
+                              void *sampler)
+{
+   FREE( sampler );
+}
+
+
+
+static void
+cell_set_sampler_textures(struct pipe_context *pipe,
+                          unsigned num, struct pipe_texture **texture)
+{
+   struct cell_context *cell = cell_context(pipe);
+   uint i;
+
+   assert(num <= CELL_MAX_SAMPLERS);
+
+   /* Check for no-op */
+   if (num == cell->num_textures &&
+       !memcmp(cell->texture, texture, num * sizeof(struct pipe_texture *)))
+      return;
+
+   draw_flush(cell->draw);
+
+   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+      struct pipe_texture *tex = i < num ? texture[i] : NULL;
+
+      pipe_texture_reference((struct pipe_texture **) &cell->texture[i], tex);
+   }
+   cell->num_textures = num;
+
+   cell_update_texture_mapping(cell);
+
+   cell->dirty |= CELL_NEW_TEXTURE;
+}
+
+
+
+static void
+cell_set_framebuffer_state(struct pipe_context *pipe,
+                           const struct pipe_framebuffer_state *fb)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   if (1 /*memcmp(&cell->framebuffer, fb, sizeof(*fb))*/) {
+      struct pipe_surface *csurf = fb->cbufs[0];
+      struct pipe_surface *zsurf = fb->zsbuf;
+      uint i;
+      uint flags = (PIPE_BUFFER_USAGE_GPU_WRITE |
+                    PIPE_BUFFER_USAGE_GPU_READ);
+
+      /* unmap old surfaces */
+      for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+         if (cell->framebuffer.cbufs[i] && cell->cbuf_map[i]) {
+            pipe_surface_unmap(cell->framebuffer.cbufs[i]);
+            cell->cbuf_map[i] = NULL;
+         }
+      }
+
+      if (cell->framebuffer.zsbuf && cell->zsbuf_map) {
+         pipe_surface_unmap(cell->framebuffer.zsbuf);
+         cell->zsbuf_map = NULL;
+      }
+
+      /* Finish any pending rendering to the current surface before
+       * installing a new surface!
+       */
+      cell_flush_int(cell, CELL_FLUSH_WAIT);
+
+      /* update my state
+       * (this is also where old surfaces will finally get freed)
+       */
+      cell->framebuffer.width = fb->width;
+      cell->framebuffer.height = fb->height;
+      cell->framebuffer.num_cbufs = fb->num_cbufs;
+      for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+         pipe_surface_reference(&cell->framebuffer.cbufs[i], fb->cbufs[i]);
+      }
+      pipe_surface_reference(&cell->framebuffer.zsbuf, fb->zsbuf);
+
+      /* map new surfaces */
+      if (csurf)
+         cell->cbuf_map[0] = pipe_surface_map(csurf, flags);
+
+      if (zsurf)
+         cell->zsbuf_map = pipe_surface_map(zsurf, flags);
+
+      cell->dirty |= CELL_NEW_FRAMEBUFFER;
+   }
+}
+
+
+
+void
+cell_init_state_functions(struct cell_context *cell)
+{
+   cell->pipe.create_blend_state = cell_create_blend_state;
+   cell->pipe.bind_blend_state   = cell_bind_blend_state;
+   cell->pipe.delete_blend_state = cell_delete_blend_state;
+
+   cell->pipe.create_sampler_state = cell_create_sampler_state;
+   cell->pipe.bind_sampler_states = cell_bind_sampler_states;
+   cell->pipe.delete_sampler_state = cell_delete_sampler_state;
+
+   cell->pipe.set_sampler_textures = cell_set_sampler_textures;
+
+   cell->pipe.create_depth_stencil_alpha_state = cell_create_depth_stencil_alpha_state;
+   cell->pipe.bind_depth_stencil_alpha_state   = cell_bind_depth_stencil_alpha_state;
+   cell->pipe.delete_depth_stencil_alpha_state = cell_delete_depth_stencil_alpha_state;
+
+   cell->pipe.create_rasterizer_state = cell_create_rasterizer_state;
+   cell->pipe.bind_rasterizer_state   = cell_bind_rasterizer_state;
+   cell->pipe.delete_rasterizer_state = cell_delete_rasterizer_state;
+
+   cell->pipe.set_blend_color = cell_set_blend_color;
+   cell->pipe.set_clip_state = cell_set_clip_state;
+
+   cell->pipe.set_framebuffer_state = cell_set_framebuffer_state;
+
+   cell->pipe.set_polygon_stipple = cell_set_polygon_stipple;
+   cell->pipe.set_scissor_state = cell_set_scissor_state;
+   cell->pipe.set_viewport_state = cell_set_viewport_state;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.h b/src/gallium/drivers/cell/ppu/cell_pipe_state.h
new file mode 100644
index 0000000000..1889bd52ff
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.h
@@ -0,0 +1,39 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_PIPE_STATE_H
+#define CELL_PIPE_STATE_H
+
+
+struct cell_context;
+
+extern void
+cell_init_state_functions(struct cell_context *cell);
+
+
+#endif /* CELL_PIPE_STATE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
new file mode 100644
index 0000000000..79cb8df82f
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -0,0 +1,211 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Last stage of 'draw' pipeline: send tris to SPUs.
+ * \author  Brian Paul
+ */
+
+#include "cell_context.h"
+#include "cell_render.h"
+#include "cell_spu.h"
+#include "util/u_memory.h"
+#include "draw/draw_private.h"
+
+
+struct render_stage {
+   struct draw_stage stage; /**< This must be first (base class) */
+
+   struct cell_context *cell;
+};
+
+
+static INLINE struct render_stage *
+render_stage(struct draw_stage *stage)
+{
+   return (struct render_stage *) stage;
+}
+
+
+static void render_begin( struct draw_stage *stage )
+{
+#if 0
+   struct render_stage *render = render_stage(stage);
+   struct cell_context *sp = render->cell;
+   const struct pipe_shader_state *fs = &render->cell->fs->shader;
+   render->quad.nr_attrs = render->cell->nr_frag_attrs;
+
+   render->firstFpInput = fs->input_semantic_name[0];
+
+   sp->quad.first->begin(sp->quad.first);
+#endif
+}
+
+
+static void render_end( struct draw_stage *stage )
+{
+}
+
+
+static void reset_stipple_counter( struct draw_stage *stage )
+{
+   struct render_stage *render = render_stage(stage);
+   /*render->cell->line_stipple_counter = 0;*/
+}
+
+
+static void
+render_point(struct draw_stage *stage, struct prim_header *prim)
+{
+}
+
+
+static void
+render_line(struct draw_stage *stage, struct prim_header *prim)
+{
+}
+
+
+/** Write a vertex into the prim buffer */
+static void
+save_vertex(struct cell_prim_buffer *buf, uint pos,
+            const struct vertex_header *vert)
+{
+   uint attr, j;
+
+   for (attr = 0; attr < 2; attr++) {
+      for (j = 0; j < 4; j++) {
+         buf->vertex[pos][attr][j] = vert->data[attr][j];
+      }
+   }
+
+   /* update bounding box */
+   if (vert->data[0][0] < buf->xmin)
+      buf->xmin = vert->data[0][0];
+   if (vert->data[0][0] > buf->xmax)
+      buf->xmax = vert->data[0][0];
+   if (vert->data[0][1] < buf->ymin)
+      buf->ymin = vert->data[0][1];
+   if (vert->data[0][1] > buf->ymax)
+      buf->ymax = vert->data[0][1];
+}
+
+
+static void
+render_tri(struct draw_stage *stage, struct prim_header *prim)
+{
+   struct render_stage *rs = render_stage(stage);
+   struct cell_context *cell = rs->cell;
+   struct cell_prim_buffer *buf = &cell->prim_buffer;
+   uint i;
+
+   if (buf->num_verts + 3 > CELL_MAX_VERTS) {
+      cell_flush_prim_buffer(cell);
+   }
+
+   i = buf->num_verts;
+   assert(i+2 <= CELL_MAX_VERTS);
+   save_vertex(buf, i+0, prim->v[0]);
+   save_vertex(buf, i+1, prim->v[1]);
+   save_vertex(buf, i+2, prim->v[2]);
+   buf->num_verts += 3;
+}
+
+
+/**
+ * Send the a RENDER command to all SPUs to have them render the prims
+ * in the current prim_buffer.
+ */
+void
+cell_flush_prim_buffer(struct cell_context *cell)
+{
+   uint i;
+
+   if (cell->prim_buffer.num_verts == 0)
+      return;
+
+   for (i = 0; i < cell->num_spus; i++) {
+      struct cell_command_render *render = &cell_global.command[i].render;
+      render->prim_type = PIPE_PRIM_TRIANGLES;
+      render->num_verts = cell->prim_buffer.num_verts;
+      render->front_winding = cell->rasterizer->front_winding;
+      render->vertex_size = cell->vertex_info->size * 4;
+      render->xmin = cell->prim_buffer.xmin;
+      render->ymin = cell->prim_buffer.ymin;
+      render->xmax = cell->prim_buffer.xmax;
+      render->ymax = cell->prim_buffer.ymax;
+      render->vertex_data = &cell->prim_buffer.vertex;
+      ASSERT_ALIGN16(render->vertex_data);
+      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_RENDER);
+   }
+
+   cell->prim_buffer.num_verts = 0;
+
+   cell->prim_buffer.xmin = 1e100;
+   cell->prim_buffer.ymin = 1e100;
+   cell->prim_buffer.xmax = -1e100;
+   cell->prim_buffer.ymax = -1e100;
+
+   /* XXX temporary, need to double-buffer the prim buffer until we get
+    * a real command buffer/list system.
+    */
+   cell_flush(&cell->pipe, 0x0);
+}
+
+
+
+static void render_destroy( struct draw_stage *stage )
+{
+   FREE( stage );
+}
+
+
+/**
+ * Create a new draw/render stage.  This will be plugged into the
+ * draw module as the last pipeline stage.
+ */
+struct draw_stage *cell_draw_render_stage( struct cell_context *cell )
+{
+   struct render_stage *render = CALLOC_STRUCT(render_stage);
+
+   render->cell = cell;
+   render->stage.draw = cell->draw;
+   render->stage.begin = render_begin;
+   render->stage.point = render_point;
+   render->stage.line = render_line;
+   render->stage.tri = render_tri;
+   render->stage.end = render_end;
+   render->stage.reset_stipple_counter = reset_stipple_counter;
+   render->stage.destroy = render_destroy;
+
+   /*
+   render->quad.coef = render->coef;
+   render->quad.posCoef = &render->posCoef;
+   */
+
+   return &render->stage;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_render.h b/src/gallium/drivers/cell/ppu/cell_render.h
new file mode 100644
index 0000000000..826dcbafeb
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_render.h
@@ -0,0 +1,39 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_RENDER_H
+#define CELL_RENDER_H
+
+struct cell_context;
+struct draw_stage;
+
+extern void
+cell_flush_prim_buffer(struct cell_context *cell);
+
+extern struct draw_stage *cell_draw_render_stage( struct cell_context *cell );
+
+#endif /* CELL_RENDER_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
new file mode 100644
index 0000000000..47ba6fa290
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -0,0 +1,170 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+
+#include "cell/common.h"
+#include "cell_screen.h"
+#include "cell_texture.h"
+#include "cell_winsys.h"
+
+
+static const char *
+cell_get_vendor(struct pipe_screen *screen)
+{
+   return "Tungsten Graphics, Inc.";
+}
+
+
+static const char *
+cell_get_name(struct pipe_screen *screen)
+{
+   return "Cell";
+}
+
+
+static int
+cell_get_param(struct pipe_screen *screen, int param)
+{
+   switch (param) {
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+      return CELL_MAX_SAMPLERS;
+   case PIPE_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+      return 1;
+   case PIPE_CAP_GLSL:
+      return 1;
+   case PIPE_CAP_S3TC:
+      return 0;
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+      return 0;
+   case PIPE_CAP_POINT_SPRITE:
+      return 1;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return 1;
+   case PIPE_CAP_OCCLUSION_QUERY:
+      return 1;
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+      return 10;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+      return 12; /* max 2Kx2K */
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 8;  /* max 128x128x128 */
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return 12; /* max 2Kx2K */
+   default:
+      return 10;
+   }
+}
+
+
+static float
+cell_get_paramf(struct pipe_screen *screen, int param)
+{
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+      return 255.0; /* arbitrary */
+
+   case PIPE_CAP_MAX_POINT_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      return 255.0; /* arbitrary */
+
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+      return 0.0;
+
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      return 16.0; /* arbitrary */
+
+   default:
+      return 10;
+   }
+}
+
+
+static boolean
+cell_is_format_supported( struct pipe_screen *screen,
+                          enum pipe_format format, 
+                          enum pipe_texture_target target,
+                          unsigned tex_usage, 
+                          unsigned geom_flags )
+{
+   /* cell supports most formats, XXX for now anyway */
+   if (format == PIPE_FORMAT_DXT5_RGBA ||
+       format == PIPE_FORMAT_R8G8B8A8_SRGB)
+      return FALSE;
+   else
+      return TRUE;
+}
+
+
+static void
+cell_destroy_screen( struct pipe_screen *screen )
+{
+   struct pipe_winsys *winsys = screen->winsys;
+
+   if(winsys->destroy)
+      winsys->destroy(winsys);
+
+   FREE(screen);
+}
+
+
+/**
+ * Create a new pipe_screen object
+ * Note: we're not presently subclassing pipe_screen (no cell_screen) but
+ * that would be the place to put SPU thread/context info...
+ */
+struct pipe_screen *
+cell_create_screen(struct pipe_winsys *winsys)
+{
+   struct pipe_screen *screen = CALLOC_STRUCT(pipe_screen);
+
+   if (!screen)
+      return NULL;
+
+   screen->winsys = winsys;
+
+   screen->destroy = cell_destroy_screen;
+
+   screen->get_name = cell_get_name;
+   screen->get_vendor = cell_get_vendor;
+   screen->get_param = cell_get_param;
+   screen->get_paramf = cell_get_paramf;
+   screen->is_format_supported = cell_is_format_supported;
+
+   cell_init_screen_texture_funcs(screen);
+
+   return screen;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.h b/src/gallium/drivers/cell/ppu/cell_screen.h
new file mode 100644
index 0000000000..c7e15889d6
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_screen.h
@@ -0,0 +1,41 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_SCREEN_H
+#define CELL_SCREEN_H
+
+
+struct pipe_screen;
+struct pipe_winsys;
+
+
+extern struct pipe_screen *
+cell_create_screen(struct pipe_winsys *winsys);
+
+
+#endif /* CELL_SCREEN_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
new file mode 100644
index 0000000000..df020c4146
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -0,0 +1,191 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Utility/wrappers for communicating with the SPUs.
+ */
+
+
+#include <pthread.h>
+
+#include "cell_spu.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+#include "util/u_memory.h"
+#include "cell/common.h"
+
+
+/*
+helpful headers:
+/opt/ibm/cell-sdk/prototype/src/include/ppu/cbe_mfc.h
+*/
+
+
+/**
+ * Cell/SPU info that's not per-context.
+ */
+struct cell_global_info cell_global;
+
+
+/**
+ * Write a 1-word message to the given SPE mailbox.
+ */
+void
+send_mbox_message(spe_context_ptr_t ctx, unsigned int msg)
+{
+   spe_in_mbox_write(ctx, &msg, 1, SPE_MBOX_ALL_BLOCKING);
+}
+
+
+/**
+ * Wait for a 1-word message to arrive in given mailbox.
+ */
+uint
+wait_mbox_message(spe_context_ptr_t ctx)
+{
+   do {
+      unsigned data;
+      int count = spe_out_mbox_read(ctx, &data, 1);
+
+      if (count == 1) {
+	 return data;
+      }
+      
+      if (count < 0) {
+	 /* error */ ;
+      }
+   } while (1);
+}
+
+
+/**
+ * Called by pthread_create() to spawn an SPU thread.
+ */
+static void *
+cell_thread_function(void *arg)
+{
+   struct cell_init_info *init = (struct cell_init_info *) arg;
+   unsigned entry = SPE_DEFAULT_ENTRY;
+
+   ASSERT_ALIGN16(init);
+
+   if (spe_context_run(cell_global.spe_contexts[init->id], &entry, 0,
+                       init, NULL, NULL) < 0) {
+      fprintf(stderr, "spe_context_run() failed\n");
+      exit(1);
+   }
+
+   pthread_exit(NULL);
+}
+
+
+/**
+ * Create the SPU threads.  This is done once during driver initialization.
+ * This involves setting the the "init" message which is sent to each SPU.
+ * The init message specifies an SPU id, total number of SPUs, location
+ * and number of batch buffers, etc.
+ */
+void
+cell_start_spus(struct cell_context *cell)
+{
+   static boolean one_time_init = FALSE;
+   uint i, j;
+
+   if (one_time_init) {
+      fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
+	      "on Cell.\n");
+      abort();
+   }
+
+   one_time_init = TRUE;
+
+   assert(cell->num_spus <= MAX_SPUS);
+
+   ASSERT_ALIGN16(&cell_global.command[0]);
+   ASSERT_ALIGN16(&cell_global.command[1]);
+
+   ASSERT_ALIGN16(&cell_global.inits[0]);
+   ASSERT_ALIGN16(&cell_global.inits[1]);
+
+   /*
+    * Initialize the global 'inits' structure for each SPU.
+    * A pointer to the init struct will be passed to each SPU.
+    * The SPUs will then each grab their init info with mfc_get().
+    */
+   for (i = 0; i < cell->num_spus; i++) {
+      cell_global.inits[i].id = i;
+      cell_global.inits[i].num_spus = cell->num_spus;
+      cell_global.inits[i].debug_flags = cell->debug_flags;
+      cell_global.inits[i].cmd = &cell_global.command[i];
+      for (j = 0; j < CELL_NUM_BUFFERS; j++) {
+         cell_global.inits[i].buffers[j] = cell->buffer[j];
+      }
+      cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
+
+      cell_global.inits[i].spu_functions = &cell->spu_functions;
+
+      cell_global.spe_contexts[i] = spe_context_create(0, NULL);
+      if (!cell_global.spe_contexts[i]) {
+         fprintf(stderr, "spe_context_create() failed\n");
+         exit(1);
+      }
+
+      if (spe_program_load(cell_global.spe_contexts[i], &g3d_spu)) {
+         fprintf(stderr, "spe_program_load() failed\n");
+         exit(1);
+      }
+      
+      pthread_create(&cell_global.spe_threads[i], /* returned thread handle */
+                     NULL,                        /* pthread attribs */
+                     &cell_thread_function,       /* start routine */
+		     &cell_global.inits[i]);      /* thread argument */
+   }
+}
+
+
+/**
+ * Tell all the SPUs to stop/exit.
+ * This is done when the driver's exiting / cleaning up.
+ */
+void
+cell_spu_exit(struct cell_context *cell)
+{
+   uint i;
+
+   for (i = 0; i < cell->num_spus; i++) {
+      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_EXIT);
+   }
+
+   /* wait for threads to exit */
+   for (i = 0; i < cell->num_spus; i++) {
+      void *value;
+      pthread_join(cell_global.spe_threads[i], &value);
+      cell_global.spe_threads[i] = 0;
+      cell_global.spe_contexts[i] = 0;
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
new file mode 100644
index 0000000000..137f26612e
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -0,0 +1,82 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_SPU
+#define CELL_SPU
+
+
+#include <libspe2.h>
+#include <libmisc.h>
+#include "cell/common.h"
+
+#include "cell_context.h"
+
+
+#define MAX_SPUS 8
+
+/**
+ * Global vars, for now anyway.
+ */
+struct cell_global_info
+{
+   /**
+    * SPU/SPE handles, etc
+    */
+   spe_context_ptr_t spe_contexts[MAX_SPUS];
+   pthread_t spe_threads[MAX_SPUS];
+
+   /**
+    * Data sent to SPUs
+    */
+   struct cell_init_info inits[MAX_SPUS];
+   struct cell_command command[MAX_SPUS];
+};
+
+
+extern struct cell_global_info cell_global;
+
+
+/** This is the handle for the actual SPE code */
+extern spe_program_handle_t g3d_spu;
+
+
+extern void
+send_mbox_message(spe_context_ptr_t ctx, unsigned int msg);
+
+extern uint
+wait_mbox_message(spe_context_ptr_t ctx);
+
+
+extern void
+cell_start_spus(struct cell_context *cell);
+
+
+extern void
+cell_spu_exit(struct cell_context *cell);
+
+
+#endif /* CELL_SPU */
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
new file mode 100644
index 0000000000..b193170f9c
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -0,0 +1,65 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_STATE_H
+#define CELL_STATE_H
+
+
+#define CELL_NEW_VIEWPORT      0x1
+#define CELL_NEW_RASTERIZER    0x2
+#define CELL_NEW_FS            0x4
+#define CELL_NEW_BLEND         0x8
+#define CELL_NEW_CLIP          0x10
+#define CELL_NEW_SCISSOR       0x20
+#define CELL_NEW_STIPPLE       0x40
+#define CELL_NEW_FRAMEBUFFER   0x80
+#define CELL_NEW_ALPHA_TEST    0x100
+#define CELL_NEW_DEPTH_STENCIL 0x200
+#define CELL_NEW_SAMPLER       0x400
+#define CELL_NEW_TEXTURE       0x800
+#define CELL_NEW_VERTEX        0x1000
+#define CELL_NEW_VS            0x2000
+#define CELL_NEW_VS_CONSTANTS  0x4000
+#define CELL_NEW_FS_CONSTANTS  0x8000
+#define CELL_NEW_VERTEX_INFO   0x10000
+
+
+extern void
+cell_update_derived( struct cell_context *softpipe );
+
+
+extern void
+cell_init_shader_functions(struct cell_context *cell);
+
+
+extern void
+cell_init_vertex_functions(struct cell_context *cell);
+
+
+#endif /* CELL_STATE_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
new file mode 100644
index 0000000000..efc4f78364
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -0,0 +1,170 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_state.h"
+#include "cell_state_emit.h"
+
+
+/**
+ * Determine how to map vertex program outputs to fragment program inputs.
+ * Basically, this will be used when computing the triangle interpolation
+ * coefficients from the post-transform vertex attributes.
+ */
+static void
+calculate_vertex_layout( struct cell_context *cell )
+{
+   const struct cell_fragment_shader_state *fs = cell->fs;
+   const enum interp_mode colorInterp
+      = cell->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+   struct vertex_info *vinfo = &cell->vertex_info;
+   uint i;
+   int src;
+
+#if 0
+   if (cell->vbuf) {
+      /* if using the post-transform vertex buffer, tell draw_vbuf to
+       * simply emit the whole post-xform vertex as-is:
+       */
+      struct vertex_info *vinfo_vbuf = &cell->vertex_info_vbuf;
+      vinfo_vbuf->num_attribs = 0;
+      draw_emit_vertex_attr(vinfo_vbuf, EMIT_ALL, INTERP_NONE, 0);
+      vinfo_vbuf->size = 4 * vs->num_outputs + sizeof(struct vertex_header)/4;
+   }
+#endif
+
+   /* reset vinfo */
+   vinfo->num_attribs = 0;
+
+   /* we always want to emit vertex pos */
+   src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_POSITION, 0);
+   assert(src >= 0);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src);
+
+
+   /*
+    * Loop over fragment shader inputs, searching for the matching output
+    * from the vertex shader.
+    */
+   for (i = 0; i < fs->info.num_inputs; i++) {
+      switch (fs->info.input_semantic_name[i]) {
+      case TGSI_SEMANTIC_POSITION:
+         /* already done above */
+         break;
+
+      case TGSI_SEMANTIC_COLOR:
+         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_COLOR, 
+                                   fs->info.input_semantic_index[i]);
+         assert(src >= 0);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         break;
+
+      case TGSI_SEMANTIC_FOG:
+         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_FOG, 0);
+#if 1
+         if (src < 0) /* XXX temp hack, try demos/fogcoord.c with this */
+            src = 0;
+#endif
+         assert(src >= 0);
+         draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+         break;
+
+      case TGSI_SEMANTIC_GENERIC:
+         /* this includes texcoords and varying vars */
+         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_GENERIC,
+                              fs->info.input_semantic_index[i]);
+         assert(src >= 0);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   draw_compute_vertex_size(vinfo);
+
+   /* XXX only signal this if format really changes */
+   cell->dirty |= CELL_NEW_VERTEX_INFO;
+}
+
+
+#if 0
+/**
+ * Recompute cliprect from scissor bounds, scissor enable and surface size.
+ */
+static void
+compute_cliprect(struct cell_context *sp)
+{
+   uint surfWidth = sp->framebuffer.width;
+   uint surfHeight = sp->framebuffer.height;
+
+   if (sp->rasterizer->scissor) {
+      /* clip to scissor rect */
+      sp->cliprect.minx = MAX2(sp->scissor.minx, 0);
+      sp->cliprect.miny = MAX2(sp->scissor.miny, 0);
+      sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth);
+      sp->cliprect.maxy = MIN2(sp->scissor.maxy, surfHeight);
+   }
+   else {
+      /* clip to surface bounds */
+      sp->cliprect.minx = 0;
+      sp->cliprect.miny = 0;
+      sp->cliprect.maxx = surfWidth;
+      sp->cliprect.maxy = surfHeight;
+   }
+}
+#endif
+
+
+
+/**
+ * Update derived state, send current state to SPUs prior to rendering.
+ */
+void cell_update_derived( struct cell_context *cell )
+{
+   if (cell->dirty & (CELL_NEW_RASTERIZER |
+                      CELL_NEW_FS |
+                      CELL_NEW_VS))
+      calculate_vertex_layout( cell );
+
+#if 0
+   if (cell->dirty & (CELL_NEW_SCISSOR |
+                      CELL_NEW_DEPTH_STENCIL_ALPHA |
+                      CELL_NEW_FRAMEBUFFER))
+      compute_cliprect(cell);
+#endif
+
+   cell_emit_state(cell);
+
+   cell->dirty = 0;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
new file mode 100644
index 0000000000..cbfa393cfb
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -0,0 +1,247 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "cell_context.h"
+#include "cell_gen_fragment.h"
+#include "cell_state.h"
+#include "cell_state_emit.h"
+#include "cell_batch.h"
+#include "cell_texture.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+
+
+/**
+ * Find/create a cell_command_fragment_ops object corresponding to the
+ * current blend/stencil/z/colormask/etc. state.
+ */
+static struct cell_command_fragment_ops *
+lookup_fragment_ops(struct cell_context *cell)
+{
+   struct cell_fragment_ops_key key;
+   struct cell_command_fragment_ops *ops;
+
+   /*
+    * Build key
+    */
+   memset(&key, 0, sizeof(key));
+   key.blend = *cell->blend;
+   key.dsa = *cell->depth_stencil;
+
+   if (cell->framebuffer.cbufs[0])
+      key.color_format = cell->framebuffer.cbufs[0]->format;
+   else
+      key.color_format = PIPE_FORMAT_NONE;
+
+   if (cell->framebuffer.zsbuf)
+      key.zs_format = cell->framebuffer.zsbuf->format;
+   else
+      key.zs_format = PIPE_FORMAT_NONE;
+
+   /*
+    * Look up key in cache.
+    */
+   ops = (struct cell_command_fragment_ops *)
+      util_keymap_lookup(cell->fragment_ops_cache, &key);
+
+   /*
+    * If not found, create/save new fragment ops command.
+    */
+   if (!ops) {
+      struct spe_function spe_code;
+
+      if (0)
+         debug_printf("**** Create New Fragment Ops\n");
+
+      /* Prepare the buffer that will hold the generated code. */
+      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+      /* generate new code */
+      cell_gen_fragment_function(cell, &spe_code);
+
+      /* alloc new fragment ops command */
+      ops = CALLOC_STRUCT(cell_command_fragment_ops);
+
+      /* populate the new cell_command_fragment_ops object */
+      ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      memcpy(ops->code, spe_code.store, spe_code_size(&spe_code));
+      ops->dsa = *cell->depth_stencil;
+      ops->blend = *cell->blend;
+
+      /* insert cell_command_fragment_ops object into keymap/cache */
+      util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
+
+      /* release rtasm buffer */
+      spe_release_func(&spe_code);
+   }
+   else {
+      if (0)
+         debug_printf("**** Re-use Fragment Ops\n");
+   }
+
+   return ops;
+}
+
+
+
+static void
+emit_state_cmd(struct cell_context *cell, uint cmd,
+               const void *state, uint state_size)
+{
+   uint64_t *dst = (uint64_t *) 
+       cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size));
+   *dst = cmd;
+   memcpy(dst + 1, state, state_size);
+}
+
+
+/**
+ * For state marked as 'dirty', construct a state-update command block
+ * and insert it into the current batch buffer.
+ */
+void
+cell_emit_state(struct cell_context *cell)
+{
+   if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
+      struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
+      struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
+      struct cell_command_framebuffer *fb
+         = cell_batch_alloc(cell, sizeof(*fb));
+      fb->opcode = CELL_CMD_STATE_FRAMEBUFFER;
+      fb->color_start = cell->cbuf_map[0];
+      fb->color_format = cbuf->format;
+      fb->depth_start = cell->zsbuf_map;
+      fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE;
+      fb->width = cell->framebuffer.width;
+      fb->height = cell->framebuffer.height;
+#if 0
+      printf("EMIT color format %s\n", pf_name(fb->color_format));
+      printf("EMIT depth format %s\n", pf_name(fb->depth_format));
+#endif
+   }
+
+   if (cell->dirty & (CELL_NEW_FS)) {
+      /* Send new fragment program to SPUs */
+      struct cell_command_fragment_program *fp
+            = cell_batch_alloc(cell, sizeof(*fp));
+      fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM;
+      fp->num_inst = cell->fs->code.num_inst;
+      memcpy(&fp->code, cell->fs->code.store,
+             SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+      if (0) {
+         int i;
+         printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n");
+         for (i = 0; i < fp->num_inst; i++) {
+            printf(" %3d: 0x%08x\n", i, fp->code[i]);
+         }
+      }
+   }
+
+   if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+      const uint shader = PIPE_SHADER_FRAGMENT;
+      const uint num_const = cell->constants[shader].size / sizeof(float);
+      uint i, j;
+      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+      uint64_t *ibuf = (uint64_t *) buf;
+      const float *constants = pipe_buffer_map(cell->pipe.screen,
+                                               cell->constants[shader].buffer,
+                                               PIPE_BUFFER_USAGE_CPU_READ);
+      ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+      ibuf[1] = num_const;
+      j = 4;
+      for (i = 0; i < num_const; i++) {
+         buf[j++] = constants[i];
+      }
+      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+   }
+
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
+                      CELL_NEW_DEPTH_STENCIL |
+                      CELL_NEW_BLEND)) {
+      struct cell_command_fragment_ops *fops, *fops_cmd;
+      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+      fops = lookup_fragment_ops(cell);
+      memcpy(fops_cmd, fops, sizeof(*fops));
+   }
+
+   if (cell->dirty & CELL_NEW_SAMPLER) {
+      uint i;
+      for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+         if (cell->sampler[i]) {
+            struct cell_command_sampler *sampler
+               = cell_batch_alloc(cell, sizeof(*sampler));
+            sampler->opcode = CELL_CMD_STATE_SAMPLER;
+            sampler->unit = i;
+            sampler->state = *cell->sampler[i];
+         }
+      }
+   }
+
+   if (cell->dirty & CELL_NEW_TEXTURE) {
+      uint i;
+      for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
+         struct cell_command_texture *texture
+            =  cell_batch_alloc(cell, sizeof(*texture));
+         texture->opcode = CELL_CMD_STATE_TEXTURE;
+         texture->unit = i;
+         if (cell->texture[i]) {
+            texture->start = cell->texture[i]->tiled_data;
+            texture->width = cell->texture[i]->base.width[0];
+            texture->height = cell->texture[i]->base.height[0];
+         }
+         else {
+            texture->start = NULL;
+            texture->width = 1;
+            texture->height = 1;
+         }
+      }
+   }
+
+   if (cell->dirty & CELL_NEW_VERTEX_INFO) {
+      emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
+                     &cell->vertex_info, sizeof(struct vertex_info));
+   }
+
+#if 0
+   if (cell->dirty & CELL_NEW_VS) {
+      const struct draw_context *const draw = cell->draw;
+      struct cell_shader_info info;
+
+      info.num_outputs = draw_num_vs_outputs(draw);
+      info.declarations = (uintptr_t) draw->vs.machine.Declarations;
+      info.num_declarations = draw->vs.machine.NumDeclarations;
+      info.instructions = (uintptr_t) draw->vs.machine.Instructions;
+      info.num_instructions = draw->vs.machine.NumInstructions;
+      info.immediates = (uintptr_t) draw->vs.machine.Imms;
+      info.num_immediates = draw->vs.machine.ImmLimit / 4;
+
+      emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info));
+   }
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.h b/src/gallium/drivers/cell/ppu/cell_state_emit.h
new file mode 100644
index 0000000000..59f8affe8d
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.h
@@ -0,0 +1,36 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_STATE_EMIT_H
+#define CELL_STATE_EMIT_H
+
+
+extern void
+cell_emit_state(struct cell_context *cell);
+
+
+#endif /* CELL_STATE_EMIT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
new file mode 100644
index 0000000000..78cb446c14
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -0,0 +1,1430 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ * Generate code to perform all per-fragment operations.
+ *
+ * Code generated by these functions perform both alpha, depth, and stencil
+ * testing as well as alpha blending.
+ *
+ * \note
+ * Occlusion query is not supported, but this is the right place to add that
+ * support.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "cell_context.h"
+
+#include "rtasm/rtasm_ppc_spe.h"
+
+
+/**
+ * Generate code to perform alpha testing.
+ *
+ * The code generated by this function uses the register specificed by
+ * \c mask as both an input and an output.
+ *
+ * \param dsa    Current alpha-test state
+ * \param f      Function to which code should be appended
+ * \param mask   Index of register containing active fragment mask
+ * \param alphas Index of register containing per-fragment alpha values
+ *
+ * \note Emits a maximum of 6 instructions.
+ */
+static void
+emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
+                struct spe_function *f, int mask, int alphas)
+{
+   /* If the alpha function is either NEVER or ALWAYS, there is no need to
+    * load the reference value into a register.  ALWAYS is a fairly common
+    * case, and this optimization saves 2 instructions.
+    */
+   if (dsa->alpha.enabled
+       && (dsa->alpha.func != PIPE_FUNC_NEVER)
+       && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      int ref = spe_allocate_available_register(f);
+      int tmp_a = spe_allocate_available_register(f);
+      int tmp_b = spe_allocate_available_register(f);
+      union {
+         float f;
+         unsigned u;
+      } ref_val;
+      boolean complement = FALSE;
+
+      ref_val.f = dsa->alpha.ref;
+
+      spe_il(f, ref, ref_val.u & 0x0000ffff);
+      spe_ilh(f, ref, ref_val.u >> 16);
+
+      switch (dsa->alpha.func) {
+      case PIPE_FUNC_NOTEQUAL:
+         complement = TRUE;
+         /* FALLTHROUGH */
+
+      case PIPE_FUNC_EQUAL:
+         spe_fceq(f, tmp_a, ref, alphas);
+         break;
+
+      case PIPE_FUNC_LEQUAL:
+         complement = TRUE;
+         /* FALLTHROUGH */
+
+      case PIPE_FUNC_GREATER:
+         spe_fcgt(f, tmp_a, ref, alphas);
+         break;
+
+      case PIPE_FUNC_LESS:
+         complement = TRUE;
+         /* FALLTHROUGH */
+
+      case PIPE_FUNC_GEQUAL:
+         spe_fcgt(f, tmp_a, ref, alphas);
+         spe_fceq(f, tmp_b, ref, alphas);
+         spe_or(f, tmp_a, tmp_b, tmp_a);
+         break;
+
+      case PIPE_FUNC_ALWAYS:
+      case PIPE_FUNC_NEVER:
+      default:
+         assert(0);
+         break;
+      }
+
+      if (complement) {
+         spe_andc(f, mask, mask, tmp_a);
+      } else {
+         spe_and(f, mask, mask, tmp_a);
+      }
+
+      spe_release_register(f, ref);
+      spe_release_register(f, tmp_a);
+      spe_release_register(f, tmp_b);
+   } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) {
+      spe_il(f, mask, 0);
+   }
+}
+
+
+/**
+ * Generate code to perform Z testing.  Four Z values are tested at once.
+ * \param dsa        Current depth-test state
+ * \param f          Function to which code should be appended
+ * \param mask       Index of register to contain depth-pass mask
+ * \param stored     Index of register containing values from depth buffer
+ * \param calculated Index of register containing per-fragment depth values
+ *
+ * \return
+ * If the calculated depth comparison mask is the actual mask, \c FALSE is
+ * returned.  If the calculated depth comparison mask is the compliment of
+ * the actual mask, \c TRUE is returned.
+ *
+ * \note Emits a maximum of 3 instructions.
+ */
+static boolean
+emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
+                struct spe_function *f, int mask, int stored, int calculated)
+{
+   unsigned func = (dsa->depth.enabled)
+       ? dsa->depth.func : PIPE_FUNC_ALWAYS;
+   int tmp = spe_allocate_available_register(f);
+   boolean compliment = FALSE;
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask, 0);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      compliment = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_EQUAL:
+      spe_ceq(f, mask, calculated, stored);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      compliment = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GREATER:
+      spe_clgt(f, mask, calculated, stored);
+      break;
+
+   case PIPE_FUNC_LESS:
+      compliment = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GEQUAL:
+      spe_clgt(f, mask, calculated, stored);
+      spe_ceq(f, tmp, calculated, stored);
+      spe_or(f, mask, mask, tmp);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      spe_il(f, mask, ~0);
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   spe_release_register(f, tmp);
+   return compliment;
+}
+
+
+/**
+ * Generate code to apply the stencil operation (after testing).
+ * \note Emits a maximum of 5 instructions.
+ *
+ * \warning
+ * Since \c out and \c in might be the same register, this routine cannot
+ * generate code that uses \c out as a temporary.
+ */
+static void
+emit_stencil_op(struct spe_function *f,
+                int out, int in, int mask, unsigned op, unsigned ref)
+{
+   const int clamp = spe_allocate_available_register(f);
+   const int clamp_mask = spe_allocate_available_register(f);
+   const int result = spe_allocate_available_register(f);
+
+   switch(op) {
+   case PIPE_STENCIL_OP_KEEP:
+      assert(0);
+   case PIPE_STENCIL_OP_ZERO:
+      spe_il(f, result, 0);
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      spe_il(f, result, ref);
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      /* clamp = [0xff, 0xff, 0xff, 0xff] */
+      spe_il(f, clamp, 0x0ff);
+      /* result[i] = in[i] + 1 */
+      spe_ai(f, result, in, 1);
+      /* clamp_mask[i] = (result[i] > 0xff) */
+      spe_clgti(f, clamp_mask, result, 0x0ff);
+      /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */
+      spe_selb(f, result, result, clamp, clamp_mask);
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      spe_il(f, clamp, 0);
+      spe_ai(f, result, in, -1);
+
+      /* If "(s-1) < 0" in signed arithemtic, then "(s-1) > MAX" in unsigned
+       * arithmetic.
+       */
+      spe_clgti(f, clamp_mask, result, 0x0ff);
+      spe_selb(f, result, result, clamp, clamp_mask);
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      spe_ai(f, result, in, 1);
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      spe_ai(f, result, in, -1);
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      spe_nor(f, result, in, in);
+      break;
+   default:
+      assert(0);
+   }
+
+   spe_selb(f, out, in, result, mask);
+
+   spe_release_register(f, result);
+   spe_release_register(f, clamp_mask);
+   spe_release_register(f, clamp);
+}
+
+
+/**
+ * Generate code to do stencil test.  Four pixels are tested at once.
+ * \param dsa        Depth / stencil test state
+ * \param face       0 for front face, 1 for back face
+ * \param f          Function to append instructions to
+ * \param mask       Register containing mask of fragments passing the
+ *                   alpha test
+ * \param depth_mask Register containing mask of fragments passing the
+ *                   depth test
+ * \param depth_compliment  Is \c depth_mask the compliment of the actual mask?
+ * \param stencil    Register containing values from stencil buffer
+ * \param depth_pass Register to store mask of fragments passing stencil test
+ *                   and depth test
+ *
+ * \note
+ * Emits a maximum of 10 + (3 * 5) = 25 instructions.
+ */
+static int
+emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
+                  unsigned face,
+                  struct spe_function *f,
+                  int mask,
+                  int depth_mask,
+                  boolean depth_complement,
+                  int stencil,
+                  int depth_pass)
+{
+   int stencil_fail = spe_allocate_available_register(f);
+   int depth_fail = spe_allocate_available_register(f);
+   int stencil_mask = spe_allocate_available_register(f);
+   int stencil_pass = spe_allocate_available_register(f);
+   int face_stencil = spe_allocate_available_register(f);
+   int stencil_src = stencil;
+   const unsigned ref = (dsa->stencil[face].ref_value
+                         & dsa->stencil[face].value_mask);
+   boolean complement = FALSE;
+   int stored;
+   int tmp = spe_allocate_available_register(f);
+
+
+   if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+       && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+       && (dsa->stencil[face].value_mask != 0x0ff)) {
+      stored = spe_allocate_available_register(f);
+      spe_andi(f, stored, stencil, dsa->stencil[face].value_mask);
+   } else {
+      stored = stencil;
+   }
+
+
+   switch (dsa->stencil[face].func) {
+   case PIPE_FUNC_NEVER:
+      spe_il(f, stencil_mask, 0);   /* stencil_mask[0..3] = [0,0,0,0] */
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_EQUAL:
+      /* stencil_mask[i] = (stored[i] == ref) */
+      spe_ceqi(f, stencil_mask, stored, ref);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GREATER:
+      complement = TRUE;
+      /* stencil_mask[i] = (stored[i] > ref) */
+      spe_clgti(f, stencil_mask, stored, ref);
+      break;
+
+   case PIPE_FUNC_LESS:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GEQUAL:
+      /* stencil_mask[i] = (stored[i] > ref) */
+      spe_clgti(f, stencil_mask, stored, ref);
+      /* tmp[i] = (stored[i] == ref) */
+      spe_ceqi(f, tmp, stored, ref);
+      /* stencil_mask[i] = stencil_mask[i] | tmp[i] */
+      spe_or(f, stencil_mask, stencil_mask, tmp);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* See comment below. */
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   if (stored != stencil) {
+      spe_release_register(f, stored);
+   }
+   spe_release_register(f, tmp);
+
+
+   /* ALWAYS is a very common stencil-test, so some effort is applied to
+    * optimize that case.  The stencil-pass mask is the same as the input
+    * fragment mask.  This makes the stencil-test (above) a no-op, and the
+    * input fragment mask can be "renamed" the stencil-pass mask.
+    */
+   if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) {
+      spe_release_register(f, stencil_pass);
+      stencil_pass = mask;
+   } else {
+      if (complement) {
+         spe_andc(f, stencil_pass, mask, stencil_mask);
+      } else {
+         spe_and(f, stencil_pass, mask, stencil_mask);
+      }
+   }
+
+   if (depth_complement) {
+      spe_andc(f, depth_pass, stencil_pass, depth_mask);
+   } else {
+      spe_and(f, depth_pass, stencil_pass, depth_mask);
+   }
+
+
+   /* Conditionally emit code to update the stencil value under various
+    * condititons.  Note that there is no need to generate code under the
+    * following circumstances:
+    *
+    * - Stencil write mask is zero.
+    * - For stencil-fail if the stencil test is ALWAYS
+    * - For depth-fail if the stencil test is NEVER
+    * - For depth-pass if the stencil test is NEVER
+    * - Any of the 3 conditions if the operation is KEEP
+    */
+   if (dsa->stencil[face].write_mask != 0) {
+      if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+          && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) {
+         if (complement) {
+            spe_and(f, stencil_fail, mask, stencil_mask);
+         } else {
+            spe_andc(f, stencil_fail, mask, stencil_mask);
+         }
+
+         emit_stencil_op(f, face_stencil, stencil_src, stencil_fail,
+                         dsa->stencil[face].fail_op,
+                         dsa->stencil[face].ref_value);
+
+         stencil_src = face_stencil;
+      }
+
+      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+          && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) {
+         if (depth_complement) {
+            spe_and(f, depth_fail, stencil_pass, depth_mask);
+         } else {
+            spe_andc(f, depth_fail, stencil_pass, depth_mask);
+         }
+
+         emit_stencil_op(f, face_stencil, stencil_src, depth_fail,
+                         dsa->stencil[face].zfail_op,
+                         dsa->stencil[face].ref_value);
+         stencil_src = face_stencil;
+      }
+
+      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+          && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) {
+         emit_stencil_op(f, face_stencil, stencil_src, depth_pass,
+                         dsa->stencil[face].zpass_op,
+                         dsa->stencil[face].ref_value);
+         stencil_src = face_stencil;
+      }
+   }
+
+   spe_release_register(f, stencil_fail);
+   spe_release_register(f, depth_fail);
+   spe_release_register(f, stencil_mask);
+   if (stencil_pass != mask) {
+      spe_release_register(f, stencil_pass);
+   }
+
+   /* If all of the stencil operations were KEEP or the stencil write mask was
+    * zero, "stencil_src" will still be set to "stencil".  In this case
+    * release the "face_stencil" register.  Otherwise apply the stencil write
+    * mask to select bits from the calculated stencil value and the previous
+    * stencil value.
+    */
+   if (stencil_src == stencil) {
+      spe_release_register(f, face_stencil);
+   } else if (dsa->stencil[face].write_mask != 0x0ff) {
+      int tmp = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, dsa->stencil[face].write_mask);
+      spe_selb(f, stencil_src, stencil, stencil_src, tmp);
+
+      spe_release_register(f, tmp);
+   }
+
+   return stencil_src;
+}
+
+
+void
+cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
+{
+   struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base;
+   struct spe_function *const f = &cdsa->code;
+
+   /* This code generates a maximum of 6 (alpha test) + 3 (depth test)
+    * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round
+    * up to 64 to make it a happy power-of-two.
+    */
+   spe_init_func(f, SPE_INST_SIZE * 64);
+
+
+   /* Allocate registers for the function's input parameters.  Cleverly (and
+    * clever code is usually dangerous, but I couldn't resist) the generated
+    * function returns a structure.  Returned structures start with register
+    * 3, and the structure fields are ordered to match up exactly with the
+    * input parameters.
+    */
+   int mask = spe_allocate_register(f, 3);
+   int depth = spe_allocate_register(f, 4);
+   int stencil = spe_allocate_register(f, 5);
+   int zvals = spe_allocate_register(f, 6);
+   int frag_a = spe_allocate_register(f, 7);
+   int facing = spe_allocate_register(f, 8);
+
+   int depth_mask = spe_allocate_available_register(f);
+
+   boolean depth_complement;
+
+
+   emit_alpha_test(dsa, f, mask, frag_a);
+
+   depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals);
+
+   if (dsa->stencil[0].enabled) {
+      const int front_depth_pass = spe_allocate_available_register(f);
+      int front_stencil = emit_stencil_test(dsa, 0, f, mask,
+                                            depth_mask, depth_complement,
+                                            stencil, front_depth_pass);
+
+      if (dsa->stencil[1].enabled) {
+         const int back_depth_pass = spe_allocate_available_register(f);
+         int back_stencil = emit_stencil_test(dsa, 1, f, mask,
+                                              depth_mask,  depth_complement,
+                                              stencil, back_depth_pass);
+
+         /* If the front facing stencil value and the back facing stencil
+          * value are stored in the same register, there is no need to select
+          * a value based on the facing.  This can happen if the stencil value
+          * was not modified due to the write masks being zero, the stencil
+          * operations being KEEP, etc.
+          */
+         if (front_stencil != back_stencil) {
+            spe_selb(f, stencil, back_stencil, front_stencil, facing);
+         }
+
+         if (back_stencil != stencil) {
+            spe_release_register(f, back_stencil);
+         }
+
+         if (front_stencil != stencil) {
+            spe_release_register(f, front_stencil);
+         }
+
+         spe_selb(f, mask, back_depth_pass, front_depth_pass, facing);
+
+         spe_release_register(f, back_depth_pass);
+      } else {
+         if (front_stencil != stencil) {
+            spe_or(f, stencil, front_stencil, front_stencil);
+            spe_release_register(f, front_stencil);
+         }
+         spe_or(f, mask, front_depth_pass, front_depth_pass);
+      }
+
+      spe_release_register(f, front_depth_pass);
+   } else if (dsa->depth.enabled) {
+      if (depth_complement) {
+         spe_andc(f, mask, mask, depth_mask);
+      } else {
+         spe_and(f, mask, mask, depth_mask);
+      }
+   }
+
+   if (dsa->depth.writemask) {
+         spe_selb(f, depth, depth, zvals, mask);
+   }
+
+   spe_bi(f, 0, 0, 0);  /* return from function call */
+
+
+#if 0
+   {
+      const uint32_t *p = f->store;
+      unsigned i;
+
+      printf("# alpha (%sabled)\n",
+             (dsa->alpha.enabled) ? "en" : "dis");
+      printf("#    func: %u\n", dsa->alpha.func);
+      printf("#    ref: %.2f\n", dsa->alpha.ref);
+
+      printf("# depth (%sabled)\n",
+             (dsa->depth.enabled) ? "en" : "dis");
+      printf("#    func: %u\n", dsa->depth.func);
+
+      for (i = 0; i < 2; i++) {
+         printf("# %s stencil (%sabled)\n",
+                (i == 0) ? "front" : "back",
+                (dsa->stencil[i].enabled) ? "en" : "dis");
+
+         printf("#    func: %u\n", dsa->stencil[i].func);
+         printf("#    op (sf, zf, zp): %u %u %u\n",
+                dsa->stencil[i].fail_op,
+                dsa->stencil[i].zfail_op,
+                dsa->stencil[i].zpass_op);
+         printf("#    ref value / value mask / write mask: %02x %02x %02x\n",
+                dsa->stencil[i].ref_value,
+                dsa->stencil[i].value_mask,
+                dsa->stencil[i].write_mask);
+      }
+
+      printf("\t.text\n");
+      for (/* empty */; p < f->csr; p++) {
+         printf("\t.long\t0x%04x\n", *p);
+      }
+      fflush(stdout);
+   }
+#endif
+}
+
+
+/**
+ * \note Emits a maximum of 3 instructions
+ */
+static int
+emit_alpha_factor_calculation(struct spe_function *f,
+                              unsigned factor,
+                              int src_alpha, int dst_alpha, int const_alpha)
+{
+   int factor_reg;
+   int tmp;
+
+
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      factor_reg = -1;
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_or(f, factor_reg, src_alpha, src_alpha);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      factor_reg = dst_alpha;
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      factor_reg = -1;
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      factor_reg = spe_allocate_available_register(f);
+
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor_reg, tmp, const_alpha);
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor_reg = const_alpha;
+      break;
+
+   case PIPE_BLENDFACTOR_ZERO:
+      factor_reg = -1;
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor_reg, tmp, src_alpha);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor_reg, tmp, dst_alpha);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      assert(0);
+      factor_reg = -1;
+      break;
+   }
+
+   return factor_reg;
+}
+
+
+/**
+ * \note Emits a maximum of 6 instructions
+ */
+static void
+emit_color_factor_calculation(struct spe_function *f,
+                              unsigned sF, unsigned mask,
+                              const int *src,
+                              const int *dst,
+                              const int *const_color,
+                              int *factor)
+{
+   int tmp;
+   unsigned i;
+
+
+   factor[0] = -1;
+   factor[1] = -1;
+   factor[2] = -1;
+   factor[3] = -1;
+
+   switch (sF) {
+   case PIPE_BLENDFACTOR_ONE:
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_or(f, factor[i], src[i], src[i]);
+         }
+      }
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_or(f, factor[0], src[3], src[3]);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      factor[0] = dst[3];
+      factor[1] = dst[3];
+      factor[2] = dst[3];
+      break;
+
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      factor[0] = dst[0];
+      factor[1] = dst[1];
+      factor[2] = dst[2];
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      /* Alpha saturate means min(As, 1-Ad).
+       */
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, tmp, tmp, dst[3]);
+      spe_fcgt(f, factor[0], tmp, src[3]);
+      spe_selb(f, factor[0], src[3], tmp, factor[0]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; i++) {
+         factor[i] = spe_allocate_available_register(f);
+
+         spe_fs(f, factor[i], tmp, const_color[i]);
+      }
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      for (i = 0; i < 3; i++) {
+         factor[i] = const_color[i];
+      }
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, const_color[3]);
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor[0] = const_color[3];
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+      break;
+
+   case PIPE_BLENDFACTOR_ZERO:
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      tmp = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_fs(f, factor[i], tmp, src[i]);
+         }
+      }
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, src[3]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, dst[3]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      tmp = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_fs(f, factor[i], tmp, dst[i]);
+         }
+      }
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      assert(0);
+   }
+}
+
+
+static void
+emit_blend_calculation(struct spe_function *f,
+                       unsigned func, unsigned sF, unsigned dF,
+                       int src, int src_factor, int dst, int dst_factor)
+{
+   int tmp = spe_allocate_available_register(f);
+
+   switch (func) {
+   case PIPE_BLEND_ADD:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            /* Do nothing. */
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fa(f, src, src, dst);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_or(f, src, dst, dst);
+         } else {
+            spe_fm(f, src, dst, dst_factor);
+         }
+      } else if (dF == PIPE_BLENDFACTOR_ZERO) {
+         spe_fm(f, src, src, src_factor);
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fma(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_SUBTRACT:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            /* Do nothing. */
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fs(f, src, src, dst);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, dst);
+         } else {
+            spe_fm(f, src, dst, dst_factor);
+         }
+      } else if (dF == PIPE_BLENDFACTOR_ZERO) {
+         spe_fm(f, src, src, src_factor);
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fms(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, src);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fs(f, src, dst, src);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_or(f, src, dst, dst);
+         } else {
+            spe_fm(f, src, dst, dst_factor);
+         }
+      } else if (dF == PIPE_BLENDFACTOR_ZERO) {
+         spe_fm(f, src, src, src_factor);
+      } else {
+         spe_fm(f, tmp, src, src_factor);
+         spe_fms(f, src, src, dst_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_MIN:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, src, dst, tmp);
+      break;
+
+   case PIPE_BLEND_MAX:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, dst, src, tmp);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   spe_release_register(f, tmp);
+}
+
+
+/**
+ * Generate code to perform alpha blending on the SPE
+ */
+void
+cell_generate_alpha_blend(struct cell_blend_state *cb)
+{
+   struct pipe_blend_state *const b = &cb->base;
+   struct spe_function *const f = &cb->code;
+
+   /* This code generates a maximum of 3 (source alpha factor)
+    * + 3 (destination alpha factor) + (3 * 6) (source color factor)
+    * + (3 * 6) (destination color factor) + (4 * 2) (blend equation)
+    * + 4 (fragment mask) + 1 (return) = 55 instlructions.  Round up to 64 to
+    * make it a happy power-of-two.
+    */
+   spe_init_func(f, SPE_INST_SIZE * 64);
+
+
+   const int frag[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+   const int pixel[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+   const int const_color[4] = {
+      spe_allocate_register(f, 11),
+      spe_allocate_register(f, 12),
+      spe_allocate_register(f, 13),
+      spe_allocate_register(f, 14),
+   };
+   unsigned func[4];
+   unsigned sF[4];
+   unsigned dF[4];
+   unsigned i;
+   int src_factor[4];
+   int dst_factor[4];
+
+
+   /* Does the selected blend mode make use of the source / destination
+    * color (RGB) blend factors?
+    */
+   boolean need_color_factor = b->blend_enable
+       && (b->rgb_func != PIPE_BLEND_MIN)
+       && (b->rgb_func != PIPE_BLEND_MAX);
+
+   /* Does the selected blend mode make use of the source / destination
+    * alpha blend factors?
+    */
+   boolean need_alpha_factor = b->blend_enable
+       && (b->alpha_func != PIPE_BLEND_MIN)
+       && (b->alpha_func != PIPE_BLEND_MAX);
+
+
+   if (b->blend_enable) {
+      sF[0] = b->rgb_src_factor;
+      sF[1] = sF[0];
+      sF[2] = sF[0];
+      switch (b->alpha_src_factor & 0x0f) {
+      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+         sF[3] = PIPE_BLENDFACTOR_ONE;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+      case PIPE_BLENDFACTOR_DST_COLOR:
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+      case PIPE_BLENDFACTOR_SRC1_COLOR:
+         sF[3] = b->alpha_src_factor + 1;
+         break;
+      default:
+         sF[3] = b->alpha_src_factor;
+      }
+
+      dF[0] = b->rgb_dst_factor;
+      dF[1] = dF[0];
+      dF[2] = dF[0];
+      switch (b->alpha_dst_factor & 0x0f) {
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+      case PIPE_BLENDFACTOR_DST_COLOR:
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+      case PIPE_BLENDFACTOR_SRC1_COLOR:
+         dF[3] = b->alpha_dst_factor + 1;
+         break;
+      default:
+         dF[3] = b->alpha_dst_factor;
+      }
+
+      func[0] = b->rgb_func;
+      func[1] = func[0];
+      func[2] = func[0];
+      func[3] = b->alpha_func;
+   } else {
+      sF[0] = PIPE_BLENDFACTOR_ONE;
+      sF[1] = PIPE_BLENDFACTOR_ONE;
+      sF[2] = PIPE_BLENDFACTOR_ONE;
+      sF[3] = PIPE_BLENDFACTOR_ONE;
+      dF[0] = PIPE_BLENDFACTOR_ZERO;
+      dF[1] = PIPE_BLENDFACTOR_ZERO;
+      dF[2] = PIPE_BLENDFACTOR_ZERO;
+      dF[3] = PIPE_BLENDFACTOR_ZERO;
+
+      func[0] = PIPE_BLEND_ADD;
+      func[1] = PIPE_BLEND_ADD;
+      func[2] = PIPE_BLEND_ADD;
+      func[3] = PIPE_BLEND_ADD;
+   }
+
+
+   /* If alpha writing is enabled and the alpha blend mode requires use of
+    * the alpha factor, calculate the alpha factor.
+    */
+   if (((b->colormask & 8) != 0) && need_alpha_factor) {
+      src_factor[3] = emit_alpha_factor_calculation(f, sF[3], const_color[3],
+                                                    frag[3], pixel[3]);
+
+      /* If the alpha destination blend factor is the same as the alpha source
+       * blend factor, re-use the previously calculated value.
+       */
+      dst_factor[3] = (dF[3] == sF[3])
+          ? src_factor[3]
+          : emit_alpha_factor_calculation(f, dF[3], const_color[3],
+                                          frag[3], pixel[3]);
+   }
+
+
+   if (sF[0] == sF[3]) {
+      src_factor[0] = src_factor[3];
+      src_factor[1] = src_factor[3];
+      src_factor[2] = src_factor[3];
+   } else if (sF[0] == dF[3]) {
+      src_factor[0] = dst_factor[3];
+      src_factor[1] = dst_factor[3];
+      src_factor[2] = dst_factor[3];
+   } else if (need_color_factor) {
+      emit_color_factor_calculation(f,
+                                    b->rgb_src_factor,
+                                    b->colormask,
+                                    frag, pixel, const_color, src_factor);
+   }
+
+
+   if (dF[0] == sF[3]) {
+      dst_factor[0] = src_factor[3];
+      dst_factor[1] = src_factor[3];
+      dst_factor[2] = src_factor[3];
+   } else if (dF[0] == dF[3]) {
+      dst_factor[0] = dst_factor[3];
+      dst_factor[1] = dst_factor[3];
+      dst_factor[2] = dst_factor[3];
+   } else if (dF[0] == sF[0]) {
+      dst_factor[0] = src_factor[0];
+      dst_factor[1] = src_factor[1];
+      dst_factor[2] = src_factor[2];
+   } else if (need_color_factor) {
+      emit_color_factor_calculation(f,
+                                    b->rgb_dst_factor,
+                                    b->colormask,
+                                    frag, pixel, const_color, dst_factor);
+   }
+
+
+
+   for (i = 0; i < 4; ++i) {
+      if ((b->colormask & (1U << i)) != 0) {
+         emit_blend_calculation(f,
+                                func[i], sF[i], dF[i],
+                                frag[i], src_factor[i],
+                                pixel[i], dst_factor[i]);
+      }
+   }
+
+   spe_bi(f, 0, 0, 0);
+
+#if 0
+   {
+      const uint32_t *p = f->store;
+
+      printf("# %u instructions\n", f->csr - f->store);
+      printf("# blend (%sabled)\n",
+             (cb->base.blend_enable) ? "en" : "dis");
+      printf("#    RGB func / sf / df: %u %u %u\n",
+             cb->base.rgb_func,
+             cb->base.rgb_src_factor,
+             cb->base.rgb_dst_factor);
+      printf("#    ALP func / sf / df: %u %u %u\n",
+             cb->base.alpha_func,
+             cb->base.alpha_src_factor,
+             cb->base.alpha_dst_factor);
+
+      printf("\t.text\n");
+      for (/* empty */; p < f->csr; p++) {
+         printf("\t.long\t0x%04x\n", *p);
+      }
+      fflush(stdout);
+   }
+#endif
+}
+
+
+static int
+PC_OFFSET(const struct spe_function *f, const void *d)
+{
+   const intptr_t pc = (intptr_t) &f->store[f->num_inst];
+   const intptr_t ea = ~0x0f & (intptr_t) d;
+
+   return (ea - pc) >> 2;
+}
+
+
+/**
+ * Generate code to perform color conversion and logic op
+ *
+ * \bug
+ * The code generated by this function should also perform dithering.
+ *
+ * \bug
+ * The code generated by this function should also perform color-write
+ * masking.
+ *
+ * \bug
+ * Only two framebuffer formats are supported at this time.
+ */
+void
+cell_generate_logic_op(struct spe_function *f,
+                       const struct pipe_blend_state *blend,
+                       struct pipe_surface *surf)
+{
+   const unsigned logic_op = (blend->logicop_enable)
+       ? blend->logicop_func : PIPE_LOGICOP_COPY;
+
+   /* This code generates a maximum of 37 instructions.  An additional 32
+    * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
+    * to 64 to make it a happy power-of-two.
+    */
+   spe_init_func(f, SPE_INST_SIZE * 64);
+
+
+   /* Pixel colors in framebuffer format in AoS layout.
+    */
+   const int pixel[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+
+   /* Fragment colors stored as floats in SoA layout.
+    */
+   const int frag[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+
+   const int mask = spe_allocate_register(f, 11);
+
+
+   /* Short-circuit the noop and invert cases.
+    */
+   if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->colormask == 0)) {
+      spe_bi(f, 0, 0, 0);
+      return;
+   } else if (logic_op == PIPE_LOGICOP_INVERT) {
+      spe_nor(f, pixel[0], pixel[0], pixel[0]);
+      spe_nor(f, pixel[1], pixel[1], pixel[1]);
+      spe_nor(f, pixel[2], pixel[2], pixel[2]);
+      spe_nor(f, pixel[3], pixel[3], pixel[3]);
+      spe_bi(f, 0, 0, 0);
+      return;
+   }
+
+
+   const int tmp[4] = {
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+   };
+
+   const int shuf_xpose_hi = spe_allocate_available_register(f);
+   const int shuf_xpose_lo = spe_allocate_available_register(f);
+   const int shuf_color = spe_allocate_available_register(f);
+
+
+   /* Pointer to the begining of the function's private data area.
+    */
+   uint32_t *const data = ((uint32_t *) f->store) + (64 - 8);
+
+
+   /* Convert fragment colors to framebuffer format in AoS layout.
+    */
+   switch (surf->format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      data[0] = 0x00010203;
+      data[1] = 0x10111213;
+      data[2] = 0x04050607;
+      data[3] = 0x14151617;
+      data[4] = 0x0c000408;
+      data[5] = 0x80808080;
+      data[6] = 0x80808080;
+      data[7] = 0x80808080;
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      data[0] = 0x03020100;
+      data[1] = 0x13121110;
+      data[2] = 0x07060504;
+      data[3] = 0x17161514;
+      data[4] = 0x0804000c;
+      data[5] = 0x80808080;
+      data[6] = 0x80808080;
+      data[7] = 0x80808080;
+      break;
+   default:
+      fprintf(stderr, "CELL: Bad pixel format in cell_generate_logic_op()");
+      ASSERT(0);
+   }
+
+   spe_ilh(f, tmp[0], 0x0808);
+   spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0));
+   spe_lqr(f, shuf_color, PC_OFFSET(f, data+4));
+   spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]);
+
+   spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi);
+   spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo);
+   spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi);
+   spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo);
+
+   spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi);
+   spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo);
+   spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi);
+   spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo);
+
+   spe_cfltu(f, frag[0], frag[0], 32);
+   spe_cfltu(f, frag[1], frag[1], 32);
+   spe_cfltu(f, frag[2], frag[2], 32);
+   spe_cfltu(f, frag[3], frag[3], 32);
+
+   spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color);
+   spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color);
+   spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color);
+   spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color);
+
+
+   /* If logic op is enabled, perform the requested logical operation on the
+    * converted fragment colors and the pixel colors.
+    */
+   switch (logic_op) {
+   case PIPE_LOGICOP_CLEAR:
+      spe_il(f, frag[0], 0);
+      spe_il(f, frag[1], 0);
+      spe_il(f, frag[2], 0);
+      spe_il(f, frag[3], 0);
+      break;
+   case PIPE_LOGICOP_NOR:
+      spe_nor(f, frag[0], frag[0], pixel[0]);
+      spe_nor(f, frag[1], frag[1], pixel[1]);
+      spe_nor(f, frag[2], frag[2], pixel[2]);
+      spe_nor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      spe_andc(f, frag[0], pixel[0], frag[0]);
+      spe_andc(f, frag[1], pixel[1], frag[1]);
+      spe_andc(f, frag[2], pixel[2], frag[2]);
+      spe_andc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      spe_nor(f, frag[0], frag[0], frag[0]);
+      spe_nor(f, frag[1], frag[1], frag[1]);
+      spe_nor(f, frag[2], frag[2], frag[2]);
+      spe_nor(f, frag[3], frag[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      spe_andc(f, frag[0], frag[0], pixel[0]);
+      spe_andc(f, frag[1], frag[1], pixel[1]);
+      spe_andc(f, frag[2], frag[2], pixel[2]);
+      spe_andc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_XOR:
+      spe_xor(f, frag[0], frag[0], pixel[0]);
+      spe_xor(f, frag[1], frag[1], pixel[1]);
+      spe_xor(f, frag[2], frag[2], pixel[2]);
+      spe_xor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_NAND:
+      spe_nand(f, frag[0], frag[0], pixel[0]);
+      spe_nand(f, frag[1], frag[1], pixel[1]);
+      spe_nand(f, frag[2], frag[2], pixel[2]);
+      spe_nand(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND:
+      spe_and(f, frag[0], frag[0], pixel[0]);
+      spe_and(f, frag[1], frag[1], pixel[1]);
+      spe_and(f, frag[2], frag[2], pixel[2]);
+      spe_and(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      spe_eqv(f, frag[0], frag[0], pixel[0]);
+      spe_eqv(f, frag[1], frag[1], pixel[1]);
+      spe_eqv(f, frag[2], frag[2], pixel[2]);
+      spe_eqv(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      spe_orc(f, frag[0], pixel[0], frag[0]);
+      spe_orc(f, frag[1], pixel[1], frag[1]);
+      spe_orc(f, frag[2], pixel[2], frag[2]);
+      spe_orc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY:
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      spe_orc(f, frag[0], frag[0], pixel[0]);
+      spe_orc(f, frag[1], frag[1], pixel[1]);
+      spe_orc(f, frag[2], frag[2], pixel[2]);
+      spe_orc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR:
+      spe_or(f, frag[0], frag[0], pixel[0]);
+      spe_or(f, frag[1], frag[1], pixel[1]);
+      spe_or(f, frag[2], frag[2], pixel[2]);
+      spe_or(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_SET:
+      spe_il(f, frag[0], ~0);
+      spe_il(f, frag[1], ~0);
+      spe_il(f, frag[2], ~0);
+      spe_il(f, frag[3], ~0);
+      break;
+
+   /* These two cases are short-circuited above.
+    */
+   case PIPE_LOGICOP_INVERT:
+   case PIPE_LOGICOP_NOOP:
+   default:
+      assert(0);
+   }
+
+
+   /* Apply fragment mask.
+    */
+   spe_ilh(f, tmp[0], 0x0000);
+   spe_ilh(f, tmp[1], 0x0404);
+   spe_ilh(f, tmp[2], 0x0808);
+   spe_ilh(f, tmp[3], 0x0c0c);
+
+   spe_shufb(f, tmp[0], mask, mask, tmp[0]);
+   spe_shufb(f, tmp[1], mask, mask, tmp[1]);
+   spe_shufb(f, tmp[2], mask, mask, tmp[2]);
+   spe_shufb(f, tmp[3], mask, mask, tmp[3]);
+
+   spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]);
+   spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]);
+   spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]);
+   spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]);
+
+   spe_bi(f, 0, 0, 0);
+
+#if 0
+   {
+      const uint32_t *p = f->store;
+      unsigned i;
+
+      printf("# %u instructions\n", f->csr - f->store);
+
+      printf("\t.text\n");
+      for (i = 0; i < 64; i++) {
+         printf("\t.long\t0x%04x\n", p[i]);
+      }
+      fflush(stdout);
+   }
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
new file mode 100644
index 0000000000..a8267a5133
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
@@ -0,0 +1,39 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CELL_STATE_PER_FRAGMENT_H
+#define CELL_STATE_PER_FRAGMENT_H
+
+extern void
+cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa);
+
+extern void
+cell_generate_alpha_blend(struct cell_blend_state *cb);
+
+extern void
+cell_generate_logic_op(struct spe_function *f,
+                       const struct pipe_blend_state *blend,
+                       struct pipe_surface *surf);
+
+#endif /* CELL_STATE_PER_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
new file mode 100644
index 0000000000..54a17eaf2b
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -0,0 +1,219 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_winsys.h"
+#include "draw/draw_context.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "cell_context.h"
+#include "cell_state.h"
+#include "cell_gen_fp.h"
+
+
+/** cast wrapper */
+static INLINE struct cell_fragment_shader_state *
+cell_fragment_shader_state(void *shader)
+{
+   return (struct cell_fragment_shader_state *) shader;
+}
+
+
+/** cast wrapper */
+static INLINE struct cell_vertex_shader_state *
+cell_vertex_shader_state(void *shader)
+{
+   return (struct cell_vertex_shader_state *) shader;
+}
+
+
+/**
+ * Create fragment shader state.
+ * Called via pipe->create_fs_state()
+ */
+static void *
+cell_create_fs_state(struct pipe_context *pipe,
+                     const struct pipe_shader_state *templ)
+{
+   struct cell_context *cell = cell_context(pipe);
+   struct cell_fragment_shader_state *cfs;
+
+   cfs = CALLOC_STRUCT(cell_fragment_shader_state);
+   if (!cfs)
+      return NULL;
+
+   cfs->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!cfs->shader.tokens) {
+      FREE(cfs);
+      return NULL;
+   }
+
+   tgsi_scan_shader(templ->tokens, &cfs->info);
+
+   cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code);
+
+   return cfs;
+}
+
+
+/**
+ * Called via pipe->bind_fs_state()
+ */
+static void
+cell_bind_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   cell->fs = cell_fragment_shader_state(fs);
+
+   cell->dirty |= CELL_NEW_FS;
+}
+
+
+/**
+ * Called via pipe->delete_fs_state()
+ */
+static void
+cell_delete_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs);
+
+   spe_release_func(&cfs->code);
+
+   FREE((void *) cfs->shader.tokens);
+   FREE(cfs);
+}
+
+
+/**
+ * Create vertex shader state.
+ * Called via pipe->create_vs_state()
+ */
+static void *
+cell_create_vs_state(struct pipe_context *pipe,
+                     const struct pipe_shader_state *templ)
+{
+   struct cell_context *cell = cell_context(pipe);
+   struct cell_vertex_shader_state *cvs;
+
+   cvs = CALLOC_STRUCT(cell_vertex_shader_state);
+   if (!cvs)
+      return NULL;
+
+   cvs->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!cvs->shader.tokens) {
+      FREE(cvs);
+      return NULL;
+   }
+
+   tgsi_scan_shader(templ->tokens, &cvs->info);
+
+   cvs->draw_data = draw_create_vertex_shader(cell->draw, &cvs->shader);
+   if (cvs->draw_data == NULL) {
+      FREE( (void *) cvs->shader.tokens );
+      FREE( cvs );
+      return NULL;
+   }
+
+   return cvs;
+}
+
+
+/**
+ * Called via pipe->bind_vs_state()
+ */
+static void
+cell_bind_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   cell->vs = cell_vertex_shader_state(vs);
+
+   draw_bind_vertex_shader(cell->draw,
+                           (cell->vs ? cell->vs->draw_data : NULL));
+
+   cell->dirty |= CELL_NEW_VS;
+}
+
+
+/**
+ * Called via pipe->delete_vs_state()
+ */
+static void
+cell_delete_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct cell_context *cell = cell_context(pipe);
+   struct cell_vertex_shader_state *cvs = cell_vertex_shader_state(vs);
+
+   draw_delete_vertex_shader(cell->draw, cvs->draw_data);
+   FREE( (void *) cvs->shader.tokens );
+   FREE( cvs );
+}
+
+
+/**
+ * Called via pipe->set_constant_buffer()
+ */
+static void
+cell_set_constant_buffer(struct pipe_context *pipe,
+                         uint shader, uint index,
+                         const struct pipe_constant_buffer *buf)
+{
+   struct cell_context *cell = cell_context(pipe);
+   struct pipe_winsys *ws = pipe->winsys;
+
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+
+   /* note: reference counting */
+   winsys_buffer_reference(ws,
+                        &cell->constants[shader].buffer,
+                        buf->buffer);
+   cell->constants[shader].size = buf->size;
+
+   if (shader == PIPE_SHADER_VERTEX)
+      cell->dirty |= CELL_NEW_VS_CONSTANTS;
+   else if (shader == PIPE_SHADER_FRAGMENT)
+      cell->dirty |= CELL_NEW_FS_CONSTANTS;
+}
+
+
+void
+cell_init_shader_functions(struct cell_context *cell)
+{
+   cell->pipe.create_fs_state = cell_create_fs_state;
+   cell->pipe.bind_fs_state   = cell_bind_fs_state;
+   cell->pipe.delete_fs_state = cell_delete_fs_state;
+
+   cell->pipe.create_vs_state = cell_create_vs_state;
+   cell->pipe.bind_vs_state   = cell_bind_vs_state;
+   cell->pipe.delete_vs_state = cell_delete_vs_state;
+
+   cell->pipe.set_constant_buffer = cell_set_constant_buffer;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
new file mode 100644
index 0000000000..fbe55c8472
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "cell_context.h"
+#include "cell_state.h"
+
+#include "draw/draw_context.h"
+
+
+static void
+cell_set_vertex_elements(struct pipe_context *pipe,
+                         unsigned count,
+                         const struct pipe_vertex_element *elements)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   assert(count <= PIPE_MAX_ATTRIBS);
+
+   memcpy(cell->vertex_element, elements, count * sizeof(elements[0]));
+   cell->num_vertex_elements = count;
+
+   cell->dirty |= CELL_NEW_VERTEX;
+
+   draw_set_vertex_elements(cell->draw, count, elements);
+}
+
+
+static void
+cell_set_vertex_buffers(struct pipe_context *pipe,
+                        unsigned count,
+                        const struct pipe_vertex_buffer *buffers)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   assert(count <= PIPE_MAX_ATTRIBS);
+
+   memcpy(cell->vertex_buffer, buffers, count * sizeof(buffers[0]));
+   cell->num_vertex_buffers = count;
+
+   cell->dirty |= CELL_NEW_VERTEX;
+
+   draw_set_vertex_buffers(cell->draw, count, buffers);
+}
+
+
+void
+cell_init_vertex_functions(struct cell_context *cell)
+{
+   cell->pipe.set_vertex_buffers = cell_set_vertex_buffers;
+   cell->pipe.set_vertex_elements = cell_set_vertex_elements;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
new file mode 100644
index 0000000000..732c64082e
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_rect.h"
+#include "cell_context.h"
+
+
+void
+cell_init_surface_functions(struct cell_context *cell)
+{
+   cell->pipe.surface_copy = util_surface_copy;
+   cell->pipe.surface_fill = util_surface_fill;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.h b/src/gallium/drivers/cell/ppu/cell_surface.h
new file mode 100644
index 0000000000..9e58f32944
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_surface.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef CELL_SURFACE_H
+#define CELL_SURFACE_H
+
+
+struct cell_context;
+
+
+extern void
+cell_init_surface_functions(struct cell_context *cell);
+
+
+#endif /* SP_SURFACE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
new file mode 100644
index 0000000000..b6590dfb86
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -0,0 +1,387 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Michel Dänzer <michel@tungstengraphics.com>
+  */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_winsys.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "cell_context.h"
+#include "cell_state.h"
+#include "cell_texture.h"
+
+
+/* Simple, maximally packed layout.
+ */
+
+static unsigned minify( unsigned d )
+{
+   return MAX2(1, d>>1);
+}
+
+
+static void
+cell_texture_layout(struct cell_texture * spt)
+{
+   struct pipe_texture *pt = &spt->base;
+   unsigned level;
+   unsigned width = pt->width[0];
+   unsigned height = pt->height[0];
+   unsigned depth = pt->depth[0];
+
+   spt->buffer_size = 0;
+
+   for ( level = 0 ; level <= pt->last_level ; level++ ) {
+      unsigned size;
+      unsigned w_tile, h_tile;
+
+      /* width, height, rounded up to tile size */
+      w_tile = align(width, TILE_SIZE);
+      h_tile = align(height, TILE_SIZE);
+
+      pt->width[level] = width;
+      pt->height[level] = height;
+      pt->depth[level] = depth;
+      pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);  
+      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);  
+
+      spt->stride[level] = pt->nblocksx[level] * pt->block.size;
+
+      spt->level_offset[level] = spt->buffer_size;
+
+      size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
+      if (pt->target == PIPE_TEXTURE_CUBE)
+         size *= 6;
+      else
+         size *= depth;
+
+      spt->buffer_size += size;
+
+      width  = minify(width);
+      height = minify(height);
+      depth = minify(depth);
+   }
+}
+
+
+static struct pipe_texture *
+cell_texture_create(struct pipe_screen *screen,
+                    const struct pipe_texture *templat)
+{
+   struct pipe_winsys *ws = screen->winsys;
+   struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
+   if (!spt)
+      return NULL;
+
+   spt->base = *templat;
+   spt->base.refcount = 1;
+   spt->base.screen = screen;
+
+   cell_texture_layout(spt);
+
+   spt->buffer = ws->buffer_create(ws, 32,
+                                   PIPE_BUFFER_USAGE_PIXEL,
+                                   spt->buffer_size);
+
+   if (!spt->buffer) {
+      FREE(spt);
+      return NULL;
+   }
+
+   return &spt->base;
+}
+
+
+static void
+cell_texture_release(struct pipe_screen *screen,
+                     struct pipe_texture **pt)
+{
+   if (!*pt)
+      return;
+
+   /*
+   DBG("%s %p refcount will be %d\n",
+       __FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
+   */
+   if (--(*pt)->refcount <= 0) {
+      struct cell_texture *spt = cell_texture(*pt);
+
+      /*
+      DBG("%s deleting %p\n", __FUNCTION__, (void *) spt);
+      */
+
+      pipe_buffer_reference(screen, &spt->buffer, NULL);
+
+      FREE(spt);
+   }
+   *pt = NULL;
+}
+
+
+#if 0
+static void
+cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
+                    uint face, uint levelsMask)
+{
+   /* XXX TO DO:  re-tile the texture data ... */
+
+}
+#endif
+
+
+static struct pipe_surface *
+cell_get_tex_surface(struct pipe_screen *screen,
+                     struct pipe_texture *pt,
+                     unsigned face, unsigned level, unsigned zslice,
+                     unsigned usage)
+{
+   struct pipe_winsys *ws = screen->winsys;
+   struct cell_texture *spt = cell_texture(pt);
+   struct pipe_surface *ps;
+
+   ps = ws->surface_alloc(ws);
+   if (ps) {
+      assert(ps->refcount);
+      assert(ps->winsys);
+      winsys_buffer_reference(ws, &ps->buffer, spt->buffer);
+      ps->format = pt->format;
+      ps->block = pt->block;
+      ps->width = pt->width[level];
+      ps->height = pt->height[level];
+      ps->nblocksx = pt->nblocksx[level];
+      ps->nblocksy = pt->nblocksy[level];
+      ps->stride = spt->stride[level];
+      ps->offset = spt->level_offset[level];
+      ps->usage = usage;
+
+      /* XXX may need to override usage flags (see sp_texture.c) */
+
+      pipe_texture_reference(&ps->texture, pt); 
+      ps->face = face;
+      ps->level = level;
+      ps->zslice = zslice;
+
+      if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
+	 ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
+		       ps->nblocksy *
+		       ps->stride;
+      }
+      else {
+	 assert(face == 0);
+	 assert(zslice == 0);
+      }
+   }
+   return ps;
+}
+
+
+
+/**
+ * Copy tile data from linear layout to tiled layout.
+ * XXX this should be rolled into the future surface-creation code.
+ * XXX also need "untile" code...
+ */
+static void
+tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = h / tile_size, w_t = w / tile_size;
+
+   uint it, jt;  /* tile counters */
+   uint i, j;    /* intra-tile counters */
+
+   /* loop over dest tiles */
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* start of dest tile: */
+         uint *tdst = dst + (it * w_t + jt) * tile_size2;
+         /* loop over texels in the tile */
+         for (i = 0; i < tile_size; i++) {
+            for (j = 0; j < tile_size; j++) {
+               const uint srci = it * tile_size + i;
+               const uint srcj = jt * tile_size + j;
+               *tdst++ = src[srci * w + srcj];
+            }
+         }
+      }
+   }
+}
+
+
+
+/**
+ * Convert linear texture image data to tiled format for SPU usage.
+ * XXX recast this in terms of pipe_surfaces (aka texture views).
+ */
+static void
+cell_tile_texture(struct cell_context *cell,
+                  struct cell_texture *texture)
+{
+   struct pipe_screen *screen = cell->pipe.screen;
+   uint face = 0, level = 0, zslice = 0;
+   struct pipe_surface *surf;
+   const uint w = texture->base.width[0], h = texture->base.height[0];
+   const uint *src;
+
+   /* temporary restrictions: */
+   assert(w >= TILE_SIZE);
+   assert(h >= TILE_SIZE);
+   assert(w % TILE_SIZE == 0);
+   assert(h % TILE_SIZE == 0);
+
+   surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
+                                  PIPE_BUFFER_USAGE_CPU_WRITE);
+   ASSERT(surf);
+
+   src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
+
+   if (texture->tiled_data) {
+      align_free(texture->tiled_data);
+   }
+   texture->tiled_data = align_malloc(w * h * 4, 16);
+
+   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+
+   pipe_surface_unmap(surf);
+
+   pipe_surface_reference(&surf, NULL);
+}
+
+
+void
+cell_update_texture_mapping(struct cell_context *cell)
+{
+#if 0
+   uint face = 0, level = 0, zslice = 0;
+#endif
+   uint i;
+
+   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+      if (cell->texture[i])
+         cell_tile_texture(cell, cell->texture[i]);
+   }
+
+#if 0
+   if (cell->tex_surf && cell->tex_map) {
+      pipe_surface_unmap(cell->tex_surf);
+      cell->tex_map = NULL;
+   }
+
+   /* XXX free old surface */
+
+   cell->tex_surf = cell_get_tex_surface(&cell->pipe,
+                                         &cell->texture[0]->base,
+                                         face, level, zslice);
+
+   cell->tex_map = pipe_surface_map(cell->tex_surf);
+#endif
+}
+
+
+static void 
+cell_tex_surface_release(struct pipe_screen *screen, 
+                         struct pipe_surface **s)
+{
+   /* Effectively do the texture_update work here - if texture images
+    * needed post-processing to put them into hardware layout, this is
+    * where it would happen.  For softpipe, nothing to do.
+    */
+   assert ((*s)->texture);
+   pipe_texture_reference(&(*s)->texture, NULL); 
+
+   screen->winsys->surface_release(screen->winsys, s);
+}
+
+
+static void *
+cell_surface_map( struct pipe_screen *screen,
+                  struct pipe_surface *surface,
+                  unsigned flags )
+{
+   ubyte *map;
+
+   if (flags & ~surface->usage) {
+      assert(0);
+      return NULL;
+   }
+
+   map = pipe_buffer_map( screen, surface->buffer, flags );
+   if (map == NULL)
+      return NULL;
+
+   /* May want to different things here depending on read/write nature
+    * of the map:
+    */
+   if (surface->texture &&
+       (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) 
+   {
+      /* Do something to notify sharing contexts of a texture change.
+       * In softpipe, that would mean flushing the texture cache.
+       */
+#if 0
+      cell_screen(screen)->timestamp++;
+#endif
+   }
+   
+   return map + surface->offset;
+}
+
+
+static void
+cell_surface_unmap(struct pipe_screen *screen,
+                   struct pipe_surface *surface)
+{
+   pipe_buffer_unmap( screen, surface->buffer );
+}
+
+
+void
+cell_init_texture_functions(struct cell_context *cell)
+{
+   /*cell->pipe.texture_update = cell_texture_update;*/
+}
+
+
+void
+cell_init_screen_texture_funcs(struct pipe_screen *screen)
+{
+   screen->texture_create = cell_texture_create;
+   screen->texture_release = cell_texture_release;
+
+   screen->get_tex_surface = cell_get_tex_surface;
+   screen->tex_surface_release = cell_tex_surface_release;
+
+   screen->surface_map = cell_surface_map;
+   screen->surface_unmap = cell_surface_unmap;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
new file mode 100644
index 0000000000..6d37e95ebc
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -0,0 +1,76 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_TEXTURE_H
+#define CELL_TEXTURE_H
+
+
+struct cell_context;
+struct pipe_texture;
+
+
+/**
+ * Subclass of pipe_texture
+ */
+struct cell_texture
+{
+   struct pipe_texture base;
+
+   unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
+   unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
+
+   /* The data is held here:
+    */
+   struct pipe_buffer *buffer;
+   unsigned long buffer_size;
+
+   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
+};
+
+
+/** cast wrapper */
+static INLINE struct cell_texture *
+cell_texture(struct pipe_texture *pt)
+{
+   return (struct cell_texture *) pt;
+}
+
+
+
+extern void
+cell_update_texture_mapping(struct cell_context *cell);
+
+
+extern void
+cell_init_texture_functions(struct cell_context *cell);
+
+
+extern void
+cell_init_screen_texture_funcs(struct pipe_screen *screen);
+
+
+#endif /* CELL_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
new file mode 100644
index 0000000000..578ddf62dc
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -0,0 +1,302 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Vertex buffer code.  The draw module transforms vertices to window
+ * coords, etc. and emits the vertices into buffer supplied by this module.
+ * When a vertex buffer is full, or we flush, we'll send the vertex data
+ * to the SPUs.
+ *
+ * Authors
+ *  Brian Paul
+ */
+
+
+#include "cell_batch.h"
+#include "cell_context.h"
+#include "cell_flush.h"
+#include "cell_spu.h"
+#include "cell_vbuf.h"
+#include "draw/draw_vbuf.h"
+#include "util/u_memory.h"
+
+
+/** Allow vertex data to be inlined after RENDER command */
+#define ALLOW_INLINE_VERTS 1
+
+
+/**
+ * Subclass of vbuf_render because we need a cell_context pointer in
+ * a few places.
+ */
+struct cell_vbuf_render
+{
+   struct vbuf_render base;
+   struct cell_context *cell;
+   uint prim;            /**< PIPE_PRIM_x */
+   uint vertex_size;     /**< in bytes */
+   void *vertex_buffer;  /**< just for debug, really */
+   uint vertex_buf;      /**< in [0, CELL_NUM_BUFFERS-1] */
+};
+
+
+/** cast wrapper */
+static struct cell_vbuf_render *
+cell_vbuf_render(struct vbuf_render *vbr)
+{
+   return (struct cell_vbuf_render *) vbr;
+}
+
+
+
+static const struct vertex_info *
+cell_vbuf_get_vertex_info(struct vbuf_render *vbr)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   return &cvbr->cell->vertex_info;
+}
+
+
+static void *
+cell_vbuf_allocate_vertices(struct vbuf_render *vbr,
+                            ushort vertex_size, ushort nr_vertices)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/
+
+   assert(cvbr->vertex_buf == ~0);
+   cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell);
+   cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf];
+   cvbr->vertex_size = vertex_size;
+   return cvbr->vertex_buffer;
+}
+
+
+static void
+cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, 
+                           unsigned vertex_size, unsigned vertices_used)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   struct cell_context *cell = cvbr->cell;
+
+   /*
+   printf("%s vertex_buf = %u  count = %u\n",
+          __FUNCTION__, cvbr->vertex_buf, vertices_used);
+   */
+
+   /* Tell SPUs they can release the vert buf */
+   if (cvbr->vertex_buf != ~0U) {
+      struct cell_command_release_verts *release
+         = (struct cell_command_release_verts *)
+         cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
+      release->opcode = CELL_CMD_RELEASE_VERTS;
+      release->vertex_buf = cvbr->vertex_buf;
+   }
+
+   cvbr->vertex_buf = ~0;
+   cell_flush_int(cell, 0x0);
+
+   assert(vertices == cvbr->vertex_buffer);
+   cvbr->vertex_buffer = NULL;
+}
+
+
+
+static boolean
+cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   cvbr->prim = prim;
+   /*printf("cell_set_prim %u\n", prim);*/
+   return TRUE;
+}
+
+
+static void
+cell_vbuf_draw(struct vbuf_render *vbr,
+	       const ushort *indices,
+               uint nr_indices)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   struct cell_context *cell = cvbr->cell;
+   float xmin, ymin, xmax, ymax;
+   uint i;
+   uint nr_vertices = 0, min_index = ~0;
+   const void *vertices = cvbr->vertex_buffer;
+   const uint vertex_size = cvbr->vertex_size;
+
+   for (i = 0; i < nr_indices; i++) {
+      if (indices[i] > nr_vertices)
+         nr_vertices = indices[i];
+      if (indices[i] < min_index)
+         min_index = indices[i];
+   }
+   nr_vertices++;
+
+#if 0
+   /*if (min_index > 0)*/
+      printf("%s min_index = %u\n", __FUNCTION__, min_index);
+#endif
+
+#if 0
+   printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n",
+          nr_indices, nr_vertices);
+   printf("  ");
+   for (i = 0; i < nr_indices; i += 3) {
+      printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]);
+   }
+   printf("\n");
+#elif 0
+   printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u  indexes = [%u %u %u ...]\n",
+          nr_indices, nr_vertices,
+          indices[0], indices[1], indices[2]);
+   printf("ind space = %u, vert space = %u, space = %u\n",
+          nr_indices * 2,
+          nr_vertices * 4 * cell->vertex_info.size,
+          cell_batch_free_space(cell));
+#endif
+
+   /* compute x/y bounding box */
+   xmin = ymin = 1e50;
+   xmax = ymax = -1e50;
+   for (i = min_index; i < nr_vertices; i++) {
+      const float *v = (float *) ((ubyte *) vertices + i * vertex_size);
+      if (v[0] < xmin)
+         xmin = v[0];
+      if (v[0] > xmax)
+         xmax = v[0];
+      if (v[1] < ymin)
+         ymin = v[1];
+      if (v[1] > ymax)
+         ymax = v[1];
+   }
+#if 0
+   printf("PPU Bounds %g, %g .. %g, %g\n", xmin, ymin, xmax, ymax);
+   fflush(stdout);
+#endif
+
+   if (cvbr->prim != PIPE_PRIM_TRIANGLES)
+      return; /* only render tris for now */
+
+   /* build/insert batch RENDER command */
+   {
+      const uint index_bytes = ROUNDUP8(nr_indices * 2);
+      const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
+      const uint batch_size = sizeof(struct cell_command_render) + index_bytes;
+
+      struct cell_command_render *render
+         = (struct cell_command_render *)
+         cell_batch_alloc(cell, batch_size);
+
+      render->opcode = CELL_CMD_RENDER;
+      render->prim_type = cvbr->prim;
+      render->front_winding = cell->rasterizer->front_winding;
+
+      render->num_indexes = nr_indices;
+      render->min_index = min_index;
+
+      /* append indices after render command */
+      memcpy(render + 1, indices, nr_indices * 2);
+
+      /* if there's room, append vertices after the indices, else leave
+       * vertices in the original/separate buffer.
+       */
+      render->vertex_size = 4 * cell->vertex_info.size;
+      render->num_verts = nr_vertices;
+      if (ALLOW_INLINE_VERTS &&
+          min_index == 0 &&
+          vertex_bytes + 16 <= cell_batch_free_space(cell)) {
+         /* vertex data inlined, after indices, at 16-byte boundary */
+         void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16);
+         memcpy(dst, vertices, vertex_bytes);
+         render->inline_verts = TRUE;
+         render->vertex_buf = ~0;
+      }
+      else {
+         /* vertex data in separate buffer */
+         render->inline_verts = FALSE;
+         ASSERT(cvbr->vertex_buf >= 0);
+         render->vertex_buf = cvbr->vertex_buf;
+      }
+
+      render->xmin = xmin;
+      render->ymin = ymin;
+      render->xmax = xmax;
+      render->ymax = ymax;
+   }
+
+#if 0
+   /* helpful for debug */
+   cell_flush_int(cell, CELL_FLUSH_WAIT);
+#endif
+}
+
+
+static void
+cell_vbuf_destroy(struct vbuf_render *vbr)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   cvbr->cell->vbuf_render = NULL;
+   FREE(cvbr);
+}
+
+
+/**
+ * Initialize the post-transform vertex buffer information for the given
+ * context.
+ */
+void
+cell_init_vbuf(struct cell_context *cell)
+{
+   assert(cell->draw);
+
+   cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render);
+
+   /* The max number of indexes is what can fix into a batch buffer,
+    * minus the render and release-verts commands.
+    */
+   cell->vbuf_render->base.max_indices
+      = (CELL_BUFFER_SIZE
+         - sizeof(struct cell_command_render)
+         - sizeof(struct cell_command_release_verts))
+      / sizeof(ushort);
+   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE;
+
+   cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info;
+   cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices;
+   cell->vbuf_render->base.set_primitive = cell_vbuf_set_primitive;
+   cell->vbuf_render->base.draw = cell_vbuf_draw;
+   cell->vbuf_render->base.release_vertices = cell_vbuf_release_vertices;
+   cell->vbuf_render->base.destroy = cell_vbuf_destroy;
+
+   cell->vbuf_render->cell = cell;
+#if 1
+   cell->vbuf_render->vertex_buf = ~0;
+#endif
+
+   cell->vbuf = draw_vbuf_stage(cell->draw, &cell->vbuf_render->base);
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.h b/src/gallium/drivers/cell/ppu/cell_vbuf.h
new file mode 100644
index 0000000000..d265cbf770
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_VBUF_H
+#define CELL_VBUF_H
+
+
+struct cell_context;
+
+extern void
+cell_init_vbuf(struct cell_context *cell);
+
+
+#endif /* CELL_VBUF_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
new file mode 100644
index 0000000000..18969005b0
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -0,0 +1,343 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <inttypes.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_format.h"
+
+#include "../auxiliary/draw/draw_context.h"
+#include "../auxiliary/draw/draw_private.h"
+
+#include "cell_context.h"
+#include "rtasm/rtasm_ppc_spe.h"
+
+
+/**
+ * Emit a 4x4 matrix transpose operation
+ *
+ * \param p         Function that the transpose operation is to be appended to
+ * \param row0      Register containing row 0 of the source matrix
+ * \param row1      Register containing row 1 of the source matrix
+ * \param row2      Register containing row 2 of the source matrix
+ * \param row3      Register containing row 3 of the source matrix
+ * \param dest_ptr  Register containing the address of the destination matrix
+ * \param shuf_ptr  Register containing the address of the shuffled data
+ * \param count     Number of colums to actually be written to the destination
+ *
+ * \note
+ * This function assumes that the registers named by \c row0, \c row1,
+ * \c row2, and \c row3 are scratch and can be modified by the generated code.
+ * Furthermore, these registers will be released, via calls to
+ * \c release_register, by this function.
+ * 
+ * \note
+ * This function requires that four temporary are available on entry.
+ */
+static void
+emit_matrix_transpose(struct spe_function *p,
+		      unsigned row0, unsigned row1, unsigned row2,
+		      unsigned row3, unsigned dest_ptr,
+		      unsigned shuf_ptr, unsigned count)
+{
+   int shuf_hi = spe_allocate_available_register(p);
+   int shuf_lo = spe_allocate_available_register(p);
+   int t1 = spe_allocate_available_register(p);
+   int t2 = spe_allocate_available_register(p);
+   int t3;
+   int t4;
+   int col0;
+   int col1;
+   int col2;
+   int col3;
+
+
+   spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+   spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
+   spe_shufb(p, t1, row0, row2, shuf_hi);
+   spe_shufb(p, t2, row0, row2, shuf_lo);
+
+
+   /* row0 and row2 are now no longer needed.  Re-use those registers as
+    * temporaries.
+    */
+   t3 = row0;
+   t4 = row2;
+
+   spe_shufb(p, t3, row1, row3, shuf_hi);
+   spe_shufb(p, t4, row1, row3, shuf_lo);
+
+
+   /* row1 and row3 are now no longer needed.  Re-use those registers as
+    * temporaries.
+    */
+   col0 = row1;
+   col1 = row3;
+
+   spe_shufb(p, col0, t1, t3, shuf_hi);
+   if (count > 1) {
+      spe_shufb(p, col1, t1, t3, shuf_lo);
+   }
+
+   /* t1 and t3 are now no longer needed.  Re-use those registers as
+    * temporaries.
+    */
+   col2 = t1;
+   col3 = t3;
+
+   if (count > 2) {
+      spe_shufb(p, col2, t2, t4, shuf_hi);
+   }
+
+   if (count > 3) {
+      spe_shufb(p, col3, t2, t4, shuf_lo);
+   }
+
+
+   /* Store the results.  Remember that the stqd instruction is encoded using
+    * the qword offset (stand-alone assemblers to the byte-offset to
+    * qword-offset conversion for you), so the byte-offset needs be divided by
+    * 16.
+    */
+   switch (count) {
+   case 4:
+      spe_stqd(p, col3, dest_ptr, 3 * 16);
+   case 3:
+      spe_stqd(p, col2, dest_ptr, 2 * 16);
+   case 2:
+      spe_stqd(p, col1, dest_ptr, 1 * 16);
+   case 1:
+      spe_stqd(p, col0, dest_ptr, 0 * 16);
+   }
+
+
+   /* Release all of the temporary registers used.
+    */
+   spe_release_register(p, col0);
+   spe_release_register(p, col1);
+   spe_release_register(p, col2);
+   spe_release_register(p, col3);
+   spe_release_register(p, shuf_hi);
+   spe_release_register(p, shuf_lo);
+   spe_release_register(p, t2);
+   spe_release_register(p, t4);
+}
+
+
+static void
+emit_fetch(struct spe_function *p,
+	   unsigned in_ptr, unsigned *offset,
+	   unsigned out_ptr, unsigned shuf_ptr,
+	   enum pipe_format format)
+{
+   const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0)
+       + (pf_size_z(format) != 0) + (pf_size_w(format) != 0);
+   const unsigned type = pf_type(format);
+   const unsigned bytes = pf_size_x(format);
+
+   int v0 = spe_allocate_available_register(p);
+   int v1 = spe_allocate_available_register(p);
+   int v2 = spe_allocate_available_register(p);
+   int v3 = spe_allocate_available_register(p);
+   int tmp = spe_allocate_available_register(p);
+   int float_zero = -1;
+   int float_one = -1;
+   float scale_signed = 0.0;
+   float scale_unsigned = 0.0;
+
+   spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+   spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+   spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+   spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
+   offset[0] += 4;
+   
+   switch (bytes) {
+   case 1:
+      scale_signed = 1.0f / 127.0f;
+      scale_unsigned = 1.0f / 255.0f;
+      spe_lqd(p, tmp, shuf_ptr, 1 * 16);
+      spe_shufb(p, v0, v0, v0, tmp);
+      spe_shufb(p, v1, v1, v1, tmp);
+      spe_shufb(p, v2, v2, v2, tmp);
+      spe_shufb(p, v3, v3, v3, tmp);
+      break;
+   case 2:
+      scale_signed = 1.0f / 32767.0f;
+      scale_unsigned = 1.0f / 65535.0f;
+      spe_lqd(p, tmp, shuf_ptr, 2 * 16);
+      spe_shufb(p, v0, v0, v0, tmp);
+      spe_shufb(p, v1, v1, v1, tmp);
+      spe_shufb(p, v2, v2, v2, tmp);
+      spe_shufb(p, v3, v3, v3, tmp);
+      break;
+   case 4:
+      scale_signed = 1.0f / 2147483647.0f;
+      scale_unsigned = 1.0f / 4294967295.0f;
+      break;
+   default:
+      assert(0);
+      break;
+   }
+
+   switch (type) {
+   case PIPE_FORMAT_TYPE_FLOAT:
+      break;
+   case PIPE_FORMAT_TYPE_UNORM:
+      spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16);
+      spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff);
+      spe_cuflt(p, v0, v0, 0);
+      spe_fm(p, v0, v0, tmp);
+      break;
+   case PIPE_FORMAT_TYPE_SNORM:
+      spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16);
+      spe_iohl(p, tmp, ((unsigned) scale_signed) & 0x0ffff);
+      spe_csflt(p, v0, v0, 0);
+      spe_fm(p, v0, v0, tmp);
+      break;
+   case PIPE_FORMAT_TYPE_USCALED:
+      spe_cuflt(p, v0, v0, 0);
+      break;
+   case PIPE_FORMAT_TYPE_SSCALED:
+      spe_csflt(p, v0, v0, 0);
+      break;
+   }
+
+
+   if (count < 4) {
+      float_one = spe_allocate_available_register(p);
+      spe_il(p, float_one, 1);
+      spe_cuflt(p, float_one, float_one, 0);
+      
+      if (count < 3) {
+	 float_zero = spe_allocate_available_register(p);
+	 spe_il(p, float_zero, 0);
+      }
+   }
+
+   spe_release_register(p, tmp);
+
+   emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
+
+   switch (count) {
+   case 1:
+      spe_stqd(p, float_zero, out_ptr, 1 * 16);
+   case 2:
+      spe_stqd(p, float_zero, out_ptr, 2 * 16);
+   case 3:
+      spe_stqd(p, float_one, out_ptr, 3 * 16);
+   }
+
+   if (float_zero != -1) {
+      spe_release_register(p, float_zero);
+   }
+
+   if (float_one != -1) {
+      spe_release_register(p, float_one);
+   }
+}
+
+
+void cell_update_vertex_fetch(struct draw_context *draw)
+{
+#if 0
+   struct cell_context *const cell =
+       (struct cell_context *) draw->driver_private;
+   struct spe_function *p = &cell->attrib_fetch;
+   unsigned function_index[PIPE_MAX_ATTRIBS];
+   unsigned unique_attr_formats;
+   int out_ptr;
+   int in_ptr;
+   int shuf_ptr;
+   unsigned i;
+   unsigned j;
+
+
+   /* Determine how many unique input attribute formats there are.  At the
+    * same time, store the index of the lowest numbered attribute that has
+    * the same format as any non-unique format.
+    */
+   unique_attr_formats = 1;
+   function_index[0] = 0;
+   for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) {
+      const enum pipe_format curr_fmt = draw->vertex_element[i].src_format;
+
+      for (j = 0; j < i; j++) {
+	 if (curr_fmt == draw->vertex_element[j].src_format) {
+	    break;
+	 }
+      }
+      
+      if (j == i) {
+	 unique_attr_formats++;
+      }
+
+      function_index[i] = j;
+   }
+
+
+   /* Each fetch function can be a maximum of 34 instructions (note: this is
+    * actually a slight over-estimate).
+    */
+   spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats);
+
+
+   /* Allocate registers for the function's input parameters.
+    */
+   out_ptr = spe_allocate_register(p, 3);
+   in_ptr = spe_allocate_register(p, 4);
+   shuf_ptr = spe_allocate_register(p, 5);
+
+
+   /* Generate code for the individual attribute fetch functions.
+    */
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      unsigned offset;
+
+      if (function_index[i] == i) {
+	 cell->attrib_fetch_offsets[i] = (unsigned) ((void *) p->csr 
+						     - (void *) p->store);
+
+	 offset = 0;
+	 emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr,
+		    draw->vertex_element[i].src_format);
+	 spe_bi(p, 0, 0, 0);
+
+	 /* Round up to the next 16-byte boundary.
+	  */
+	 if ((((unsigned) p->store) & 0x0f) != 0) {
+	    const unsigned align = ((unsigned) p->store) & 0x0f;
+	    p->store = (uint32_t *) (((void *) p->store) + align);
+	 }
+      } else {
+	 /* Use the same function entry-point as a previously seen attribute
+	  * with the same format.
+	  */
+	 cell->attrib_fetch_offsets[i] = 
+	     cell->attrib_fetch_offsets[function_index[i]];
+      }
+   }
+#else
+   assert(0);
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
new file mode 100644
index 0000000000..2b10c116fa
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -0,0 +1,146 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file cell_vertex_shader.c
+ * Vertex shader interface routines for Cell.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_winsys.h"
+#include "util/u_math.h"
+
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_flush.h"
+#include "cell_spu.h"
+#include "cell_batch.h"
+
+#include "cell/common.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+
+/**
+ * Run the vertex shader on all vertices in the vertex queue.
+ * Called by the draw module when the vertx cache needs to be flushed.
+ */
+void
+cell_vertex_shader_queue_flush(struct draw_context *draw)
+{
+#if 0
+   struct cell_context *const cell =
+       (struct cell_context *) draw->driver_private;
+   struct cell_command_vs *const vs = &cell_global.command[0].vs;
+   uint64_t *batch;
+   struct cell_array_info *array_info;
+   unsigned i, j;
+   struct cell_attribute_fetch_code *cf;
+
+   assert(draw->vs.queue_nr != 0);
+
+   /* XXX: do this on statechange: 
+    */
+   draw_update_vertex_fetch(draw);
+   cell_update_vertex_fetch(draw);
+
+
+   batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*cf));
+   batch[0] = CELL_CMD_STATE_ATTRIB_FETCH;
+   cf = (struct cell_attribute_fetch_code *) (&batch[1]);
+   cf->base = (uint64_t) cell->attrib_fetch.store;
+   cf->size = ROUNDUP16((unsigned)((void *) cell->attrib_fetch.csr 
+				   - (void *) cell->attrib_fetch.store));
+
+
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      const enum pipe_format format = draw->vertex_element[i].src_format;
+      const unsigned count = ((pf_size_x(format) != 0)
+			      + (pf_size_y(format) != 0)
+			      + (pf_size_z(format) != 0)
+			      + (pf_size_w(format) != 0));
+      const unsigned size = pf_size_x(format) * count;
+
+      batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info));
+
+      batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO;
+
+      array_info = (struct cell_array_info *) &batch[1];
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info->attr = i;
+      array_info->pitch = draw->vertex_fetch.pitch[i];
+      array_info->size = size;
+      array_info->function_offset = cell->attrib_fetch_offsets[i];
+   }
+
+   batch = cell_batch_alloc(cell, sizeof(batch[0])
+                            + sizeof(struct pipe_viewport_state));
+   batch[0] = CELL_CMD_STATE_VIEWPORT;
+   (void) memcpy(&batch[1], &draw->viewport,
+                 sizeof(struct pipe_viewport_state));
+
+   {
+      uint64_t uniforms = (uintptr_t) draw->user.constants;
+
+      batch = cell_batch_alloc(cell, 2 *sizeof(batch[0]));
+      batch[0] = CELL_CMD_STATE_UNIFORMS;
+      batch[1] = uniforms;
+   }
+
+   cell_batch_flush(cell);
+
+   vs->opcode = CELL_CMD_VS_EXECUTE;
+   vs->nr_attrs = draw->vertex_fetch.nr_attrs;
+
+   (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane));
+   vs->nr_planes = draw->nr_planes;
+
+   for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) {
+      const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i);
+
+      for (j = 0; j < n; j++) {
+         vs->elts[j] = draw->vs.queue[i + j].elt;
+         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex;
+      }
+
+      for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) {
+         vs->elts[j] = vs->elts[0];
+         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex;
+      }
+
+      vs->num_elts = n;
+      send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE);
+
+      cell_flush_int(cell, CELL_FLUSH_WAIT);
+   }
+
+   draw->vs.post_nr = draw->vs.queue_nr;
+   draw->vs.queue_nr = 0;
+#else
+   assert(0);
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.c b/src/gallium/drivers/cell/ppu/cell_winsys.c
new file mode 100644
index 0000000000..d570bbd2f9
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_winsys.c
@@ -0,0 +1,40 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "cell_winsys.h"
+
+
+struct cell_winsys *
+cell_get_winsys(uint format)
+{
+   struct cell_winsys *cws = CALLOC_STRUCT(cell_winsys);
+   if (cws)
+      cws->preferredFormat = format;
+   return cws;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.h b/src/gallium/drivers/cell/ppu/cell_winsys.h
new file mode 100644
index 0000000000..ae2af5696b
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_winsys.h
@@ -0,0 +1,50 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_WINSYS_H
+#define CELL_WINSYS_H
+
+#include "pipe/p_compiler.h"
+
+
+/**
+ * Very simple winsys at this time.
+ * Will probably eventually add SPU control info.
+ */
+struct cell_winsys
+{
+   uint preferredFormat;
+};
+
+
+extern struct cell_winsys *
+cell_get_winsys(uint format);
+
+
+
+#endif
diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore
new file mode 100644
index 0000000000..2be9a2d324
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/.gitignore
@@ -0,0 +1 @@
+g3d_spu
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
new file mode 100644
index 0000000000..116453b79c
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -0,0 +1,82 @@
+# Gallium3D Cell driver: SPU code
+
+# This makefile builds the g3d_spu.a file that's linked into the
+# PPU code/library.
+
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+
+PROG = g3d
+
+PROG_SPU = $(PROG)_spu
+PROG_SPU_A = $(PROG)_spu.a
+PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
+
+
+SOURCES = \
+	spu_command.c \
+	spu_dcache.c \
+	spu_funcs.c \
+	spu_main.c \
+	spu_per_fragment_op.c \
+	spu_render.c \
+	spu_texture.c \
+	spu_tile.c \
+	spu_tri.c
+
+OLD_SOURCES = \
+	spu_exec.c \
+	spu_util.c \
+	spu_vertex_fetch.c \
+	spu_vertex_shader.c
+
+
+SPU_OBJECTS = $(SOURCES:.c=.o) \
+
+SPU_ASM_OUT = $(SOURCES:.c=.s) \
+
+INCLUDE_DIRS = \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/gallium/drivers
+
+
+.c.o:
+	$(SPU_CC) $(SPU_CFLAGS) -c $<
+
+.c.s:
+	$(SPU_CC) $(SPU_CFLAGS) -O3 -S $<
+
+
+# The .a file will be linked into the main/PPU executable
+default: $(PROG_SPU_A)
+
+$(PROG_SPU_A): $(PROG_SPU_EMBED_O)
+	$(SPU_AR) $(SPU_AR_FLAGS) $(PROG_SPU_A) $(PROG_SPU_EMBED_O)
+
+$(PROG_SPU_EMBED_O): $(PROG_SPU)
+	$(SPU_EMBED) $(SPU_EMBED_FLAGS) $(PROG_SPU) $(PROG_SPU) $(PROG_SPU_EMBED_O)
+
+$(PROG_SPU): $(SPU_OBJECTS)
+	$(SPU_CC) -o $(PROG_SPU) $(SPU_OBJECTS) $(SPU_LFLAGS)
+
+
+
+asmfiles: $(SPU_ASM_OUT)
+
+
+clean:
+	rm -f *~ *.o *.a *.d *.s $(PROG_SPU)
+
+
+
+depend: $(SOURCES)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null
+
+include depend
+
diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
new file mode 100644
index 0000000000..fd8dc6ded3
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -0,0 +1,110 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef SPU_COLORPACK_H
+#define SPU_COLORPACK_H
+
+
+#include <spu_intrinsics.h>
+
+
+static INLINE unsigned int
+spu_pack_R8G8B8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  0, 4, 8, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0 }) );
+
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_A8R8G8B8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  12, 0, 4, 8, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_B8G8R8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  8, 4, 0, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, shuffle);
+  return spu_extract(out, 0);
+}
+
+
+static INLINE vector float
+spu_unpack_B8G8R8A8(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          ((vector unsigned char) {
+                             10, 10, 10, 10,
+                             5, 5, 5, 5,
+                             0, 0, 0, 0,
+                             15, 15, 15, 15}) );
+   return spu_convtf(color_u4, 32);
+}
+
+
+static INLINE vector float
+spu_unpack_A8R8G8B8(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          ((vector unsigned char) {
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15,
+                             0, 0, 0, 0}) );
+
+   return spu_convtf(color_u4, 32);
+}
+
+
+#endif /* SPU_COLORPACK_H */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
new file mode 100644
index 0000000000..91a4c137e7
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -0,0 +1,621 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU command processing code
+ */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "spu_debug.h"
+#include "cell/common.h"
+
+
+struct spu_vs_context draw;
+
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
+    ALIGN16_ATTRIB;
+
+
+
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writting a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const uint status[4] ALIGN16_ATTRIB
+      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
+
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+static void
+cmd_clear_surface(const struct cell_command_clear_surface *clear)
+{
+   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+
+   if (clear->surface == 0) {
+      spu.fb.color_clear_value = clear->value;
+      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+            (spu.init.id << 20) | (spu.init.id << 28);
+         spu.fb.color_clear_value ^= x;
+      }
+   }
+   else {
+      spu.fb.depth_clear_value = clear->value;
+   }
+
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+   /* Simply set all tiles' status to CLEAR.
+    * When we actually begin rendering into a tile, we'll initialize it to
+    * the clear value.  If any tiles go untouched during the frame,
+    * really_clear_tiles() will set them to the clear value.
+    */
+   if (clear->surface == 0) {
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
+   }
+   else {
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
+   }
+
+#else
+
+   /*
+    * This path clears the whole framebuffer to the clear color right now.
+    */
+
+   /*
+   printf("SPU: %s num=%d w=%d h=%d\n",
+          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
+   */
+
+   /* init a single tile to the clear value */
+   if (clear->surface == 0) {
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+   }
+
+   /* walk over my tiles, writing the 'clear' tile's data */
+   {
+      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+      uint i;
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (clear->surface == 0)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         else
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+      wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   }
+
+#endif /* CLEAR_OPT */
+
+   DEBUG_PRINTF("CLEAR SURF done\n");
+}
+
+
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
+   release_buffer(release->vertex_buf);
+}
+
+
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   static int warned = 0;
+
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Copy state info (for fallback case only) */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
+   /* Parity twist!  For now, always use the fallback code by default,
+    * only switching to codegen when specifically requested.  This
+    * allows us to develop freely without risking taking down the
+    * branch.
+    *
+    * Later, the parity of this check will be reversed, so that
+    * codegen is *always* used, unless we specifically indicate that
+    * we don't want it.
+    *
+    * Eventually, the option will be removed completely, because in
+    * final code we'll always use codegen and won't even provide the
+    * raw state records that the fallback code requires.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
+      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   }
+   else {
+      /* otherwise, the default fallback code remains in place */
+      if (!warned) {
+         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+         warned = 1;
+      }
+   }
+
+   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+}
+
+
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_program_code, fp->code,
+          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+   /* Point function pointer at new code */
+   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+   const uint num_const = buffer[pos + 1];
+   const float *constants = (const float *) &buffer[pos + 2];
+   uint i;
+
+   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+
+   /* Expand each float to float[4] for SOA execution */
+   for (i = 0; i < num_const; i++) {
+      spu.constants[i] = spu_splats(constants[i]);
+   }
+
+   /* return new buffer pos (in 8-byte words) */
+   return pos + 2 + num_const / 2;
+}
+
+
+static void
+cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
+{
+   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+             cmd->width,
+             cmd->height,
+             cmd->color_start,
+             cmd->color_format,
+             cmd->depth_format);
+
+   ASSERT_ALIGN16(cmd->color_start);
+   ASSERT_ALIGN16(cmd->depth_start);
+
+   spu.fb.color_start = cmd->color_start;
+   spu.fb.depth_start = cmd->depth_start;
+   spu.fb.color_format = cmd->color_format;
+   spu.fb.depth_format = cmd->depth_format;
+   spu.fb.width = cmd->width;
+   spu.fb.height = cmd->height;
+   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
+   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
+
+   switch (spu.fb.depth_format) {
+   case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
+      break;
+   default:
+      spu.fb.zsize = 0;
+      break;
+   }
+}
+
+
+static void
+cmd_state_sampler(const struct cell_command_sampler *sampler)
+{
+   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+
+   spu.sampler[sampler->unit] = sampler->state;
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
+   else
+      spu.sample_texture[sampler->unit] = sample_texture_nearest;
+}
+
+
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   const uint unit = texture->unit;
+   const uint width = texture->width;
+   const uint height = texture->height;
+
+   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
+             texture->unit, texture->start,
+             texture->width, texture->height);
+
+   spu.texture[unit].start = texture->start;
+   spu.texture[unit].width = width;
+   spu.texture[unit].height = height;
+
+   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+
+   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
+   spu.texture[unit].tex_size_mask = (vector unsigned int)
+         { width - 1, height - 1, 0, 0 };
+   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
+   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+}
+
+
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
+   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
+}
+
+
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned attr = vs_info->attr;
+
+   ASSERT(attr < PIPE_MAX_ATTRIBS);
+   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+   draw.vertex_fetch.size[attr] = vs_info->size;
+   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
+   draw.vertex_fetch.dirty = 1;
+}
+
+
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
+static void
+cmd_finish(void)
+{
+   DEBUG_PRINTF("FINISH\n");
+   really_clear_tiles(0);
+   /* wait for all outstanding DMAs to finish */
+   mfc_write_tag_mask(~0);
+   mfc_read_tag_status_all();
+   /* send mbox message to PPU */
+   spu_write_out_mbox(CELL_CMD_FINISH);
+}
+
+
+/**
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
+ * The opcode param encodes the location of the buffer and its size.
+ */
+static void
+cmd_batch(uint opcode)
+{
+   const uint buf = (opcode >> 8) & 0xff;
+   uint size = (opcode >> 16);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
+   uint pos;
+
+   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+             buf, size, spu.init.buffers[buf]);
+
+   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   size = ROUNDUP16(size);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   mfc_get(buffer,  /* dest */
+           (unsigned int) spu.init.buffers[buf],  /* src */
+           size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   /* Tell PPU we're done copying the buffer to local store */
+   DEBUG_PRINTF("release batch buf %u\n", buf);
+   release_buffer(buf);
+
+   /*
+    * Loop over commands in the batch buffer
+    */
+   for (pos = 0; pos < usize; /* no incr */) {
+      switch (buffer[pos]) {
+      /*
+       * rendering commands
+       */
+      case CELL_CMD_CLEAR_SURFACE:
+         {
+            struct cell_command_clear_surface *clr
+               = (struct cell_command_clear_surface *) &buffer[pos];
+            cmd_clear_surface(clr);
+            pos += sizeof(*clr) / 8;
+         }
+         break;
+      case CELL_CMD_RENDER:
+         {
+            struct cell_command_render *render
+               = (struct cell_command_render *) &buffer[pos];
+            uint pos_incr;
+            cmd_render(render, &pos_incr);
+            pos += pos_incr;
+         }
+         break;
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
+         {
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            pos += sizeof(*fops) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+         {
+            struct cell_command_fragment_program *fp
+               = (struct cell_command_fragment_program *) &buffer[pos];
+            cmd_state_fragment_program(fp);
+            pos += sizeof(*fp) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FS_CONSTANTS:
+         pos = cmd_state_fs_constants(buffer, pos);
+         break;
+      case CELL_CMD_STATE_SAMPLER:
+         {
+            struct cell_command_sampler *sampler
+               = (struct cell_command_sampler *) &buffer[pos];
+            cmd_state_sampler(sampler);
+            pos += sizeof(*sampler) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_TEXTURE:
+         {
+            struct cell_command_texture *texture
+               = (struct cell_command_texture *) &buffer[pos];
+            cmd_state_texture(texture);
+            pos += sizeof(*texture) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_VERTEX_INFO:
+         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         break;
+      case CELL_CMD_STATE_UNIFORMS:
+         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+         pos += 2;
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+         break;
+      case CELL_CMD_STATE_BIND_VS:
+#if 0
+         spu_bind_vertex_shader(&draw,
+                                (struct cell_shader_info *) &buffer[pos+1]);
+#endif
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         break;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         break;
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
+         break;
+      case CELL_CMD_FLUSH_BUFFER_RANGE: {
+	 struct cell_buffer_range *br = (struct cell_buffer_range *)
+	     &buffer[pos+1];
+
+	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+	 break;
+      }
+      default:
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+         ASSERT(0);
+         break;
+      }
+   }
+
+   DEBUG_PRINTF("BATCH complete\n");
+}
+
+
+
+/**
+ * Main loop for SPEs: Get a command, execute it, repeat.
+ */
+void
+command_loop(void)
+{
+   struct cell_command cmd;
+   int exitFlag = 0;
+
+   DEBUG_PRINTF("Enter command loop\n");
+
+   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
+   ASSERT_ALIGN16(&cmd);
+
+   while (!exitFlag) {
+      unsigned opcode;
+      int tag = 0;
+
+      DEBUG_PRINTF("Wait for cmd...\n");
+
+      /* read/wait from mailbox */
+      opcode = (unsigned int) spu_read_in_mbox();
+
+      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
+
+      /* command payload */
+      mfc_get(&cmd,  /* dest */
+              (unsigned int) spu.init.cmd, /* src */
+              sizeof(struct cell_command), /* bytes */
+              tag,
+              0, /* tid */
+              0  /* rid */);
+      wait_on_mask( 1 << tag );
+
+      /*
+       * NOTE: most commands should be contained in a batch buffer
+       */
+
+      switch (opcode & CELL_CMD_OPCODE_MASK) {
+      case CELL_CMD_EXIT:
+         DEBUG_PRINTF("EXIT\n");
+         exitFlag = 1;
+         break;
+      case CELL_CMD_VS_EXECUTE:
+#if 0
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
+         break;
+      case CELL_CMD_BATCH:
+         cmd_batch(opcode);
+         break;
+      default:
+         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
+      }
+
+   }
+
+   DEBUG_PRINTF("Exit command loop\n");
+
+   spu_dcache_report();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
new file mode 100644
index 0000000000..853e9aa549
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -0,0 +1,7 @@
+
+
+
+extern void
+command_loop(void);
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
new file mode 100644
index 0000000000..167404cdc5
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -0,0 +1,143 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_dcache.h"
+
+#define CACHELINE_LOG2SIZE    7
+#define LINE_SIZE             (1U << 7)
+#define ALIGN_MASK            (~(LINE_SIZE - 1))
+
+#define CACHE_NAME            data
+#define CACHED_TYPE           qword
+#define CACHE_TYPE            CACHE_TYPE_RO
+#define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
+#define CACHE_LOG2NNWAY       2
+#define CACHE_LOG2NSETS       6
+/*#define CACHE_STATS           1*/
+#include <cache-api.h>
+
+/* Yes folks, this is ugly.
+ */
+#undef CACHE_NWAY
+#undef CACHE_NSETS
+#define CACHE_NAME            data
+#define CACHE_NWAY            4
+#define CACHE_NSETS           (1U << 6)
+
+
+/**
+ * Fetch between arbitrary number of bytes from an unaligned address
+ *
+ * \param dst   Destination data buffer
+ * \param ea    Main memory effective address of source data
+ * \param size  Number of bytes to read
+ *
+ * \warning
+ * As is hinted by the type of the \c dst pointer, this function writes
+ * multiples of 16-bytes.
+ */
+void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size)
+{
+   const int shift = ea & 0x0f;
+   const unsigned read_size = ROUNDUP16(size + shift);
+   const unsigned last_read = ROUNDUP16(ea + size);
+   const qword *const last_write = dst + (ROUNDUP16(size) / 16);
+   unsigned i;
+
+
+   if (shift == 0) {
+      /* Data is already aligned.  Fetch directly into the destination buffer.
+       */
+      for (i = 0; i < size; i += 16) {
+         *(dst++) = cache_rd(data, ea + i);
+      }
+   } else {
+      qword hi;
+
+
+      /* Please exercise extreme caution when modifying this code.  This code
+       * must not read past the end of the page containing the source data,
+       * and it must not write more than ((size + 15) / 16) qwords to the
+       * destination buffer.
+       */
+      ea &= ~0x0f;
+      hi = cache_rd(data, ea);
+      for (i = 16; i < read_size; i += 16) {
+         qword lo = cache_rd(data, ea + i);
+
+         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift),
+                          (qword) spu_rlmaskqwbyte(lo, shift - 16));
+         hi = lo;
+      }
+
+      if (dst != last_write) {
+         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), si_il(0));
+      }
+   }
+   
+   ASSERT((ea + i) == last_read);
+   ASSERT(dst == last_write);
+}
+
+
+/**
+ * Notify the cache that a range of main memory may have been modified
+ */
+void
+spu_dcache_mark_dirty(unsigned ea, unsigned size)
+{
+   unsigned i;
+   const unsigned aligned_start = (ea & ALIGN_MASK);
+   const unsigned aligned_end = (ea + size + (LINE_SIZE - 1)) 
+       & ALIGN_MASK;
+
+
+   for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) {
+      const unsigned entry = __cache_dir[i];
+      const unsigned addr = entry & ~0x0f;
+
+      __cache_dir[i] = ((addr >= aligned_start) && (addr < aligned_end))
+          ? (entry & ~CACHELINE_VALID) : entry;
+   }
+}
+
+
+/**
+ * Print cache utilization report
+ */
+void
+spu_dcache_report(void)
+{
+#ifdef CACHE_STATS
+   if (spu.init.id == 0) {
+      printf("SPU 0: Texture cache report:\n");
+      cache_pr_stats(data);
+   }
+#endif
+}
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h
new file mode 100644
index 0000000000..39a19eb31b
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.h
@@ -0,0 +1,37 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SPU_DCACHE_H
+#define SPU_DCACHE_H
+
+extern void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size);
+
+extern void
+spu_dcache_mark_dirty(unsigned ea, unsigned size);
+
+extern void
+spu_dcache_report(void);
+
+#endif /* SPU_DCACHE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
new file mode 100644
index 0000000000..eeec052655
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -0,0 +1,60 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_DEBUG_H
+#define SPU_DEBUG_H
+
+
+/* Set to 0 to disable all extraneous debugging code */
+#define DEBUG 1
+
+#if DEBUG
+extern boolean Debug;
+extern boolean force_fragment_ops_fallback;
+
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define DEBUG_PRINTF(format,...) \
+   if (Debug) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+
+#else
+
+#define DEBUG_PRINTF(...)
+#define D_PRINTF(...)
+
+#endif
+
+
+#endif /* SPU_DEBUG_H */
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
new file mode 100644
index 0000000000..e27df2dfb3
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -0,0 +1,1933 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI interpretor/executor.
+ *
+ * Flow control information:
+ *
+ * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+ * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+ * care since a condition may be true for some quad components but false
+ * for other components.
+ *
+ * We basically execute all statements (even if they're in the part of
+ * an IF/ELSE clause that's "not taken") and use a special mask to
+ * control writing to destination registers.  This is the ExecMask.
+ * See store_dest().
+ *
+ * The ExecMask is computed from three other masks (CondMask, LoopMask and
+ * ContMask) which are controlled by the flow control instructions (namely:
+ * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
+ *
+ *
+ * Authors:
+ *   Michal Krol
+ *   Brian Paul
+ */
+
+#include <transpose_matrix4x4.h>
+#include <simdmath/ceilf4.h>
+#include <simdmath/cosf4.h>
+#include <simdmath/divf4.h>
+#include <simdmath/floorf4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
+#include <simdmath/sinf4.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/truncf4.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "spu_exec.h"
+#include "spu_main.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "cell/common.h"
+
+#define TILE_TOP_LEFT     0
+#define TILE_TOP_RIGHT    1
+#define TILE_BOTTOM_LEFT  2
+#define TILE_BOTTOM_RIGHT 3
+
+/*
+ * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_R0            TGSI_EXEC_TEMP_R0
+
+#define FOR_EACH_CHANNEL(CHAN)\
+   for (CHAN = 0; CHAN < 4; CHAN++)
+
+#define IS_CHANNEL_ENABLED(INST, CHAN)\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IS_CHANNEL_ENABLED2(INST, CHAN)\
+   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+
+
+/** The execution mask depends on the conditional mask and the loop mask */
+#define UPDATE_EXEC_MASK(MACH) \
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+
+
+/**
+ * Initialize machine state by expanding tokens to full instructions,
+ * allocating temporary storage, setting up constants, etc.
+ * After this, we can call spu_exec_machine_run() many times.
+ */
+void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor)
+{
+   const qword zero = si_il(0);
+   const qword not_zero = si_il(~0);
+
+   (void) numSamplers;
+   mach->Samplers = samplers;
+   mach->Processor = processor;
+   mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
+
+   /* Setup constants. */
+   mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
+   mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
+   mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1);
+   mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31);
+
+   mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f);
+   mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f);
+   mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f);
+   mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f);
+}
+
+
+static INLINE qword
+micro_abs(qword src)
+{
+   return si_rotmi(si_shli(src, 1), -1);
+}
+
+static INLINE qword
+micro_ceil(qword src)
+{
+   return (qword) _ceilf4((vec_float4) src);
+}
+
+static INLINE qword
+micro_cos(qword src)
+{
+   return (qword) _cosf4((vec_float4) src);
+}
+
+static const qword br_shuf = {
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+};
+
+static const qword bl_shuf = {
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+};
+
+static const qword tl_shuf = {
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+};
+
+static qword
+micro_ddx(qword src)
+{
+   qword bottom_right = si_shufb(src, src, br_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
+
+   return si_fs(bottom_right, bottom_left);
+}
+
+static qword
+micro_ddy(qword src)
+{
+   qword top_left = si_shufb(src, src, tl_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
+
+   return si_fs(top_left, bottom_left);
+}
+
+static INLINE qword
+micro_div(qword src0, qword src1)
+{
+   return (qword) _divf4((vec_float4) src0, (vec_float4) src1);
+}
+
+static qword
+micro_flr(qword src)
+{
+   return (qword) _floorf4((vec_float4) src);
+}
+
+static qword
+micro_frc(qword src)
+{
+   return si_fs(src, (qword) _floorf4((vec_float4) src));
+}
+
+static INLINE qword
+micro_ge(qword src0, qword src1)
+{
+   return si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
+}
+
+static qword
+micro_lg2(qword src)
+{
+   return (qword) _log2f4((vec_float4) src);
+}
+
+static INLINE qword
+micro_lt(qword src0, qword src1)
+{
+   const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
+
+   return si_xori(tmp, 0xff);
+}
+
+static INLINE qword
+micro_max(qword src0, qword src1)
+{
+   return si_selb(src1, src0, si_fcgt(src0, src1));
+}
+
+static INLINE qword
+micro_min(qword src0, qword src1)
+{
+   return si_selb(src0, src1, si_fcgt(src0, src1));
+}
+
+static qword
+micro_neg(qword src)
+{
+   return si_xor(src, (qword) spu_splats(0x80000000));
+}
+
+static qword
+micro_set_sign(qword src)
+{
+   return si_or(src, (qword) spu_splats(0x80000000));
+}
+
+static qword
+micro_pow(qword src0, qword src1)
+{
+   return (qword) _powf4((vec_float4) src0, (vec_float4) src1);
+}
+
+static qword
+micro_rnd(qword src)
+{
+   const qword half = (qword) spu_splats(0.5f);
+
+   /* May be able to use _roundf4.  There may be some difference, though.
+    */
+   return (qword) _floorf4((vec_float4) si_fa(src, half));
+}
+
+static INLINE qword
+micro_ishr(qword src0, qword src1)
+{
+   return si_rotma(src0, si_sfi(src1, 0));
+}
+
+static qword
+micro_trunc(qword src)
+{
+   return (qword) _truncf4((vec_float4) src);
+}
+
+static qword
+micro_sin(qword src)
+{
+   return (qword) _sinf4((vec_float4) src);
+}
+
+static INLINE qword
+micro_sqrt(qword src)
+{
+   return (qword) _sqrtf4((vec_float4) src);
+}
+
+static void
+fetch_src_file_channel(
+   const struct spu_exec_machine *mach,
+   const uint file,
+   const uint swizzle,
+   const union spu_exec_channel *index,
+   union spu_exec_channel *chan )
+{
+   switch( swizzle ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch( file ) {
+      case TGSI_FILE_CONSTANT: {
+         unsigned i;
+
+         for (i = 0; i < 4; i++) {
+            const float *ptr = mach->Consts[index->i[i]];
+            float tmp[4];
+
+            spu_dcache_fetch_unaligned((qword *) tmp,
+                                       (uintptr_t)(ptr + swizzle),
+                                       sizeof(float));
+
+            chan->f[i] = tmp[0];
+         }
+         break;
+      }
+
+      case TGSI_FILE_INPUT:
+         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_TEMPORARY:
+         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_IMMEDIATE:
+         ASSERT( index->i[0] < (int) mach->ImmLimit );
+         ASSERT( index->i[1] < (int) mach->ImmLimit );
+         ASSERT( index->i[2] < (int) mach->ImmLimit );
+         ASSERT( index->i[3] < (int) mach->ImmLimit );
+
+         chan->f[0] = mach->Imms[index->i[0]][swizzle];
+         chan->f[1] = mach->Imms[index->i[1]][swizzle];
+         chan->f[2] = mach->Imms[index->i[2]][swizzle];
+         chan->f[3] = mach->Imms[index->i[3]][swizzle];
+         break;
+
+      case TGSI_FILE_ADDRESS:
+         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         /* vertex/fragment output vars can be read too */
+         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      default:
+         ASSERT( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
+      break;
+
+   default:
+      ASSERT( 0 );
+   }
+}
+
+static void
+fetch_source(
+   const struct spu_exec_machine *mach,
+   union spu_exec_channel *chan,
+   const struct tgsi_full_src_register *reg,
+   const uint chan_index )
+{
+   union spu_exec_channel index;
+   uint swizzle;
+
+   index.i[0] =
+   index.i[1] =
+   index.i[2] =
+   index.i[3] = reg->SrcRegister.Index;
+
+   if (reg->SrcRegister.Indirect) {
+      union spu_exec_channel index2;
+      union spu_exec_channel indir_index;
+
+      index2.i[0] =
+      index2.i[1] =
+      index2.i[2] =
+      index2.i[3] = reg->SrcRegisterInd.Index;
+
+      swizzle = tgsi_util_get_src_register_swizzle(&reg->SrcRegisterInd,
+                                                   CHAN_X);
+      fetch_src_file_channel(
+         mach,
+         reg->SrcRegisterInd.File,
+         swizzle,
+         &index2,
+         &indir_index );
+
+      index.q = si_a(index.q, indir_index.q);
+   }
+
+   if( reg->SrcRegister.Dimension ) {
+      switch( reg->SrcRegister.File ) {
+      case TGSI_FILE_INPUT:
+         index.q = si_mpyi(index.q, 17);
+         break;
+      case TGSI_FILE_CONSTANT:
+         index.q = si_shli(index.q, 12);
+         break;
+      default:
+         ASSERT( 0 );
+      }
+
+      index.i[0] += reg->SrcRegisterDim.Index;
+      index.i[1] += reg->SrcRegisterDim.Index;
+      index.i[2] += reg->SrcRegisterDim.Index;
+      index.i[3] += reg->SrcRegisterDim.Index;
+
+      if (reg->SrcRegisterDim.Indirect) {
+         union spu_exec_channel index2;
+         union spu_exec_channel indir_index;
+
+         index2.i[0] =
+         index2.i[1] =
+         index2.i[2] =
+         index2.i[3] = reg->SrcRegisterDimInd.Index;
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
+         fetch_src_file_channel(
+            mach,
+            reg->SrcRegisterDimInd.File,
+            swizzle,
+            &index2,
+            &indir_index );
+
+         index.q = si_a(index.q, indir_index.q);
+      }
+   }
+
+   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   fetch_src_file_channel(
+      mach,
+      reg->SrcRegister.File,
+      swizzle,
+      &index,
+      chan );
+
+   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      chan->q = micro_abs(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      chan->q = micro_set_sign(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      chan->q = micro_neg(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+
+   if (reg->SrcRegisterExtMod.Complement) {
+      chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q);
+   }
+}
+
+static void
+store_dest(
+   struct spu_exec_machine *mach,
+   const union spu_exec_channel *chan,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   uint chan_index )
+{
+   union spu_exec_channel *dst;
+
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_NULL:
+      return;
+
+   case TGSI_FILE_OUTPUT:
+      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+                           + reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   default:
+      ASSERT( 0 );
+      return;
+   }
+
+   switch (inst->Instruction.Saturate)
+   {
+   case TGSI_SAT_NONE:
+      if (mach->ExecMask & 0x1)
+         dst->i[0] = chan->i[0];
+      if (mach->ExecMask & 0x2)
+         dst->i[1] = chan->i[1];
+      if (mach->ExecMask & 0x4)
+         dst->i[2] = chan->i[2];
+      if (mach->ExecMask & 0x8)
+         dst->i[3] = chan->i[3];
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* XXX need to obey ExecMask here */
+      dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+      dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      ASSERT( 0 );
+      break;
+
+   default:
+      ASSERT( 0 );
+   }
+}
+
+#define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+
+#define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+
+
+/**
+ * Execute ARB-style KIL which is predicated by a src register.
+ * Kill fragment if any of the four values is less than zero.
+ */
+static void
+exec_kil(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   uint uniquemask;
+   uint chan_index;
+   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+   union spu_exec_channel r[1];
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   for (chan_index = 0; chan_index < 4; chan_index++)
+   {
+      uint swizzle;
+      uint i;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle (
+                        &inst->FullSrcRegisters[0],
+                        chan_index);
+
+      /* check if the component has not been already tested */
+      if (uniquemask & (1 << swizzle))
+         continue;
+      uniquemask |= 1 << swizzle;
+
+      FETCH(&r[0], 0, chan_index);
+      for (i = 0; i < 4; i++)
+         if (r[0].f[i] < 0.0f)
+            kilmask |= 1 << i;
+   }
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
+
+/**
+ * Execute NVIDIA-style KIL which is predicated by a condition code.
+ * Kill fragment if the condition code is TRUE.
+ */
+static void
+exec_kilp(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+
+   /* TODO: build kilmask from CC mask */
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
+
+/*
+ * Fetch a texel using STR texture coordinates.
+ */
+static void
+fetch_texel( struct spu_sampler *sampler,
+             const union spu_exec_channel *s,
+             const union spu_exec_channel *t,
+             const union spu_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union spu_exec_channel *r,
+             union spu_exec_channel *g,
+             union spu_exec_channel *b,
+             union spu_exec_channel *a )
+{
+   qword rgba[4];
+   qword out[4];
+
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, 
+			(float (*)[4]) rgba);
+
+   _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba);
+   r->q = out[0];
+   g->q = out[1];
+   b->q = out[2];
+   a->q = out[3];
+}
+
+
+static void
+exec_tex(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst,
+         boolean biasLod, boolean projected)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   union spu_exec_channel r[8];
+   uint chan_index;
+   float lodBias;
+
+   /*   printf("Sampler %u unit %u\n", sampler, unit); */
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      if (projected) {
+         FETCH(&r[1], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[1].q);
+      }
+
+      if (biasLod) {
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[2].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      if (projected) {
+         FETCH(&r[3], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      if (projected) {
+         FETCH(&r[3], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,
+                  &r[0], &r[1], &r[2], &r[3]);
+      break;
+
+   default:
+      ASSERT (0);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+
+static void
+constant_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   unsigned i;
+
+   for( i = 0; i < QUAD_SIZE; i++ ) {
+      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
+   }
+}
+
+static void
+linear_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
+   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
+   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
+   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
+}
+
+static void
+perspective_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   const float *w = mach->QuadPos.xyzw[3].f;
+   /* divide by W here */
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
+   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
+   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
+   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
+}
+
+
+typedef void (* interpolation_func)(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan );
+
+static void
+exec_declaration(struct spu_exec_machine *mach,
+                 const struct tgsi_full_declaration *decl)
+{
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+         unsigned first, last, mask;
+         interpolation_func interp;
+
+         first = decl->DeclarationRange.First;
+         last = decl->DeclarationRange.Last;
+         mask = decl->Declaration.UsageMask;
+
+         switch( decl->Declaration.Interpolate ) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = constant_interpolation;
+            break;
+
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = linear_interpolation;
+            break;
+
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = perspective_interpolation;
+            break;
+
+         default:
+            ASSERT( 0 );
+         }
+
+         if( mask == TGSI_WRITEMASK_XYZW ) {
+            unsigned i, j;
+
+            for( i = first; i <= last; i++ ) {
+               for( j = 0; j < NUM_CHANNELS; j++ ) {
+                  interp( mach, i, j );
+               }
+            }
+         }
+         else {
+            unsigned i, j;
+
+            for( j = 0; j < NUM_CHANNELS; j++ ) {
+               if( mask & (1 << j) ) {
+                  for( i = first; i <= last; i++ ) {
+                     interp( mach, i, j );
+                  }
+               }
+            }
+         }
+      }
+   }
+}
+
+static void
+exec_instruction(
+   struct spu_exec_machine *mach,
+   const struct tgsi_full_instruction *inst,
+   int *pc )
+{
+   uint chan_index;
+   union spu_exec_channel r[8];
+
+   (*pc)++;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 FETCH( &r[0], 0, chan_index );
+         r[0].q = si_cflts(r[0].q, 0);
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_X );
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+	    STORE( &r[0], 0, CHAN_Y );
+	 }
+
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            FETCH( &r[1], 0, CHAN_Y );
+            r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+
+            FETCH( &r[2], 0, CHAN_W );
+            r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q);
+            r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q);
+            r[1].q = micro_pow(r[1].q, r[2].q);
+
+            /* r0 = (r0 > 0.0) ? r1 : 0.0
+             */
+            r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+            r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q,
+                             r[0].q);
+            STORE( &r[0], 0, CHAN_Z );
+         }
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_sqrt(r[0].q);
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_LOG:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
+      {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = si_fm(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_fa(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+
+      FETCH( &r[1], 0, CHAN_Z );
+      FETCH( &r[2], 1, CHAN_Z );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+    case TGSI_OPCODE_DP4:
+    /* TGSI_OPCODE_DOT4 */
+       FETCH(&r[0], 0, CHAN_X);
+       FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+       FETCH(&r[1], 0, CHAN_Y);
+       FETCH(&r[2], 1, CHAN_Y);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+       FETCH(&r[1], 0, CHAN_Z);
+       FETCH(&r[2], 1, CHAN_Z);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+       FETCH(&r[1], 0, CHAN_W);
+       FETCH(&r[2], 1, CHAN_W);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	 FETCH( &r[0], 0, CHAN_Y );
+	 FETCH( &r[1], 1, CHAN_Y);
+      r[0].q = si_fm(r[0].q, r[1].q);
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_Z );
+	 STORE( &r[0], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 FETCH( &r[0], 1, CHAN_W );
+	 STORE( &r[0], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = micro_min(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = micro_max(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = micro_ge(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = micro_ge(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         FETCH( &r[2], 2, chan_index );
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = si_fs(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         r[1].q = si_fs(r[1].q, r[2].q);
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_CND0:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+      /* TGSI_OPCODE_DP2A */
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_frc(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_flr(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_rnd(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+    /* TGSI_OPCODE_EX2 */
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_lg2(r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+      /* TGSI_OPCODE_POW */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = micro_pow(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      /* r2 = (r0 * r1) - (r3 * r5)
+       */
+      r[2].q = si_fm(r[3].q, r[5].q);
+      r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+      FETCH(&r[5], 0, CHAN_X);
+
+      /* r3 = (r3 * r2) - (r1 * r5)
+       */
+      r[1].q = si_fm(r[1].q, r[5].q);
+      r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      /* r5 = (r5 * r4) - (r0 * r2)
+       */
+      r[0].q = si_fm(r[0].q, r[2].q);
+      r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+    case TGSI_OPCODE_MULTIPLYMATRIX:
+       ASSERT (0);
+       break;
+
+    case TGSI_OPCODE_ABS:
+       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+          FETCH(&r[0], 0, chan_index);
+
+          r[0].q = micro_abs(r[0].q);
+
+          STORE(&r[0], 0, chan_index);
+       }
+       break;
+
+   case TGSI_OPCODE_RCC:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      r[0].q = si_fa(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_cos(r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ddx(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ddy(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      exec_kilp (mach, inst);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      exec_kil (mach, inst);
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_RFL:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_sin(r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* simple texture lookup */
+      /* src[0] = texcoord */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* Texture lookup with lod bias */
+      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* Texture lookup with explict partial derivatives */
+      /* src[0] = texcoord */
+      /* src[1] = d[strq]/dx */
+      /* src[2] = d[strq]/dy */
+      /* src[3] = sampler unit */
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      /* Texture lookup with explit LOD */
+      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXP:
+      /* Texture lookup with projection */
+      /* src[0] = texcoord (src[0].w = projection) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE, TRUE);
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_X2D:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_ARA:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_BRA:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* skip the call if no execution channels are enabled */
+      if (mach->ExecMask) {
+         /* do the call */
+
+         /* push the Cond, Loop, Cont stacks */
+         ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+         ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+         /* note that PC was already incremented above */
+         mach->CallStack[mach->CallStackTop++] = *pc;
+         *pc = inst->InstructionExtLabel.Label;
+      }
+      break;
+
+   case TGSI_OPCODE_RET:
+      mach->FuncMask &= ~mach->ExecMask;
+      UPDATE_EXEC_MASK(mach);
+
+      if (mach->ExecMask == 0x0) {
+         /* really return now (otherwise, keep executing */
+
+         if (mach->CallStackTop == 0) {
+            /* returning from main() */
+            *pc = -1;
+            return;
+         }
+         *pc = mach->CallStack[--mach->CallStackTop];
+
+         /* pop the Cond, Loop, Cont stacks */
+         ASSERT(mach->CondStackTop > 0);
+         mach->CondMask = mach->CondStack[--mach->CondStackTop];
+         ASSERT(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         ASSERT(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+         ASSERT(mach->FuncStackTop > 0);
+         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+         UPDATE_EXEC_MASK(mach);
+      }
+      break;
+
+   case TGSI_OPCODE_SSG:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         /* r0 = (r0 < 0.0) ? r1 : r2
+          */
+         r[3].q = si_xor(r[3].q, r[3].q);
+         r[0].q = micro_lt(r[0].q, r[3].q);
+         r[0].q = si_selb(r[1].q, r[2].q, r[0].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( &r[0], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+         r[1].q = micro_cos(r[0].q);
+         STORE( &r[1], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         r[1].q = micro_sin(r[0].q);
+         STORE( &r[1], 0, CHAN_Y );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_NRM:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_DIV:
+      ASSERT( 0 );
+      break;
+
+   case TGSI_OPCODE_DP2:
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* push CondMask */
+      ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+      FETCH( &r[0], 0, CHAN_X );
+      /* update CondMask */
+      if( ! r[0].u[0] ) {
+         mach->CondMask &= ~0x1;
+      }
+      if( ! r[0].u[1] ) {
+         mach->CondMask &= ~0x2;
+      }
+      if( ! r[0].u[2] ) {
+         mach->CondMask &= ~0x4;
+      }
+      if( ! r[0].u[3] ) {
+         mach->CondMask &= ~0x8;
+      }
+      UPDATE_EXEC_MASK(mach);
+      /* Todo: If CondMask==0, jump to ELSE */
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* invert CondMask wrt previous mask */
+      {
+         uint prevMask;
+         ASSERT(mach->CondStackTop > 0);
+         prevMask = mach->CondStack[mach->CondStackTop - 1];
+         mach->CondMask = ~mach->CondMask & prevMask;
+         UPDATE_EXEC_MASK(mach);
+         /* Todo: If CondMask==0, jump to ENDIF */
+      }
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* pop CondMask */
+      ASSERT(mach->CondStackTop > 0);
+      mach->CondMask = mach->CondStack[--mach->CondStackTop];
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* halt execution */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_REP:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+       ASSERT (0);
+       break;
+
+   case TGSI_OPCODE_PUSHA:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_POPA:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ceil(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = si_csflt(r[0].q, 0);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_NOT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = si_xorbi(r[0].q, 0xff);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_trunc(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_shl(r[0].q, r[1].q);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = micro_ishr(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_AND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_and(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_OR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_or(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOD:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_XOR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_xor(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SAD:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_TXF:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      /* fall-through (for now) */
+   case TGSI_OPCODE_BGNLOOP2:
+      /* push LoopMask and ContMasks */
+      ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      /* fall-through (for now at least) */
+   case TGSI_OPCODE_ENDLOOP2:
+      /* Restore ContMask, but don't pop */
+      ASSERT(mach->ContStackTop > 0);
+      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
+      if (mach->LoopMask) {
+         /* repeat loop: jump to instruction just past BGNLOOP */
+         *pc = inst->InstructionExtLabel.Label + 1;
+      }
+      else {
+         /* exit loop: pop LoopMask */
+         ASSERT(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         /* pop ContMask */
+         ASSERT(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+      }
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BRK:
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* turn off cont channels for each enabled exec channel */
+      mach->ContMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BGNSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_NOISE1:
+      ASSERT( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE2:
+      ASSERT( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE3:
+      ASSERT( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE4:
+      ASSERT( 0 );
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      ASSERT( 0 );
+   }
+}
+
+
+/**
+ * Run TGSI interpreter.
+ * \return bitmask of "alive" quad components
+ */
+uint
+spu_exec_machine_run( struct spu_exec_machine *mach )
+{
+   uint i;
+   int pc = 0;
+
+   mach->CondMask = 0xf;
+   mach->LoopMask = 0xf;
+   mach->ContMask = 0xf;
+   mach->FuncMask = 0xf;
+   mach->ExecMask = 0xf;
+
+   mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */
+   ASSERT(mach->CondStackTop == 0);
+   ASSERT(mach->LoopStackTop == 0);
+   ASSERT(mach->ContStackTop == 0);
+   ASSERT(mach->CallStackTop == 0);
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
+   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
+
+   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
+      mach->Primitives[0] = 0;
+   }
+
+
+   /* execute declarations (interpolants) */
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      for (i = 0; i < mach->NumDeclarations; i++) {
+         union {
+            struct tgsi_full_declaration decl;
+            qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
+         } d ALIGN16_ATTRIB;
+         unsigned ea = (unsigned) (mach->Declarations + pc);
+
+         spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
+
+         exec_declaration( mach, &d.decl );
+      }
+   }
+
+   /* execute instructions, until pc is set to -1 */
+   while (pc != -1) {
+      union {
+         struct tgsi_full_instruction inst;
+         qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
+      } i ALIGN16_ATTRIB;
+      unsigned ea = (unsigned) (mach->Instructions + pc);
+
+      spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst));
+      exec_instruction( mach, & i.inst, &pc );
+   }
+
+#if 0
+   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
+   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /*
+       * Scale back depth component.
+       */
+      for (i = 0; i < 4; i++)
+         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
+   }
+#endif
+
+   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+}
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
new file mode 100644
index 0000000000..8605679940
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -0,0 +1,172 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#if !defined SPU_EXEC_H
+#define SPU_EXEC_H
+
+#include "pipe/p_compiler.h"
+#include "tgsi/tgsi_exec.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/**
+  * Registers may be treated as float, signed int or unsigned int.
+  */
+union spu_exec_channel
+{
+   float    f[QUAD_SIZE];
+   int      i[QUAD_SIZE];
+   unsigned u[QUAD_SIZE];
+   qword    q;
+};
+
+/**
+  * A vector[RGBA] of channels[4 pixels]
+  */
+struct spu_exec_vector
+{
+   union spu_exec_channel xyzw[NUM_CHANNELS];
+};
+
+/**
+ * For fragment programs, information for computing fragment input
+ * values from plane equation of the triangle/line.
+ */
+struct spu_interp_coef
+{
+   float a0[NUM_CHANNELS];	/* in an xyzw layout */
+   float dadx[NUM_CHANNELS];
+   float dady[NUM_CHANNELS];
+};
+
+
+struct softpipe_tile_cache;  /**< Opaque to TGSI */
+
+/**
+ * Information for sampling textures, which must be implemented
+ * by code outside the TGSI executor.
+ */
+struct spu_sampler
+{
+   const struct pipe_sampler_state *state;
+   struct pipe_texture *texture;
+   /** Get samples for four fragments in a quad */
+   void (*get_samples)(struct spu_sampler *sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE]);
+   void *pipe; /*XXX temporary*/
+   struct softpipe_tile_cache *cache;
+};
+
+
+/**
+ * Run-time virtual machine state for executing TGSI shader.
+ */
+struct spu_exec_machine
+{
+   /*
+    * 32 program temporaries
+    * 4  internal temporaries
+    * 1  address
+    */
+   struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
+                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1]
+       ALIGN16_ATTRIB;
+
+   struct spu_exec_vector       *Addrs;
+
+   struct spu_sampler           *Samplers;
+
+   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
+   unsigned                      ImmLimit;
+   float                         (*Consts)[4];
+   struct spu_exec_vector       *Inputs;
+   struct spu_exec_vector       *Outputs;
+   unsigned                      Processor;
+
+   /* GEOMETRY processor only. */
+   unsigned                      *Primitives;
+
+   /* FRAGMENT processor only. */
+   const struct spu_interp_coef *InterpCoefs;
+   struct spu_exec_vector       QuadPos;
+
+   /* Conditional execution masks */
+   uint CondMask;  /**< For IF/ELSE/ENDIF */
+   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
+   uint ContMask;  /**< For loop CONT statements */
+   uint FuncMask;  /**< For function calls */
+   uint ExecMask;  /**< = CondMask & LoopMask */
+
+   /** Condition mask stack (for nested conditionals) */
+   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
+   int CondStackTop;
+
+   /** Loop mask stack (for nested loops) */
+   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int LoopStackTop;
+
+   /** Loop continue mask stack (see comments in tgsi_exec.c) */
+   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int ContStackTop;
+
+   /** Function execution mask stack (for executing subroutine code) */
+   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int FuncStackTop;
+
+   /** Function call stack for saving/restoring the program counter */
+   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int CallStackTop;
+
+   struct tgsi_full_instruction *Instructions;
+   uint NumInstructions;
+
+   struct tgsi_full_declaration *Declarations;
+   uint NumDeclarations;
+};
+
+
+extern void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor);
+
+extern uint
+spu_exec_machine_run( struct spu_exec_machine *mach );
+
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SPU_EXEC_H */
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
new file mode 100644
index 0000000000..c7bcb3de9d
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -0,0 +1,176 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU functions accessed by shaders.
+ *
+ * Authors: Brian Paul
+ */
+
+
+#include <string.h>
+#include <libmisc.h>
+#include <math.h>
+#include <cos14_v.h>
+#include <sin14_v.h>
+#include <transpose_matrix4x4.h>
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_funcs.h"
+
+
+/** For "return"-ing four vectors */
+struct vec_4x4
+{
+   vector float v[4];
+};
+
+
+static vector float
+spu_cos(vector float x)
+{
+   return _cos14_v(x);
+}
+
+static vector float
+spu_sin(vector float x)
+{
+   return _sin14_v(x);
+}
+
+static vector float
+spu_pow(vector float x, vector float y)
+{
+   float z0 = powf(spu_extract(x,0), spu_extract(y,0));
+   float z1 = powf(spu_extract(x,1), spu_extract(y,1));
+   float z2 = powf(spu_extract(x,2), spu_extract(y,2));
+   float z3 = powf(spu_extract(x,3), spu_extract(y,3));
+   return (vector float) {z0, z1, z2, z3};
+}
+
+static vector float
+spu_exp2(vector float x)
+{
+   float z0 = powf(2.0f, spu_extract(x,0));
+   float z1 = powf(2.0f, spu_extract(x,1));
+   float z2 = powf(2.0f, spu_extract(x,2));
+   float z3 = powf(2.0f, spu_extract(x,3));
+   return (vector float) {z0, z1, z2, z3};
+}
+
+static vector float
+spu_log2(vector float x)
+{
+   /*
+    * log_base_2(x) = log(x) / log(2)
+    * 1.442695 = 1/log(2).
+    */
+   static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
+   float z0 = logf(spu_extract(x,0));
+   float z1 = logf(spu_extract(x,1));
+   float z2 = logf(spu_extract(x,2));
+   float z3 = logf(spu_extract(x,3));
+   vector float v = (vector float) {z0, z1, z2, z3};
+   return spu_mul(v, k);
+}
+
+static struct vec_4x4
+spu_txp(vector float s, vector float t, vector float r, vector float q)
+{
+   const uint unit = 0;
+   struct vec_4x4 colors;
+   vector float coords[4];
+
+   coords[0] = s;
+   coords[1] = t;
+   coords[2] = r;
+   coords[3] = q;
+   _transpose_matrix4x4(coords, coords);
+
+   /* get four texture samples */
+   colors.v[0] = spu.sample_texture[unit](unit, coords[0]);
+   colors.v[1] = spu.sample_texture[unit](unit, coords[1]);
+   colors.v[2] = spu.sample_texture[unit](unit, coords[2]);
+   colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
+
+   _transpose_matrix4x4(colors.v, colors.v);
+   return colors;
+}
+
+
+/**
+ * Add named function to list of "exported" functions that will be
+ * made available to the PPU-hosted code generator.
+ */
+static void
+export_func(struct cell_spu_function_info *spu_functions,
+            const char *name, void *addr)
+{
+   uint n = spu_functions->num;
+   ASSERT(strlen(name) < 16);
+   strcpy(spu_functions->names[n], name);
+   spu_functions->addrs[n] = (uint) addr;
+   spu_functions->num++;
+   ASSERT(spu_functions->num <= 16);
+}
+
+
+/**
+ * Return info about the SPU's function to the PPU / main memory.
+ * The PPU needs to know the address of some SPU-side functions so
+ * that we can generate shader code with function calls.
+ */
+void
+return_function_info(void)
+{
+   struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+   int tag = TAG_MISC;
+
+   ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
+
+   funcs.num = 0;
+   export_func(&funcs, "spu_cos", &spu_cos);
+   export_func(&funcs, "spu_sin", &spu_sin);
+   export_func(&funcs, "spu_pow", &spu_pow);
+   export_func(&funcs, "spu_exp2", &spu_exp2);
+   export_func(&funcs, "spu_log2", &spu_log2);
+   export_func(&funcs, "spu_txp", &spu_txp);
+
+   /* Send the function info back to the PPU / main memory */
+   mfc_put((void *) &funcs,  /* src in local store */
+           (unsigned int) spu.init.spu_functions, /* dst in main memory */
+           sizeof(funcs),  /* bytes */
+           tag,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << tag);
+}
+
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h
new file mode 100644
index 0000000000..3adb6ae99f
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_FUNCS_H
+#define SPU_FUNCS_H
+
+extern void
+return_function_info(void);
+
+#endif
+
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
new file mode 100644
index 0000000000..4becd0f92a
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -0,0 +1,129 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/* main() for Cell SPU code */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_funcs.h"
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+//#include "spu_test.h"
+#include "spu_debug.h"
+#include "cell/common.h"
+
+
+/*
+helpful headers:
+/usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h
+/opt/cell/sdk/usr/include/libmisc.h
+*/
+
+struct spu_global spu;
+
+
+#if DEBUG
+boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
+#endif
+
+
+static void
+one_time_init(void)
+{
+   memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
+   memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
+   invalidate_tex_cache();
+
+   /* Install default/fallback fragment processing function.
+    * This will normally be overriden by a code-gen'd function
+    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
+    */
+   spu.fragment_ops = spu_fallback_fragment_ops;
+}
+
+
+
+/* In some versions of the SDK the SPE main takes 'unsigned long' as a
+ * parameter.  In others it takes 'unsigned long long'.  Use a define to
+ * select between the two.
+ */
+#ifdef SPU_MAIN_PARAM_LONG_LONG
+typedef unsigned long long main_param_t;
+#else
+typedef unsigned long main_param_t;
+#endif
+
+/**
+ * SPE entrypoint.
+ */
+int
+main(main_param_t speid, main_param_t argp)
+{
+   int tag = 0;
+
+   (void) speid;
+
+   ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+   ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
+
+   one_time_init();
+
+   DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
+
+   /* get initialization data */
+   mfc_get(&spu.init,  /* dest */
+           (unsigned int) argp, /* src */
+           sizeof(struct cell_init_info), /* bytes */
+           tag,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask( 1 << tag );
+
+   if (spu.init.id == 0) {
+      return_function_info();
+   }
+
+#if 0
+   if (spu.init.id==0)
+      spu_test_misc(spu.init.id);
+#endif
+
+   command_loop();
+
+   return 0;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
new file mode 100644
index 0000000000..82c9c69a3a
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -0,0 +1,232 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_MAIN_H
+#define SPU_MAIN_H
+
+
+#include <spu_mfcio.h>
+
+#include "cell/common.h"
+#include "draw/draw_vertex.h"
+#include "pipe/p_state.h"
+
+
+
+#define MAX_WIDTH 1024
+#define MAX_HEIGHT 1024
+
+
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+
+
+/**
+ * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
+ * The data may be addressed through several different types.
+ */
+typedef union {
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
+} tile_t;
+
+
+#define TILE_STATUS_CLEAR   1
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
+
+
+/** Function for sampling textures */
+typedef vector float (*spu_sample_texture_func)(uint unit,
+                                                vector float texcoord);
+
+/** Function for performing per-fragment ops */
+typedef void (*spu_fragment_ops_func)(uint x, uint y,
+                                      tile_t *colorTile,
+                                      tile_t *depthStencilTile,
+                                      vector float fragZ,
+                                      vector float fragRed,
+                                      vector float fragGreen,
+                                      vector float fragBlue,
+                                      vector float fragAlpha,
+                                      vector unsigned int mask,
+                                      uint facing);
+
+/** Function for running fragment program */
+typedef void (*spu_fragment_program_func)(vector float *inputs,
+                                          vector float *outputs,
+                                          vector float *constants);
+
+
+struct spu_framebuffer
+{
+   void *color_start;              /**< addr of color surface in main memory */
+   void *depth_start;              /**< addr of depth surface in main memory */
+   enum pipe_format color_format;
+   enum pipe_format depth_format;
+   uint width, height;             /**< size in pixels */
+   uint width_tiles, height_tiles; /**< width and height in tiles */
+
+   uint color_clear_value;
+   uint depth_clear_value;
+
+   uint zsize;                     /**< 0, 2 or 4 bytes per Z */
+   float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
+} ALIGN16_ATTRIB;
+
+
+struct spu_texture
+{
+   void *start;
+   ushort width, height;
+   ushort tiles_per_row;
+   vector float tex_size;
+   vector unsigned int tex_size_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+} ALIGN16_ATTRIB;
+
+
+/**
+ * All SPU global/context state will be in a singleton object of this type:
+ */
+struct spu_global
+{
+   /** One-time init/constant info */
+   struct cell_init_info init;
+
+   /*
+    * Current state
+    */
+   struct spu_framebuffer fb;
+   struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+   struct pipe_blend_state blend;
+   struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct spu_texture texture[PIPE_MAX_SAMPLERS];
+   struct vertex_info vertex_info;
+
+   /** Current color and Z tiles */
+   tile_t ctile ALIGN16_ATTRIB;
+   tile_t ztile ALIGN16_ATTRIB;
+
+   /** Read depth/stencil tiles? */
+   boolean read_depth;
+   boolean read_stencil;
+
+   /** Current tiles' status */
+   ubyte cur_ctile_status, cur_ztile_status;
+
+   /** Status of all tiles in framebuffer */
+   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+
+   /** Current fragment ops machine code, at 8-byte boundary */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   /** Current fragment ops function */
+   spu_fragment_ops_func fragment_ops;
+
+   /** Current fragment program machine code, at 8-byte boundary */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
+   /** Current fragment ops function */
+   spu_fragment_program_func fragment_program;
+
+   /** Current texture sampler function */
+   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+
+   /** Fragment program constants */
+   vector float constants[4 * CELL_MAX_CONSTANTS];
+
+} ALIGN16_ATTRIB;
+
+
+extern struct spu_global spu;
+extern boolean Debug;
+
+
+
+
+/* DMA TAGS */
+
+#define TAG_SURFACE_CLEAR     10
+#define TAG_VERTEX_BUFFER     11
+#define TAG_READ_TILE_COLOR   12
+#define TAG_READ_TILE_Z       13
+#define TAG_WRITE_TILE_COLOR  14
+#define TAG_WRITE_TILE_Z      15
+#define TAG_INDEX_BUFFER      16
+#define TAG_BATCH_BUFFER      17
+#define TAG_MISC              18
+#define TAG_DCACHE0           20
+#define TAG_DCACHE1           21
+#define TAG_DCACHE2           22
+#define TAG_DCACHE3           23
+
+
+
+static INLINE void
+wait_on_mask(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_any();
+}
+
+
+static INLINE void
+wait_on_mask_all(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_all();
+}
+
+
+
+
+
+static INLINE void
+memset16(ushort *d, ushort value, uint count)
+{
+   uint i;
+   for (i = 0; i < count; i++)
+      d[i] = value;
+}
+
+
+static INLINE void
+memset32(uint *d, uint value, uint count)
+{
+   uint i;
+   for (i = 0; i < count; i++)
+      d[i] = value;
+}
+
+
+#endif /* SPU_MAIN_H */
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
new file mode 100644
index 0000000000..d252fa6dc1
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -0,0 +1,532 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \author Brian Paul
+ */
+
+
+#include <transpose_matrix4x4.h>
+#include "pipe/p_format.h"
+#include "spu_main.h"
+#include "spu_colorpack.h"
+#include "spu_per_fragment_op.h"
+
+
+#define LINEAR_QUAD_LAYOUT 1
+
+
+/**
+ * Called by rasterizer for each quad after the shader has run.  Do
+ * all the per-fragment operations including alpha test, z test,
+ * stencil test, blend, colormask and logicops.  This is a
+ * fallback/debug function.  In reality we'll use a generated function
+ * produced by the PPU.  But this function is useful for
+ * debug/validation.
+ */
+void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragR,
+                          vector float fragG,
+                          vector float fragB,
+                          vector float fragA,
+                          vector unsigned int mask,
+                          uint facing)
+{
+   vector float frag_aos[4];
+   unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
+   unsigned int fragc0, fragc1, fragc2, fragc3;  /* fragment colors */
+
+   /*
+    * Do alpha test
+    */
+   if (spu.depth_stencil_alpha.alpha.enabled) {
+      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
+      vector unsigned int amask;
+
+      switch (spu.depth_stencil_alpha.alpha.func) {
+      case PIPE_FUNC_LESS:
+         amask = spu_cmpgt(ref, fragA);  /* mask = (fragA < ref) */
+         break;
+      case PIPE_FUNC_GREATER:
+         amask = spu_cmpgt(fragA, ref);  /* mask = (fragA > ref) */
+         break;
+      case PIPE_FUNC_GEQUAL:
+         amask = spu_cmpgt(ref, fragA);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_LEQUAL:
+         amask = spu_cmpgt(fragA, ref);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_EQUAL:
+         amask = spu_cmpeq(ref, fragA);
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         amask = spu_cmpeq(ref, fragA);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_ALWAYS:
+         amask = spu_splats(0xffffffffU);
+         break;
+      case PIPE_FUNC_NEVER:
+         amask = spu_splats( 0x0U);
+         break;
+      default:
+         ;
+      }
+
+      mask = spu_and(mask, amask);
+   }
+
+
+   /*
+    * Z and/or stencil testing...
+    */
+   if (spu.depth_stencil_alpha.depth.enabled ||
+       spu.depth_stencil_alpha.stencil[0].enabled) {
+
+      /* get four Z/Stencil values from tile */
+      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU);
+      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2];
+      vector unsigned int ifbZ = spu_and(ifbZS, mask24);
+      vector unsigned int ifbS = spu_andc(ifbZS, mask24);
+
+      if (spu.depth_stencil_alpha.stencil[0].enabled) {
+         /* do stencil test */
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM);
+
+      }
+      else if (spu.depth_stencil_alpha.depth.enabled) {
+         /* do depth test */
+
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM ||
+                spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM);
+
+         vector unsigned int ifragZ;
+         vector unsigned int zmask;
+
+         /* convert four fragZ from float to uint */
+         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff));
+         ifragZ = spu_convtu(fragZ, 0);
+
+         /* do depth comparison, setting zmask with results */
+         switch (spu.depth_stencil_alpha.depth.func) {
+         case PIPE_FUNC_LESS:
+            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */
+            break;
+         case PIPE_FUNC_GREATER:
+            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifbZ > ifragZ) */
+            break;
+         case PIPE_FUNC_GEQUAL:
+            zmask = spu_cmpgt(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_LEQUAL:
+            zmask = spu_cmpgt(ifragZ, ifbZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_EQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            break;
+         case PIPE_FUNC_NOTEQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_ALWAYS:
+            zmask = spu_splats(0xffffffffU);
+            break;
+         case PIPE_FUNC_NEVER:
+            zmask = spu_splats( 0x0U);
+            break;
+         default:
+            ;
+         }
+
+         mask = spu_and(mask, zmask);
+
+         /* merge framebuffer Z and fragment Z according to the mask */
+         ifbZ = spu_or(spu_and(ifragZ, mask),
+                       spu_andc(ifbZ, mask));
+      }
+
+      if (spu_extract(spu_orx(mask), 0)) {
+         /* put new fragment Z/Stencil values back into Z/Stencil tile */
+         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS);
+
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+   }
+
+
+   /*
+    * If we'll need the current framebuffer/tile colors for blending
+    * or logicop or colormask, fetch them now.
+    */
+   if (spu.blend.blend_enable ||
+       spu.blend.logicop_enable ||
+       spu.blend.colormask != 0xf) {
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+      fbc0 = colorTile->ui[y][x*2+0];
+      fbc1 = colorTile->ui[y][x*2+1];
+      fbc2 = colorTile->ui[y][x*2+2];
+      fbc3 = colorTile->ui[y][x*2+3];
+#else
+      fbc0 = colorTile->ui[y+0][x+0];
+      fbc1 = colorTile->ui[y+0][x+1];
+      fbc2 = colorTile->ui[y+1][x+0];
+      fbc3 = colorTile->ui[y+1][x+1];
+#endif
+   }
+
+
+   /*
+    * Do blending
+    */
+   if (spu.blend.blend_enable) {
+      /* blending terms, misc regs */
+      vector float term1r, term1g, term1b, term1a;
+      vector float term2r, term2g, term2b, term2a;
+      vector float one, tmp;
+
+      vector float fbRGBA[4];  /* current framebuffer colors */
+
+      /* convert framebuffer colors from packed int to vector float */
+      {
+         vector float temp[4]; /* float colors in AOS form */
+         switch (spu.fb.color_format) {
+         case PIPE_FORMAT_B8G8R8A8_UNORM:
+            temp[0] = spu_unpack_B8G8R8A8(fbc0);
+            temp[1] = spu_unpack_B8G8R8A8(fbc1);
+            temp[2] = spu_unpack_B8G8R8A8(fbc2);
+            temp[3] = spu_unpack_B8G8R8A8(fbc3);
+            break;
+         case PIPE_FORMAT_A8R8G8B8_UNORM:
+            temp[0] = spu_unpack_A8R8G8B8(fbc0);
+            temp[1] = spu_unpack_A8R8G8B8(fbc1);
+            temp[2] = spu_unpack_A8R8G8B8(fbc2);
+            temp[3] = spu_unpack_A8R8G8B8(fbc3);
+            break;
+         default:
+            ASSERT(0);
+         }
+         _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
+      }
+
+      /*
+       * Compute Src RGB terms
+       */
+      switch (spu.blend.rgb_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1r = fragR;
+         term1g = fragG;
+         term1b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term1r =
+         term1g =
+         term1b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1r = spu_mul(fragR, fragR);
+         term1g = spu_mul(fragG, fragG);
+         term1b = spu_mul(fragB, fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1r = spu_mul(fragR, fragA);
+         term1g = spu_mul(fragG, fragA);
+         term1b = spu_mul(fragB, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Src Alpha term
+       */
+      switch (spu.blend.alpha_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1a = spu_mul(fragA, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest RGB terms
+       */
+      switch (spu.blend.rgb_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2r = fragR;
+         term2g = fragG;
+         term2b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term2r =
+         term2g =
+         term2b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2r = spu_mul(fbRGBA[0], fragR);
+         term2g = spu_mul(fbRGBA[1], fragG);
+         term2b = spu_mul(fbRGBA[2], fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fragA);
+         term2g = spu_mul(fbRGBA[1], fragA);
+         term2b = spu_mul(fbRGBA[2], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2r = spu_mul(fbRGBA[0], tmp);
+         term2g = spu_mul(fbRGBA[1], tmp);
+         term2b = spu_mul(fbRGBA[2], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest Alpha term
+       */
+      switch (spu.blend.alpha_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2a = spu_mul(fbRGBA[3], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest RGB terms
+       */
+      switch (spu.blend.rgb_func) {
+      case PIPE_BLEND_ADD:
+         fragR = spu_add(term1r, term2r);
+         fragG = spu_add(term1g, term2g);
+         fragB = spu_add(term1b, term2b);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragR = spu_sub(term1r, term2r);
+         fragG = spu_sub(term1g, term2g);
+         fragB = spu_sub(term1b, term2b);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest A term
+       */
+      switch (spu.blend.alpha_func) {
+      case PIPE_BLEND_ADD:
+         fragA = spu_add(term1a, term2a);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragA = spu_sub(term1a, term2a);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+   }
+
+
+   /*
+    * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA.
+    */
+#if 0
+   /* original code */
+   {
+      vector float frag_soa[4];
+      frag_soa[0] = fragR;
+      frag_soa[1] = fragG;
+      frag_soa[2] = fragB;
+      frag_soa[3] = fragA;
+      _transpose_matrix4x4(frag_aos, frag_soa);
+   }
+#else
+   /* short-cut relying on function parameter layout: */
+   _transpose_matrix4x4(frag_aos, &fragR);
+   (void) fragG;
+   (void) fragB;
+#endif
+
+   /*
+    * Pack fragment float colors into 32-bit RGBA words.
+    */
+   switch (spu.fb.color_format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
+      break;
+   default:
+      fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
+      ASSERT(0);
+   }
+
+
+   /*
+    * Do color masking
+    */
+   if (spu.blend.colormask != 0xf) {
+      uint cmask = 0x0; /* each byte corresponds to a color channel */
+
+      /* Form bitmask depending on color buffer format and colormask bits */
+      switch (spu.fb.color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         if (spu.blend.colormask & PIPE_MASK_R)
+            cmask |= 0x00ff0000; /* red */
+         if (spu.blend.colormask & PIPE_MASK_G)
+            cmask |= 0x0000ff00; /* green */
+         if (spu.blend.colormask & PIPE_MASK_B)
+            cmask |= 0x000000ff; /* blue */
+         if (spu.blend.colormask & PIPE_MASK_A)
+            cmask |= 0xff000000; /* alpha */
+         break;
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         if (spu.blend.colormask & PIPE_MASK_R)
+            cmask |= 0x0000ff00; /* red */
+         if (spu.blend.colormask & PIPE_MASK_G)
+            cmask |= 0x00ff0000; /* green */
+         if (spu.blend.colormask & PIPE_MASK_B)
+            cmask |= 0xff000000; /* blue */
+         if (spu.blend.colormask & PIPE_MASK_A)
+            cmask |= 0x000000ff; /* alpha */
+         break;
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Apply color mask to the 32-bit packed colors.
+       * if (cmask[i])
+       *    frag color[i] = frag color[i];
+       * else
+       *    frag color[i] = framebuffer color[i];
+       */
+      fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
+      fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
+      fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
+      fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
+   }
+
+
+   /*
+    * Do logic ops
+    */
+   if (spu.blend.logicop_enable) {
+      /* XXX to do */
+      /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
+   }
+
+
+   /*
+    * If mask is non-zero, mark tile as dirty.
+    */
+   if (spu_extract(spu_orx(mask), 0)) {
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   else {
+      /* write no fragments */
+      return;
+   }
+
+
+   /*
+    * Write new fragment/quad colors to the framebuffer/tile.
+    * Only write pixels where the corresponding mask word is set.
+    */
+#if LINEAR_QUAD_LAYOUT
+   /*
+    * Quad layout:
+    *  +--+--+--+--+
+    *  |p0|p1|p2|p3|...
+    *  +--+--+--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y][x*2] = fragc0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y][x*2+1] = fragc1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y][x*2+2] = fragc2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y][x*2+3] = fragc3;
+#else
+   /*
+    * Quad layout:
+    *  +--+--+
+    *  |p0|p1|...
+    *  +--+--+
+    *  |p2|p3|...
+    *  +--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y+0][x+0] = fragc0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y+0][x+1] = fragc1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y+1][x+0] = fragc2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y+1][x+1] = fragc3;
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
new file mode 100644
index 0000000000..a61689c83a
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -0,0 +1,45 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_PER_FRAGMENT_OP
+#define SPU_PER_FRAGMENT_OP
+
+
+extern void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask,
+                          uint facing);
+
+
+#endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
new file mode 100644
index 0000000000..82dbeb26b7
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -0,0 +1,299 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdio.h>
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_tri.h"
+#include "spu_tile.h"
+#include "cell/common.h"
+#include "util/u_memory.h"
+
+
+/**
+ * Given a rendering command's bounding box (in pixels) compute the
+ * location of the corresponding screen tile bounding box.
+ */
+static INLINE void
+tile_bounding_box(const struct cell_command_render *render,
+                  uint *txmin, uint *tymin,
+                  uint *box_num_tiles, uint *box_width_tiles)
+{
+#if 0
+   /* Debug: full-window bounding box */
+   uint txmax = spu.fb.width_tiles - 1;
+   uint tymax = spu.fb.height_tiles - 1;
+   *txmin = 0;
+   *tymin = 0;
+   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   *box_width_tiles = spu.fb.width_tiles;
+   (void) render;
+   (void) txmax;
+   (void) tymax;
+#else
+   uint txmax, tymax, box_height_tiles;
+
+   *txmin = (uint) render->xmin / TILE_SIZE;
+   *tymin = (uint) render->ymin / TILE_SIZE;
+   txmax = (uint) render->xmax / TILE_SIZE;
+   tymax = (uint) render->ymax / TILE_SIZE;
+   if (txmax >= spu.fb.width_tiles)
+      txmax = spu.fb.width_tiles-1;
+   if (tymax >= spu.fb.height_tiles)
+      tymax = spu.fb.height_tiles-1;
+   *box_width_tiles = txmax - *txmin + 1;
+   box_height_tiles = tymax - *tymin + 1;
+   *box_num_tiles = *box_width_tiles * box_height_tiles;
+#endif
+#if 0
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
+          render->xmin, render->ymin, render->xmax, render->ymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
+#endif
+}
+
+
+/** Check if the tile at (tx,ty) belongs to this SPU */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
+}
+
+
+/**
+ * Start fetching non-clear color/Z tiles from main memory
+ */
+static INLINE void
+get_cz_tiles(uint tx, uint ty)
+{
+   if (spu.read_depth) {
+      if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+         //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
+         get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
+         spu.cur_ztile_status = TILE_STATUS_GETTING;
+      }
+   }
+
+   if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
+      //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
+      get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_GETTING;
+   }
+}
+
+
+/**
+ * Start putting dirty color/Z tiles back to main memory
+ */
+static INLINE void
+put_cz_tiles(uint tx, uint ty)
+{
+   if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
+      put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
+   }
+   else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty);
+   }
+
+   if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
+      put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty);
+   }
+}
+
+
+/**
+ * Wait for 'put' of color/z tiles to complete.
+ */
+static INLINE void
+wait_put_cz_tiles(void)
+{
+   wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+   if (spu.read_depth) {
+      wait_on_mask(1 << TAG_WRITE_TILE_Z);
+   }
+}
+
+
+/**
+ * Render primitives
+ * \param pos_incr  returns value indicating how may words to skip after
+ *                  this command in the batch buffer
+ */
+void
+cmd_render(const struct cell_command_render *render, uint *pos_incr)
+{
+   /* we'll DMA into these buffers */
+   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   const uint vertex_size = render->vertex_size; /* in bytes */
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   uint index_bytes;
+   const ubyte *vertices;
+   const ushort *indexes;
+   uint i, j;
+
+
+   if (Debug) {
+      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
+             "inline_vert=%u\n",
+             spu.init.id,
+             render->prim_type,
+             render->num_verts,
+             render->num_indexes,
+             render->inline_verts);
+
+      /*
+      printf("       bound: %g, %g .. %g, %g\n",
+             render->xmin, render->ymin, render->xmax, render->ymax);
+      */
+   }
+
+   ASSERT(sizeof(*render) % 4 == 0);
+   ASSERT(total_vertex_bytes % 16 == 0);
+   ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
+   ASSERT(render->num_indexes % 3 == 0);
+
+
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   index_bytes = ROUNDUP8(render->num_indexes * 2);
+   *pos_incr = index_bytes / 8 + sizeof(*render) / 8;
+
+
+   if (render->inline_verts) {
+      /* Vertices are after indexes in batch buffer at next 16-byte addr */
+      vertices = (const ubyte *) render + (*pos_incr * 8);
+      vertices = (const ubyte *) align_pointer((void *) vertices, 16);
+      ASSERT_ALIGN16(vertices);
+      *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8;
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data we won't use */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
+              total_vertex_bytes,  /* size */
+              TAG_VERTEX_BUFFER,
+              0, /* tid */
+              0  /* rid */);
+
+      vertices = vertex_data;
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+   }
+
+
+   /**
+    ** find tiles which intersect the prim bounding box
+    **/
+   uint txmin, tymin, box_width_tiles, box_num_tiles;
+   tile_bounding_box(render, &txmin, &tymin,
+                     &box_num_tiles, &box_width_tiles);
+
+
+   /* make sure any pending clears have completed */
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
+
+
+   /**
+    ** loop over tiles, rendering tris
+    **/
+   for (i = 0; i < box_num_tiles; i++) {
+      const uint tx = txmin + i % box_width_tiles;
+      const uint ty = tymin + i / box_width_tiles;
+
+      ASSERT(tx < spu.fb.width_tiles);
+      ASSERT(ty < spu.fb.height_tiles);
+
+      if (!my_tile(tx, ty))
+         continue;
+
+      spu.cur_ctile_status = spu.ctile_status[ty][tx];
+      spu.cur_ztile_status = spu.ztile_status[ty][tx];
+
+      get_cz_tiles(tx, ty);
+
+      uint drawn = 0;
+
+      /* loop over tris */
+      for (j = 0; j < render->num_indexes; j += 3) {
+         const float *v0, *v1, *v2;
+
+         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
+         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
+         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
+
+         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
+      }
+
+      //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
+
+      /* write color/z tiles back to main framebuffer, if dirtied */
+      put_cz_tiles(tx, ty);
+
+      wait_put_cz_tiles(); /* XXX seems unnecessary... */
+
+      spu.ctile_status[ty][tx] = spu.cur_ctile_status;
+      spu.ztile_status[ty][tx] = spu.cur_ztile_status;
+   }
+
+   if (Debug)
+      printf("SPU %u: RENDER done\n",
+             spu.init.id);
+}
diff --git a/src/gallium/drivers/cell/spu/spu_render.h b/src/gallium/drivers/cell/spu/spu_render.h
new file mode 100644
index 0000000000..493434f087
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_render.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_RENDER_H
+#define SPU_RENDER_H
+
+#include "cell/common.h"
+
+extern void
+cmd_render(const struct cell_command_render *render, uint *pos_incr);
+
+#endif /* SPU_RENDER_H */
+
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
new file mode 100644
index 0000000000..117b8a36f8
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -0,0 +1,200 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "spu_main.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_colorpack.h"
+#include "spu_dcache.h"
+
+
+/**
+ * Mark all tex cache entries as invalid.
+ */
+void
+invalidate_tex_cache(void)
+{
+   uint unit = 0;
+   uint bytes = 4 * spu.texture[unit].width
+      * spu.texture[unit].height;
+
+   spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
+}
+
+
+/**
+ * XXX look into getting texels for all four pixels in a quad at once.
+ */
+static uint
+get_texel(uint unit, vec_uint4 coordinate)
+{
+   /*
+    * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as
+    * SIMD since X and Y are already in a SIMD register.
+    */
+   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
+   ushort x = spu_extract(coordinate, 0);
+   ushort y = spu_extract(coordinate, 1);
+   unsigned tile_offset = sizeof(tile_t)
+      * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
+   ushort texel_offset = (ushort) 4
+      * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
+   vec_uint4 tmp;
+
+   spu_dcache_fetch_unaligned((qword *) & tmp,
+                              texture_ea + tile_offset + texel_offset,
+                              4);
+   return spu_extract(tmp, 0);
+}
+
+
+/**
+ * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
+ *
+ * NOTE: in the typical case of bilinear filtering, the four texels
+ * are in a 2x2 group so we could get by with just two dcache fetches
+ * (two side-by-side texels per fetch).  But when bilinear filtering
+ * wraps around a texture edge, we'll probably need code like we have
+ * now.
+ * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time,
+ * it's quite likely that the four pixels in a quad will need some of the
+ * same texels.  So look into doing texture fetches for four pixels at
+ * a time.
+ */
+static void
+get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+{
+   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
+   vec_uint4 tile_x = spu_rlmask(x, -5);
+   vec_uint4 tile_y = spu_rlmask(y, -5);
+   const qword offset_x = si_andi((qword) x, 0x1f);
+   const qword offset_y = si_andi((qword) y, 0x1f);
+
+   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
+   const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
+
+   qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
+   tile_offset = si_mpy((qword) tile_offset, tile_size);
+
+   qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
+   texel_offset = si_mpyui(texel_offset, 4);
+   
+   vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
+   
+   spu_dcache_fetch_unaligned((qword *) & texels[0],
+                              texture_ea + spu_extract(offset, 0), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[1],
+                              texture_ea + spu_extract(offset, 1), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[2],
+                              texture_ea + spu_extract(offset, 2), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[3],
+                              texture_ea + spu_extract(offset, 3), 4);
+}
+
+
+/**
+ * Get texture sample at texcoord.
+ */
+vector float
+sample_texture_nearest(uint unit, vector float texcoord)
+{
+   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
+   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
+   itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
+   uint texel = get_texel(unit, itc);
+   return spu_unpack_A8R8G8B8(texel);
+}
+
+
+vector float
+sample_texture_bilinear(uint unit, vector float texcoord)
+{
+   static const vec_uint4 offset_x = {0, 0, 1, 1};
+   static const vec_uint4 offset_y = {0, 1, 0, 1};
+
+   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
+   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
+
+   /* integer texcoords S,T: */
+   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
+
+   vec_uint4 texels[4];
+   
+   /* setup texcoords for quad:
+    *  +-----+-----+
+    *  |x0,y0|x1,y1|
+    *  +-----+-----+
+    *  |x2,y2|x3,y3|
+    *  +-----+-----+
+    */
+   vec_uint4 x = spu_splats(spu_extract(itc, 0));
+   vec_uint4 y = spu_splats(spu_extract(itc, 1));
+   x = spu_add(x, offset_x);
+   y = spu_add(y, offset_y);
+
+   /* GL_REPEAT wrap mode: */
+   x = spu_and(x, spu.texture[unit].tex_size_x_mask);
+   y = spu_and(y, spu.texture[unit].tex_size_y_mask);
+
+   get_four_texels(unit, x, y, texels);
+
+   /* integer A8R8G8B8 to float texel conversion */
+   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
+   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
+   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
+   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
+
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
+   vector signed int itc1024 = spu_convts(tc1024, 0);
+   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
+   vector float weight = spu_convtf(itc1024, 10);
+
+   /* smeared frac and 1-frac */
+   vector float sfrac = spu_splats(spu_extract(weight, 0));
+   vector float tfrac = spu_splats(spu_extract(weight, 1));
+   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
+   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
+
+   /* multiply the samples (colors) by the S/T weights */
+   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
+   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
+   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
+   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
+
+   /* compute sum of weighted samples */
+   vector float texel_sum = spu_add(texel00, texel01);
+   texel_sum = spu_add(texel_sum, texel10);
+   texel_sum = spu_add(texel_sum, texel11);
+
+   return texel_sum;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
new file mode 100644
index 0000000000..f7c9738be8
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -0,0 +1,47 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TEXTURE_H
+#define SPU_TEXTURE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+extern void
+invalidate_tex_cache(void);
+
+
+extern vector float
+sample_texture_nearest(uint unit, vector float texcoord);
+
+
+extern vector float
+sample_texture_bilinear(uint unit, vector float texcoord);
+
+
+#endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
new file mode 100644
index 0000000000..6905015a48
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -0,0 +1,126 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#include "spu_tile.h"
+#include "spu_main.h"
+
+
+/**
+ * Get tile of color or Z values from main memory, put into SPU memory.
+ */
+void
+get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
+{
+   const uint offset = ty * spu.fb.width_tiles + tx;
+   const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4);
+   const ubyte *src = zBuf ? spu.fb.depth_start : spu.fb.color_start;
+
+   src += offset * bytesPerTile;
+
+   ASSERT(tx < spu.fb.width_tiles);
+   ASSERT(ty < spu.fb.height_tiles);
+   ASSERT_ALIGN16(tile);
+   /*
+   printf("get_tile:  dest: %p  src: 0x%x  size: %d\n",
+          tile, (unsigned int) src, bytesPerTile);
+   */
+   mfc_get(tile->ui,  /* dest in local memory */
+           (unsigned int) src, /* src in main memory */
+           bytesPerTile,
+           tag,
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+/**
+ * Move tile of color or Z values from SPU memory to main memory.
+ */
+void
+put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
+{
+   const uint offset = ty * spu.fb.width_tiles + tx;
+   const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4);
+   ubyte *dst = zBuf ? spu.fb.depth_start : spu.fb.color_start;
+
+   dst += offset * bytesPerTile;
+
+   ASSERT(tx < spu.fb.width_tiles);
+   ASSERT(ty < spu.fb.height_tiles);
+   ASSERT_ALIGN16(tile);
+   /*
+   printf("SPU %u: put_tile:  src: %p  dst: 0x%x  size: %d\n",
+          spu.init.id,
+          tile, (unsigned int) dst, bytesPerTile);
+   */
+   mfc_put((void *) tile->ui,  /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           bytesPerTile,
+           tag,
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+/**
+ * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
+ * tiles back to the main framebuffer.
+ */
+void
+really_clear_tiles(uint surfaceIndex)
+{
+   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   uint i;
+
+   if (surfaceIndex == 0) {
+      clear_c_tile(&spu.ctile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         }
+      }
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+#if 0
+   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
new file mode 100644
index 0000000000..7bfb52be8f
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -0,0 +1,75 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TILE_H
+#define SPU_TILE_H
+
+
+#include <libmisc.h>
+#include <spu_mfcio.h>
+#include "spu_main.h"
+#include "cell/common.h"
+
+
+
+extern void
+get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
+
+extern void
+put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
+
+extern void
+really_clear_tiles(uint surfaceIndex);
+
+
+static INLINE void
+clear_c_tile(tile_t *ctile)
+{
+   memset32((uint*) ctile->ui,
+            spu.fb.color_clear_value,
+            TILE_SIZE * TILE_SIZE);
+}
+
+
+static INLINE void
+clear_z_tile(tile_t *ztile)
+{
+   if (spu.fb.zsize == 2) {
+      memset16((ushort*) ztile->us,
+               spu.fb.depth_clear_value,
+               TILE_SIZE * TILE_SIZE);
+   }
+   else {
+      ASSERT(spu.fb.zsize != 0);
+      memset32((uint*) ztile->ui,
+               spu.fb.depth_clear_value,
+               TILE_SIZE * TILE_SIZE);
+   }
+}
+
+
+#endif /* SPU_TILE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
new file mode 100644
index 0000000000..87991c3136
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -0,0 +1,980 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Triangle rendering within a tile.
+ */
+
+#include <transpose_matrix4x4.h>
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "util/u_math.h"
+#include "spu_colorpack.h"
+#include "spu_main.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_tri.h"
+
+
+/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
+typedef vector unsigned int mask_t;
+
+typedef union
+{
+   vector float v;
+   float f[4];
+} float4;
+
+
+/**
+ * Simplified types taken from other parts of Gallium
+ */
+struct vertex_header {
+   vector float data[1];
+};
+
+
+
+/* XXX fix this */
+#undef CEILF
+#define CEILF(X) ((float) (int) ((X) + 0.99999))
+
+
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
+#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
+#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
+#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
+#define MASK_ALL          0xf
+
+
+#define DEBUG_VERTS 0
+
+/**
+ * Triangle edge info
+ */
+struct edge {
+   float dx;		/**< X(v1) - X(v0), used only during setup */
+   float dy;		/**< Y(v1) - Y(v0), used only during setup */
+   float dxdy;		/**< dx/dy */
+   float sx, sy;	/**< first sample point coord */
+   int lines;		/**< number of lines on this edge */
+};
+
+
+struct interp_coef
+{
+   float4 a0;
+   float4 dadx;
+   float4 dady;
+};
+
+
+/**
+ * Triangle setup info (derived from draw_stage).
+ * Also used for line drawing (taking some liberties).
+ */
+struct setup_stage {
+
+   /* Vertices are just an array of floats making up each attribute in
+    * turn.  Currently fixed at 4 floats, but should change in time.
+    * Codegen will help cope with this.
+    */
+   const struct vertex_header *vmax;
+   const struct vertex_header *vmid;
+   const struct vertex_header *vmin;
+   const struct vertex_header *vprovoke;
+
+   struct edge ebot;
+   struct edge etop;
+   struct edge emaj;
+
+   float oneoverarea;
+
+   uint facing;
+
+   uint tx, ty;
+
+   int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
+
+#if 0
+   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
+#else
+   struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
+#endif
+
+#if 0
+   struct quad_header quad; 
+#endif
+
+   struct {
+      int left[2];   /**< [0] = row0, [1] = row1 */
+      int right[2];
+      int y;
+      unsigned y_flags;
+      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
+   } span;
+};
+
+
+
+static struct setup_stage setup;
+
+
+
+
+#if 0
+/**
+ * Basically a cast wrapper.
+ */
+static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
+{
+   return (struct setup_stage *)stage;
+}
+#endif
+
+#if 0
+/**
+ * Clip setup.quad against the scissor/surface bounds.
+ */
+static INLINE void
+quad_clip(struct setup_stage *setup)
+{
+   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
+   const int minx = (int) cliprect->minx;
+   const int maxx = (int) cliprect->maxx;
+   const int miny = (int) cliprect->miny;
+   const int maxy = (int) cliprect->maxy;
+
+   if (setup.quad.x0 >= maxx ||
+       setup.quad.y0 >= maxy ||
+       setup.quad.x0 + 1 < minx ||
+       setup.quad.y0 + 1 < miny) {
+      /* totally clipped */
+      setup.quad.mask = 0x0;
+      return;
+   }
+   if (setup.quad.x0 < minx)
+      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+   if (setup.quad.y0 < miny)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+   if (setup.quad.x0 == maxx - 1)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+   if (setup.quad.y0 == maxy - 1)
+      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+}
+#endif
+
+#if 0
+/**
+ * Emit a quad (pass to next stage) with clipping.
+ */
+static INLINE void
+clip_emit_quad(struct setup_stage *setup)
+{
+   quad_clip(setup);
+   if (setup.quad.mask) {
+      struct softpipe_context *sp = setup.softpipe;
+      sp->quad.first->run(sp->quad.first, &setup.quad);
+   }
+}
+#endif
+
+/**
+ * Evaluate attribute coefficients (plane equations) to compute
+ * attribute values for the four fragments in a quad.
+ * Eg: four colors will be computed (in AoS format).
+ */
+static INLINE void
+eval_coeff(uint slot, float x, float y, vector float result[4])
+{
+   switch (spu.vertex_info.interp_mode[slot]) {
+   case INTERP_CONSTANT:
+      result[QUAD_TOP_LEFT] =
+      result[QUAD_TOP_RIGHT] =
+      result[QUAD_BOTTOM_LEFT] =
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
+      break;
+
+   case INTERP_LINEAR:
+      /* fall-through, for now */
+   default:
+      {
+         register vector float dadx = setup.coef[slot].dadx.v;
+         register vector float dady = setup.coef[slot].dady.v;
+         register vector float topLeft
+            = spu_add(setup.coef[slot].a0.v,
+                      spu_add(spu_mul(spu_splats(x), dadx),
+                              spu_mul(spu_splats(y), dady)));
+
+         result[QUAD_TOP_LEFT] = topLeft;
+         result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
+         result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
+         result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
+      }
+   }
+}
+
+
+/**
+ * As above, but return 4 vectors in SOA format.
+ * XXX this will all be re-written someday.
+ */
+static INLINE void
+eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+{
+   eval_coeff(slot, x, y, result);
+   _transpose_matrix4x4(result, result);
+}
+
+
+
+static INLINE vector float
+eval_z(float x, float y)
+{
+   const uint slot = 0;
+   const float dzdx = setup.coef[slot].dadx.f[2];
+   const float dzdy = setup.coef[slot].dady.f[2];
+   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+   const vector float topLeftv = spu_splats(topLeft);
+   const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
+   return spu_add(topLeftv, derivs);
+}
+
+
+/**
+ * Emit a quad (pass to next stage).  No clipping is done.
+ * Note: about 1/5 to 1/7 of the time, mask is zero and this function
+ * should be skipped.  But adding the test for that slows things down
+ * overall.
+ */
+static INLINE void
+emit_quad( int x, int y, mask_t mask)
+{
+   /* If any bits in mask are set... */
+   if (spu_extract(spu_orx(mask), 0)) {
+      const int ix = x - setup.cliprect_minx;
+      const int iy = y - setup.cliprect_miny;
+
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+
+      if (0/*spu.texture[0].start*/) {
+         /*
+          * Temporary texture mapping path
+          * This will go away when fragment programs support TEX inst.
+          */
+         const uint unit = 0;
+         vector float colors[4];
+         vector float texcoords[4];
+         eval_coeff(2, (float) x, (float) y, texcoords);
+
+         if (spu_extract(mask, 0))
+            colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
+         if (spu_extract(mask, 1))
+            colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
+         if (spu_extract(mask, 2))
+            colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
+         if (spu_extract(mask, 3))
+            colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
+
+
+         if (spu.texture[1].start) {
+            /* multi-texture mapping */
+            const uint unit = 1;
+            vector float colors1[4];
+
+            eval_coeff(2, (float) x, (float) y, texcoords);
+
+            if (spu_extract(mask, 0))
+               colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
+            if (spu_extract(mask, 1))
+               colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
+            if (spu_extract(mask, 2))
+               colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
+            if (spu_extract(mask, 3))
+               colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
+
+            /* hack: modulate first texture by second */
+            colors[0] = spu_mul(colors[0], colors1[0]);
+            colors[1] = spu_mul(colors[1], colors1[1]);
+            colors[2] = spu_mul(colors[2], colors1[2]);
+            colors[3] = spu_mul(colors[3], colors1[3]);
+         }
+
+         {
+            /* Convert fragment data from AoS to SoA format.
+             * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+             * This is temporary!
+             */
+            vector float soa_frag[4];
+            _transpose_matrix4x4(soa_frag, colors);
+
+            vector float fragZ = eval_z((float) x, (float) y);
+
+            /* Do all per-fragment/quad operations here, including:
+             * alpha test, z test, stencil test, blend and framebuffer writing.
+             */
+            spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                             fragZ,
+                             soa_frag[0], soa_frag[1],
+                             soa_frag[2], soa_frag[3],
+                             mask,
+                             setup.facing);
+         }
+
+      }
+      else {
+         /*
+          * Run fragment shader, execute per-fragment ops, update fb/tile.
+          */
+         vector float inputs[4*4], outputs[2*4];
+         vector float fragZ = eval_z((float) x, (float) y);
+
+         /* setup inputs */
+#if 0
+         eval_coeff_soa(1, (float) x, (float) y, inputs);
+#else
+         uint i;
+         for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+            eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
+         }
+#endif
+         ASSERT(spu.fragment_program);
+         ASSERT(spu.fragment_ops);
+
+         /* Execute the current fragment program */
+         spu.fragment_program(inputs, outputs, spu.constants);
+
+         /* Execute per-fragment/quad operations, including:
+          * alpha test, z test, stencil test, blend and framebuffer writing.
+          */
+         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                          fragZ,
+                          outputs[0*4+0],
+                          outputs[0*4+1],
+                          outputs[0*4+2],
+                          outputs[0*4+3],
+                          mask,
+                          setup.facing);
+      }
+   }
+}
+
+
+/**
+ * Given an X or Y coordinate, return the block/quad coordinate that it
+ * belongs to.
+ */
+static INLINE int block( int x )
+{
+   return x & ~1;
+}
+
+
+/**
+ * Compute mask which indicates which pixels in the 2x2 quad are actually inside
+ * the triangle's bounds.
+ * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
+ */
+static INLINE mask_t calculate_mask( int x )
+{
+   /* This is a little tricky.
+    * Use & instead of && to avoid branches.
+    * Use negation to convert true/false to ~0/0 values.
+    */
+   mask_t mask;
+   mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
+   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
+   mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
+   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
+   return mask;
+}
+
+
+/**
+ * Render a horizontal span of quads
+ */
+static void flush_spans( void )
+{
+   int minleft, maxright;
+   int x;
+
+   switch (setup.span.y_flags) {
+   case 0x3:
+      /* both odd and even lines written (both quad rows) */
+      minleft = MIN2(setup.span.left[0], setup.span.left[1]);
+      maxright = MAX2(setup.span.right[0], setup.span.right[1]);
+      break;
+
+   case 0x1:
+      /* only even line written (quad top row) */
+      minleft = setup.span.left[0];
+      maxright = setup.span.right[0];
+      break;
+
+   case 0x2:
+      /* only odd line written (quad bottom row) */
+      minleft = setup.span.left[1];
+      maxright = setup.span.right[1];
+      break;
+
+   default:
+      return;
+   }
+
+
+   /* OK, we're very likely to need the tile data now.
+    * clear or finish waiting if needed.
+    */
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+      clear_c_tile(&spu.ctile);
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
+
+   if (spu.read_depth) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
+      }
+      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+         clear_z_tile(&spu.ztile);
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
+   }
+
+   /* XXX this loop could be moved into the above switch cases and
+    * calculate_mask() could be simplified a bit...
+    */
+   for (x = block(minleft); x <= block(maxright); x += 2) {
+#if 1
+      emit_quad( x, setup.span.y, calculate_mask( x ));
+#endif
+   }
+
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
+}
+
+#if DEBUG_VERTS
+static void print_vertex(const struct vertex_header *v)
+{
+   int i;
+   fprintf(stderr, "Vertex: (%p)\n", v);
+   for (i = 0; i < setup.quad.nr_attrs; i++) {
+      fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
+              v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
+   }
+}
+#endif
+
+
+static boolean setup_sort_vertices(const struct vertex_header *v0,
+                                   const struct vertex_header *v1,
+                                   const struct vertex_header *v2)
+{
+
+#if DEBUG_VERTS
+   fprintf(stderr, "Triangle:\n");
+   print_vertex(v0);
+   print_vertex(v1);
+   print_vertex(v2);
+#endif
+
+   setup.vprovoke = v2;
+
+   /* determine bottom to top order of vertices */
+   {
+      float y0 = spu_extract(v0->data[0], 1);
+      float y1 = spu_extract(v1->data[0], 1);
+      float y2 = spu_extract(v2->data[0], 1);
+      if (y0 <= y1) {
+	 if (y1 <= y2) {
+	    /* y0<=y1<=y2 */
+	    setup.vmin = v0;   
+	    setup.vmid = v1;   
+	    setup.vmax = v2;
+	 }
+	 else if (y2 <= y0) {
+	    /* y2<=y0<=y1 */
+	    setup.vmin = v2;   
+	    setup.vmid = v0;   
+	    setup.vmax = v1;   
+	 }
+	 else {
+	    /* y0<=y2<=y1 */
+	    setup.vmin = v0;   
+	    setup.vmid = v2;   
+	    setup.vmax = v1;  
+	 }
+      }
+      else {
+	 if (y0 <= y2) {
+	    /* y1<=y0<=y2 */
+	    setup.vmin = v1;   
+	    setup.vmid = v0;   
+	    setup.vmax = v2;  
+	 }
+	 else if (y2 <= y1) {
+	    /* y2<=y1<=y0 */
+	    setup.vmin = v2;   
+	    setup.vmid = v1;   
+	    setup.vmax = v0;  
+	 }
+	 else {
+	    /* y1<=y2<=y0 */
+	    setup.vmin = v1;   
+	    setup.vmid = v2;   
+	    setup.vmax = v0;
+	 }
+      }
+   }
+
+   /* Check if triangle is completely outside the tile bounds */
+   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
+      return FALSE;
+   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
+      return FALSE;
+   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
+      return FALSE;
+   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
+      return FALSE;
+
+   setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
+   setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
+
+   /*
+    * Compute triangle's area.  Use 1/area to compute partial
+    * derivatives of attributes later.
+    *
+    * The area will be the same as prim->det, but the sign may be
+    * different depending on how the vertices get sorted above.
+    *
+    * To determine whether the primitive is front or back facing we
+    * use the prim->det value because its sign is correct.
+    */
+   {
+      const float area = (setup.emaj.dx * setup.ebot.dy - 
+			    setup.ebot.dx * setup.emaj.dy);
+
+      setup.oneoverarea = 1.0f / area;
+      /*
+      _mesa_printf("%s one-over-area %f  area %f  det %f\n",
+                   __FUNCTION__, setup.oneoverarea, area, prim->det );
+      */
+   }
+
+#if 0
+   /* We need to know if this is a front or back-facing triangle for:
+    *  - the GLSL gl_FrontFacing fragment attribute (bool)
+    *  - two-sided stencil test
+    */
+   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+#endif
+
+   return TRUE;
+}
+
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ * The value value comes from vertex->data[slot].
+ * The result will be put into setup.coef[slot].a0.
+ * \param slot  which attribute slot 
+ */
+static INLINE void
+const_coeff(uint slot)
+{
+   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
+}
+
+
+/**
+ * Compute a0, dadx and dady for a linearly interpolated coefficient,
+ * for a triangle.
+ */
+static INLINE void
+tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
+{
+   uint i;
+   const float *vmin_d = (float *) &setup.vmin->data[slot];
+   const float *vmid_d = (float *) &setup.vmid->data[slot];
+   const float *vmax_d = (float *) &setup.vmax->data[slot];
+   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
+   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+
+   for (i = firstComp; i < lastComp; i++) {
+      float botda = vmid_d[i] - vmin_d[i];
+      float majda = vmax_d[i] - vmin_d[i];
+      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
+   
+      ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
+
+      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
+
+      /* calculate a0 as the value which would be sampled for the
+       * fragment at (0,0), taking into account that we want to sample at
+       * pixel centers, in other words (0.5, 0.5).
+       *
+       * this is neat but unfortunately not a good way to do things for
+       * triangles with very large values of dadx or dady as it will
+       * result in the subtraction and re-addition from a0 of a very
+       * large number, which means we'll end up loosing a lot of the
+       * fractional bits and precision from a0.  the way to fix this is
+       * to define a0 as the sample at a pixel center somewhere near vmin
+       * instead - i'll switch to this later.
+       */
+      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
+                                 (setup.coef[slot].dadx.f[i] * x + 
+                                  setup.coef[slot].dady.f[i] * y));
+   }
+
+   /*
+   _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
+		slot, "xyzw"[i], 
+		setup.coef[slot].a0[i],
+		setup.coef[slot].dadx.f[i],
+		setup.coef[slot].dady.f[i]);
+   */
+}
+
+
+/**
+ * As above, but interp setup all four vector components.
+ */
+static INLINE void
+tri_linear_coeff4(uint slot)
+{
+   const vector float vmin_d = setup.vmin->data[slot];
+   const vector float vmid_d = setup.vmid->data[slot];
+   const vector float vmax_d = setup.vmax->data[slot];
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+                         
+   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+}
+
+
+
+#if 0
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void tri_persp_coeff( unsigned slot,
+                             unsigned i )
+{
+   /* premultiply by 1/w:
+    */
+   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
+   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
+   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
+
+   float botda = mida - mina;
+   float majda = maxa - mina;
+   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
+      
+   /*
+   printf("tri persp %d,%d: %f %f %f\n", slot, i,
+          setup.vmin->data[slot][i],
+          setup.vmid->data[slot][i],
+          setup.vmax->data[slot][i]
+          );
+   */
+
+   assert(slot < PIPE_MAX_SHADER_INPUTS);
+   assert(i <= 3);
+
+   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
+   setup.coef[slot].a0.f[i] = (mina - 
+			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
+			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+}
+#endif
+
+
+/**
+ * Compute the setup.coef[] array dadx, dady, a0 values.
+ * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
+ */
+static void setup_tri_coefficients(void)
+{
+#if 1
+   uint i;
+
+   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+      switch (spu.vertex_info.interp_mode[i]) {
+      case INTERP_NONE:
+         break;
+      case INTERP_POS:
+         /*tri_linear_coeff(i, 2, 3);*/
+         /* XXX interp W if PERSPECTIVE... */
+         tri_linear_coeff4(i);
+         break;
+      case INTERP_CONSTANT:
+         const_coeff(i);
+         break;
+      case INTERP_LINEAR:
+         tri_linear_coeff4(i);
+         break;
+      case INTERP_PERSPECTIVE:
+         tri_linear_coeff4(i);  /* temporary */
+         break;
+      default:
+         ASSERT(0);
+      }
+   }
+#else
+   ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
+   ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
+          spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
+   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
+   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
+#endif
+}
+
+
+static void setup_tri_edges(void)
+{
+   float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
+   float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
+
+   float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+   float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
+   float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
+
+   setup.emaj.sy = CEILF(vmin_y);
+   setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
+   setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
+   setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
+
+   setup.etop.sy = CEILF(vmid_y);
+   setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
+   setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
+   setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
+
+   setup.ebot.sy = CEILF(vmin_y);
+   setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
+   setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
+   setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
+}
+
+
+/**
+ * Render the upper or lower half of a triangle.
+ * Scissoring/cliprect is applied here too.
+ */
+static void subtriangle( struct edge *eleft,
+			 struct edge *eright,
+			 unsigned lines )
+{
+   const int minx = setup.cliprect_minx;
+   const int maxx = setup.cliprect_maxx;
+   const int miny = setup.cliprect_miny;
+   const int maxy = setup.cliprect_maxy;
+   int y, start_y, finish_y;
+   int sy = (int)eleft->sy;
+
+   ASSERT((int)eleft->sy == (int) eright->sy);
+
+   /* clip top/bottom */
+   start_y = sy;
+   finish_y = sy + lines;
+
+   if (start_y < miny)
+      start_y = miny;
+
+   if (finish_y > maxy)
+      finish_y = maxy;
+
+   start_y -= sy;
+   finish_y -= sy;
+
+   /*
+   _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
+   */
+
+   for (y = start_y; y < finish_y; y++) {
+
+      /* avoid accumulating adds as floats don't have the precision to
+       * accurately iterate large triangle edges that way.  luckily we
+       * can just multiply these days.
+       *
+       * this is all drowned out by the attribute interpolation anyway.
+       */
+      int left = (int)(eleft->sx + y * eleft->dxdy);
+      int right = (int)(eright->sx + y * eright->dxdy);
+
+      /* clip left/right */
+      if (left < minx)
+         left = minx;
+      if (right > maxx)
+         right = maxx;
+
+      if (left < right) {
+         int _y = sy + y;
+         if (block(_y) != setup.span.y) {
+            flush_spans();
+            setup.span.y = block(_y);
+         }
+
+         setup.span.left[_y&1] = left;
+         setup.span.right[_y&1] = right;
+         setup.span.y_flags |= 1<<(_y&1);
+      }
+   }
+
+
+   /* save the values so that emaj can be restarted:
+    */
+   eleft->sx += lines * eleft->dxdy;
+   eright->sx += lines * eright->dxdy;
+   eleft->sy += lines;
+   eright->sy += lines;
+}
+
+static float
+determinant( const float *v0,
+             const float *v1,
+             const float *v2 )
+{
+   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   const float ex = v0[0] - v2[0];
+   const float ey = v0[1] - v2[1];
+   const float fx = v1[0] - v2[0];
+   const float fy = v1[1] - v2[1];
+
+   /* det = cross(e,f).z */
+   return ex * fy - ey * fx;
+}
+
+
+/**
+ * Draw triangle into tile at (tx, ty) (tile coords)
+ * The tile data should have already been fetched.
+ */
+boolean
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
+{
+   setup.tx = tx;
+   setup.ty = ty;
+
+   /* set clipping bounds to tile bounds */
+   setup.cliprect_minx = tx * TILE_SIZE;
+   setup.cliprect_miny = ty * TILE_SIZE;
+   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
+   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
+
+   /* Before we sort vertices, determine the facing of the triangle,
+    * which will be needed for front/back-face stencil application
+    */
+   float det = determinant(v0, v1, v2);
+   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
+   if (!setup_sort_vertices((struct vertex_header *) v0,
+                            (struct vertex_header *) v1,
+                            (struct vertex_header *) v2)) {
+      return FALSE; /* totally clipped */
+   }
+
+   setup_tri_coefficients();
+   setup_tri_edges();
+
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
+   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
+
+   /*   init_constant_attribs( setup ); */
+      
+   if (setup.oneoverarea < 0.0) {
+      /* emaj on left:
+       */
+      subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
+      subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
+   }
+   else {
+      /* emaj on right:
+       */
+      subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
+      subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
+   }
+
+   flush_spans();
+
+   return TRUE;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
new file mode 100644
index 0000000000..abc3d35160
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_TRI_H
+#define SPU_TRI_H
+
+
+extern boolean
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
+
+
+#endif /* SPU_TRI_H */
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
new file mode 100644
index 0000000000..b8a0d4a265
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -0,0 +1,167 @@
+
+#include "cell/common.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_debug.h"
+#include "tgsi/tgsi_parse.h"
+//#include "tgsi_build.h"
+#include "tgsi/tgsi_util.h"
+
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->SwizzleX;
+   case 1:
+      return reg->SwizzleY;
+   case 2:
+      return reg->SwizzleZ;
+   case 3:
+      return reg->SwizzleW;
+   default:
+      ASSERT( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->ExtSwizzleX;
+   case 1:
+      return reg->ExtSwizzleY;
+   case 2:
+      return reg->ExtSwizzleZ;
+   case 3:
+      return reg->ExtSwizzleW;
+   default:
+      ASSERT( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register  *reg,
+   unsigned component )
+{
+   unsigned swizzle;
+
+   /*
+    * First, calculate  the   extended swizzle for a given channel. This will give
+    * us either a channel index into the simple swizzle or  a constant 1 or   0.
+    */
+   swizzle = tgsi_util_get_src_register_extswizzle(
+      &reg->SrcRegisterExtSwz,
+      component );
+
+   ASSERT (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   ASSERT (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   ASSERT (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   ASSERT (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   ASSERT (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   ASSERT (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+
+   /*
+    * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
+    * Leave the constants intact, they are   not   affected by the   simple swizzle.
+    */
+   if( swizzle <= TGSI_SWIZZLE_W ) {
+      swizzle = tgsi_util_get_src_register_swizzle(
+         &reg->SrcRegister,
+         component );
+   }
+
+   return swizzle;
+}
+
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const  struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->NegateX;
+   case 1:
+      return reg->NegateY;
+   case 2:
+      return reg->NegateZ;
+   case 3:
+      return reg->NegateW;
+   default:
+      ASSERT( 0 );
+   }
+   return 0;
+}
+
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->NegateX = negate;
+      break;
+   case 1:
+      reg->NegateY = negate;
+      break;
+   case 2:
+      reg->NegateZ = negate;
+      break;
+   case 3:
+      reg->NegateW = negate;
+      break;
+   default:
+      ASSERT( 0 );
+   }
+}
+
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct  tgsi_full_src_register *reg,
+   unsigned component )
+{
+   unsigned sign_mode;
+
+   if( reg->SrcRegisterExtMod.Absolute ) {
+      /* Consider only the post-abs negation. */
+
+      if( reg->SrcRegisterExtMod.Negate ) {
+         sign_mode = TGSI_UTIL_SIGN_SET;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_CLEAR;
+      }
+   }
+   else {
+      /* Accumulate the three negations. */
+
+      unsigned negate;
+
+      negate = reg->SrcRegister.Negate;
+      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
+         negate = !negate;
+      }
+      if( reg->SrcRegisterExtMod.Negate ) {
+         negate = !negate;
+      }
+
+      if( negate ) {
+         sign_mode = TGSI_UTIL_SIGN_TOGGLE;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_KEEP;
+      }
+   }
+
+   return sign_mode;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
new file mode 100644
index 0000000000..03375d84a5
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -0,0 +1,145 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Ian Romanick <idr@us.ibm.com>
+  */
+
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_exec.h"
+#include "spu_vertex_shader.h"
+#include "spu_main.h"
+#include "spu_dcache.h"
+
+typedef void (*spu_fetch_func)(qword *out, const qword *in,
+			       const qword *shuffle_data);
+
+
+static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = {
+   /* Shuffle used by CVT_64_FLOAT
+    */
+   {
+      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+   },
+
+   /* Shuffle used by CVT_8_USCALED and CVT_8_SSCALED
+    */
+   {
+      0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80,
+      0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80,
+   },
+   
+   /* Shuffle used by CVT_16_USCALED and CVT_16_SSCALED
+    */
+   {
+      0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80,
+      0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80,
+   },
+   
+   /* High value shuffle used by trans4x4.
+    */
+   {
+      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17
+   },
+
+   /* Low value shuffle used by trans4x4.
+    */
+   {
+      0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+      0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F
+   }
+};
+
+
+/**
+ * Fetch vertex attributes for 'count' vertices.
+ */
+static void generic_vertex_fetch(struct spu_vs_context *draw,
+                                 struct spu_exec_machine *machine,
+                                 const unsigned *elts,
+                                 unsigned count)
+{
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   ASSERT(count <= 4);
+
+#if DRAW_DBG
+   printf("SPU: %s count = %u, nr_attrs = %u\n", 
+          __FUNCTION__, count, nr_attrs);
+#endif
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+      const unsigned pitch = draw->vertex_fetch.pitch[attr];
+      const uint64_t src = draw->vertex_fetch.src_ptr[attr];
+      const spu_fetch_func fetch = (spu_fetch_func)
+	  (draw->vertex_fetch.code + draw->vertex_fetch.code_offset[attr]);
+      unsigned i;
+      unsigned idx;
+      const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
+      const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
+      qword in[2 * 4] ALIGN16_ATTRIB;
+
+
+      /* Fetch four attributes for four vertices.  
+       */
+      idx = 0;
+      for (i = 0; i < count; i++) {
+         const uint64_t addr = src + (elts[i] * pitch);
+
+#if DRAW_DBG
+         printf("SPU: fetching = 0x%llx\n", addr);
+#endif
+
+         spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry);
+         idx += quads_per_entry;
+      }
+
+      /* Be nice and zero out any missing vertices.
+       */
+      (void) memset(& in[idx], 0, (8 - idx) * sizeof(qword));
+
+
+      /* Convert all 4 vertices to vectors of float.
+       */
+      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, fetch_shuffle_data);
+   }
+}
+
+
+void spu_update_vertex_fetch( struct spu_vs_context *draw )
+{
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
new file mode 100644
index 0000000000..fbe5b34d39
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -0,0 +1,244 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  *   Ian Romanick <idr@us.ibm.com>
+  */
+
+#include <spu_mfcio.h>
+
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "cell/common.h"
+#include "spu_vertex_shader.h"
+#include "spu_exec.h"
+#include "spu_main.h"
+
+
+#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float))
+
+
+#define CLIP_RIGHT_BIT 0x01
+#define CLIP_LEFT_BIT 0x02
+#define CLIP_TOP_BIT 0x04
+#define CLIP_BOTTOM_BIT 0x08
+#define CLIP_FAR_BIT 0x10
+#define CLIP_NEAR_BIT 0x20
+
+
+static INLINE float
+dot4(const float *a, const float *b)
+{
+   return (a[0]*b[0] +
+           a[1]*b[1] +
+           a[2]*b[2] +
+           a[3]*b[3]);
+}
+
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   unsigned mask = 0;
+   unsigned i;
+
+   /* Do the hardwired planes first:
+    */
+   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
+   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
+   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
+   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
+   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
+   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
+
+   /* Followed by any remaining ones:
+    */
+   for (i = 6; i < nr; i++) {
+      if (dot4(clip, plane[i]) < 0) 
+         mask |= (1<<i);
+   }
+
+   return mask;
+}
+
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param vbuffer  the input vertex data
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+run_vertex_program(struct spu_vs_context *draw,
+                   unsigned elts[4], unsigned count,
+                   const uint64_t *vOut)
+{
+   struct spu_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_MAX_ATTRIBS);
+   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   ASSERT(count <= 4);
+
+   machine->Processor = TGSI_PROCESSOR_VERTEX;
+
+   ASSERT_ALIGN16(draw->constants);
+   machine->Consts = (float (*)[4]) draw->constants;
+
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+   spu_vertex_fetch( draw, machine, elts, count );
+
+   /* run shader */
+   spu_exec_machine_run( machine );
+
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+      unsigned char buffer[sizeof(struct vertex_header)
+          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+      struct vertex_header *const tmpOut =
+          (struct vertex_header *) buffer;
+      const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
+                                           + (sizeof(float) * 4 
+                                              * draw->num_vs_outputs));
+
+      mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+
+      /* Handle attr[0] (position) specially:
+       *
+       * XXX: Computing the clipmask should be done in the vertex
+       * program as a set of DP4 instructions appended to the
+       * user-provided code.
+       */
+      x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane,
+					   draw->nr_planes);
+      tmpOut->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      tmpOut->data[0][0] = x * scale[0] + trans[0];
+      tmpOut->data[0][1] = y * scale[1] + trans[1];
+      tmpOut->data[0][2] = z * scale[2] + trans[2];
+      tmpOut->data[0][3] = w;
+
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+      }
+
+      mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+   } /* loop over vertices */
+}
+
+
+unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
+    ALIGN16_ATTRIB;
+
+
+void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       struct cell_shader_info *vs)
+{
+   const unsigned immediate_addr = vs->immediates;
+   const unsigned immediate_size = 
+       ROUNDUP16((sizeof(float) * 4 * vs->num_immediates)
+		 + (immediate_addr & 0x0f));
+ 
+
+   mfc_get(immediates, immediate_addr & ~0x0f, immediate_size,
+           TAG_VERTEX_BUFFER, 0, 0);
+
+   draw->machine.Instructions = (struct tgsi_full_instruction *)
+       vs->instructions;
+   draw->machine.NumInstructions = vs->num_instructions;
+
+   draw->machine.Declarations = (struct tgsi_full_declaration *)
+       vs->declarations;
+   draw->machine.NumDeclarations = vs->num_declarations;
+
+   draw->num_vs_outputs = vs->num_outputs;
+
+   /* specify the shader to interpret/execute */
+   spu_exec_machine_init(&draw->machine,
+			 PIPE_MAX_SAMPLERS,
+			 NULL /*samplers*/,
+			 PIPE_SHADER_VERTEX);
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+   (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f],
+                 sizeof(float) * 4 * vs->num_immediates);
+}
+
+
+void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+                          const struct cell_command_vs *vs)
+{
+   unsigned i;
+
+   (void) memcpy(draw->plane, vs->plane, sizeof(float) * 4 * vs->nr_planes);
+   draw->nr_planes = vs->nr_planes;
+   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
+
+   for (i = 0; i < vs->num_elts; i += 4) {
+      const unsigned batch_size = MIN2(vs->num_elts - i, 4);
+
+      run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]);
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
new file mode 100644
index 0000000000..4c74f5e74d
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
@@ -0,0 +1,66 @@
+#ifndef SPU_VERTEX_SHADER_H
+#define SPU_VERTEX_SHADER_H
+
+#include "cell/common.h"
+#include "pipe/p_format.h"
+#include "spu_exec.h"
+
+struct spu_vs_context;
+
+typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
+				     struct spu_exec_machine *machine,
+				     const unsigned *elts,
+				     unsigned count );
+
+struct spu_vs_context {
+   struct pipe_viewport_state viewport;
+
+   struct {
+      uint64_t src_ptr[PIPE_MAX_ATTRIBS];
+      unsigned pitch[PIPE_MAX_ATTRIBS];
+      unsigned size[PIPE_MAX_ATTRIBS];
+      unsigned code_offset[PIPE_MAX_ATTRIBS];
+      unsigned nr_attrs;
+      boolean dirty;
+
+      spu_full_fetch_func fetch_func;
+      void *code;
+   } vertex_fetch;
+   
+   /* Clip derived state:
+    */
+   float plane[12][4];
+   unsigned nr_planes;
+
+   struct spu_exec_machine machine;
+   const float (*constants)[4];
+
+   unsigned num_vs_outputs;
+};
+
+extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
+
+static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
+				    struct spu_exec_machine *machine,
+				    const unsigned *elts,
+				    unsigned count)
+{
+   if (draw->vertex_fetch.dirty) {
+      spu_update_vertex_fetch(draw);
+      draw->vertex_fetch.dirty = 0;
+   }
+   
+   (*draw->vertex_fetch.fetch_func)(draw, machine, elts, count);
+}
+
+struct cell_command_vs;
+
+extern void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       struct cell_shader_info *vs);
+
+extern void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+			  const struct cell_command_vs *vs);
+
+#endif /* SPU_VERTEX_SHADER_H */