39 files changed, 2090 insertions, 907 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 6bace0bb11..e989d8c2e5 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -84,7 +84,7 @@
 #define CELL_CMD_BATCH                5
 #define CELL_CMD_RELEASE_VERTS        6
 #define CELL_CMD_STATE_FRAMEBUFFER   10
-#define CELL_CMD_STATE_DEPTH_STENCIL 11
+#define CELL_CMD_STATE_FRAGMENT_OPS  11
 #define CELL_CMD_STATE_SAMPLER       12
 #define CELL_CMD_STATE_TEXTURE       13
 #define CELL_CMD_STATE_VERTEX_INFO   14
@@ -92,9 +92,7 @@
 #define CELL_CMD_STATE_UNIFORMS      16
 #define CELL_CMD_STATE_VS_ARRAY_INFO 17
 #define CELL_CMD_STATE_BIND_VS       18
-#define CELL_CMD_STATE_BLEND         19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
-#define CELL_CMD_STATE_LOGICOP       21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
@@ -106,30 +104,24 @@
 #define CELL_BUFFER_STATUS_USED 20
 
 
+#define CELL_DEBUG_CHECKER  (1 << 0)
+#define CELL_DEBUG_SYNC     (1 << 1)
 
-/**
- */
-struct cell_command_depth_stencil_alpha_test {
-   uint64_t base;               /**< Effective address of code start. */
-   unsigned size;               /**< Size in bytes of SPE code. */
-   unsigned read_depth;         /**< Flag: should depth be read? */
-   unsigned read_stencil;       /**< Flag: should stencil be read? */
-};
 
 
-/**
- * Upload code to perform framebuffer blend operation
- */
-struct cell_command_blend {
-   uint64_t base;               /**< Effective address of code start. */
-   unsigned size;               /**< Size in bytes of SPE code. */
-   unsigned read_fb;            /**< Flag: should framebuffer be read? */
-};
+/** Max instructions for doing per-fragment operations */
+#define SPU_MAX_FRAGMENT_OPS_INSTS 64
 
 
-struct cell_command_logicop {
-   uint64_t base;               /**< Effective address of code start. */
-   unsigned size;               /**< Size in bytes of SPE code. */
+/**
+ * Command to specify per-fragment operations state and generated code.
+ */
+struct cell_command_fragment_ops
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend;
+   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
 
@@ -169,13 +161,15 @@ struct cell_array_info
 };
 
 
-struct cell_attribute_fetch_code {
+struct cell_attribute_fetch_code
+{
    uint64_t base;
    uint size;
 };
 
 
-struct cell_buffer_range {
+struct cell_buffer_range
+{
    uint64_t base;
    unsigned size;
 };
@@ -263,6 +257,7 @@ struct cell_init_info
 {
    unsigned id;
    unsigned num_spus;
+   unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
    struct cell_command *cmd;
 
    /** Buffers for command batches, vertex/index data */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 0389a9554c..8699f3f8ec 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -5,7 +5,7 @@
 
 
 TOP = ../../../../..
-include $(TOP)/configs/linux-cell
+include $(TOP)/configs/current
 
 
 # This is the "top-level" cell PPU driver code, will get pulled into libGL.so
@@ -25,9 +25,9 @@ SOURCES = \
 	cell_context.c \
 	cell_draw_arrays.c \
 	cell_flush.c \
+	cell_gen_fragment.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
-	cell_state_per_fragment.c \
 	cell_state_shader.c \
 	cell_pipe_state.c \
 	cell_screen.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index f45e5f25b6..16882c0129 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -32,6 +32,13 @@
 
 
 
+/**
+ * Search the buffer pool for an empty/free buffer and return its index.
+ * Buffers are used for storing vertex data, state and commands which
+ * will be sent to the SPUs.
+ * If no empty buffers are available, wait for one.
+ * \return buffer index in [0, CELL_NUM_BUFFERS-1]
+ */
 uint
 cell_get_empty_buffer(struct cell_context *cell)
 {
@@ -74,6 +81,11 @@ cell_get_empty_buffer(struct cell_context *cell)
 }
 
 
+/**
+ * Flush the current batch buffer to the SPUs.
+ * An empty buffer will be found and set as the new current batch buffer
+ * for subsequent commands/data.
+ */
 void
 cell_batch_flush(struct cell_context *cell)
 {
@@ -93,11 +105,11 @@ cell_batch_flush(struct cell_context *cell)
 
    /*
    printf("cell_batch_dispatch: buf %u at %p, size %u\n",
-          batch, &cell->batch_buffer[batch][0], size);
+          batch, &cell->buffer[batch][0], size);
    */
      
    /*
-    * Build "BATCH" command and sent to all SPUs.
+    * Build "BATCH" command and send to all SPUs.
     */
    cmd_word = CELL_CMD_BATCH | (batch << 8) | (size << 16);
 
@@ -120,6 +132,9 @@ cell_batch_flush(struct cell_context *cell)
 }
 
 
+/**
+ * Return the number of bytes free in the current batch buffer.
+ */
 uint
 cell_batch_free_space(const struct cell_context *cell)
 {
@@ -129,7 +144,9 @@ cell_batch_free_space(const struct cell_context *cell)
 
 
 /**
- * Append data to current batch.
+ * Append data to the current batch buffer.
+ * \param data  address of block of bytes to append
+ * \param bytes  size of block of bytes
  */
 void
 cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
@@ -165,6 +182,10 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 }
 
 
+/**
+ * Allocate space in the current batch buffer for 'bytes' space.
+ * \return address in batch buffer to put data
+ */
 void *
 cell_batch_alloc(struct cell_context *cell, uint bytes)
 {
@@ -172,6 +193,10 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
 }
 
 
+/**
+ * Same as \sa cell_batch_alloc, but return an address at a particular
+ * alignment.
+ */
 void *
 cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
                          uint alignment)
@@ -215,3 +240,28 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
 
    return pos;
 }
+
+
+/**
+ * One-time init of batch buffers.
+ */
+void
+cell_init_batch_buffers(struct cell_context *cell)
+{
+   uint spu, buf;
+
+   /* init command, vertex/index buffer info */
+   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) {
+      cell->buffer_size[buf] = 0;
+
+      /* init batch buffer status values,
+       * mark 0th buffer as used, rest as free.
+       */
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         if (buf == 0)
+            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
+         else
+            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_FREE;
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h
index a6eee0a8b1..f74dd60079 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.h
+++ b/src/gallium/drivers/cell/ppu/cell_batch.h
@@ -54,5 +54,8 @@ extern void *
 cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
                          uint alignment);
 
+extern void
+cell_init_batch_buffers(struct cell_context *cell);
+
 
 #endif /* CELL_BATCH_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index a421c95c8e..c9c0c721bb 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -35,6 +35,7 @@
 #include <stdint.h>
 #include "pipe/p_inlines.h"
 #include "util/u_memory.h"
+#include "util/u_pack_color.h"
 #include "cell/common.h"
 #include "cell_clear.h"
 #include "cell_context.h"
@@ -44,6 +45,27 @@
 #include "cell_state.h"
 
 
+/**
+ * Convert packed pixel from one format to another.
+ */
+static unsigned
+convert_color(enum pipe_format srcFormat, unsigned srcColor,
+              enum pipe_format dstFormat)
+{
+   ubyte r, g, b, a;
+   unsigned dstColor;
+
+   util_unpack_color_ub(srcFormat, &srcColor, &r, &g, &b, &a);
+   util_pack_color_ub(r, g, b, a, dstFormat, &dstColor);
+
+   return dstColor;
+}
+
+
+
+/**
+ * Called via pipe->clear()
+ */
 void
 cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
                    unsigned clearValue)
@@ -61,13 +83,21 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
                                               PIPE_BUFFER_USAGE_GPU_WRITE);
 
    if (ps == cell->framebuffer.zsbuf) {
+      /* clear z/stencil buffer */
       surfIndex = 1;
    }
    else {
+      /* clear color buffer */
       surfIndex = 0;
+
+      if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) {
+         clearValue = convert_color(PIPE_FORMAT_A8R8G8B8_UNORM, clearValue,
+                                    ps->format);
+      }
    }
 
 
+   /* Build a CLEAR command and place it in the current batch buffer */
    {
       struct cell_command_clear_surface *clr
          = (struct cell_command_clear_surface *)
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 9ff4e86943..71f1a3049d 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -43,11 +43,11 @@
 #include "draw/draw_private.h"
 
 #include "cell/common.h"
+#include "cell_batch.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
 #include "cell_flush.h"
-#include "cell_render.h"
 #include "cell_state.h"
 #include "cell_surface.h"
 #include "cell_spu.h"
@@ -85,12 +85,20 @@ cell_draw_create(struct cell_context *cell)
 }
 
 
+#ifdef DEBUG
+static const struct debug_named_value cell_debug_flags[] = {
+   {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+   {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
+   {NULL, 0}
+};
+#endif
+
+
 struct pipe_context *
 cell_create_context(struct pipe_screen *screen,
                     struct cell_winsys *cws)
 {
    struct cell_context *cell;
-   uint spu, buf;
 
    /* some fields need to be 16-byte aligned, so align the whole object */
    cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
@@ -104,15 +112,6 @@ cell_create_context(struct pipe_screen *screen,
    cell->pipe.screen = screen;
    cell->pipe.destroy = cell_destroy_context;
 
-   /* state setters */
-   cell->pipe.set_vertex_buffers = cell_set_vertex_buffers;
-   cell->pipe.set_vertex_elements = cell_set_vertex_elements;
-
-   cell->pipe.draw_arrays = cell_draw_arrays;
-   cell->pipe.draw_elements = cell_draw_elements;
-   cell->pipe.draw_range_elements = cell_draw_range_elements;
-   cell->pipe.set_edgeflags = cell_set_edgeflags;
-
    cell->pipe.clear = cell_clear_surface;
    cell->pipe.flush = cell_flush;
 
@@ -122,20 +121,28 @@ cell_create_context(struct pipe_screen *screen,
    cell->pipe.wait_query = cell_wait_query;
 #endif
 
+   cell_init_draw_functions(cell);
    cell_init_state_functions(cell);
    cell_init_shader_functions(cell);
    cell_init_surface_functions(cell);
    cell_init_texture_functions(cell);
+   cell_init_vertex_functions(cell);
 
    cell->draw = cell_draw_create(cell);
 
    cell_init_vbuf(cell);
+
    draw_set_rasterize_stage(cell->draw, cell->vbuf);
 
    /* convert all points/lines to tris for the time being */
    draw_wide_point_threshold(cell->draw, 0.0);
    draw_wide_line_threshold(cell->draw, 0.0);
 
+   /* get env vars or read config file to get debug flags */
+   cell->debug_flags = debug_get_flags_option("CELL_DEBUG", 
+                                              cell_debug_flags, 
+                                              0 );
+
    /*
     * SPU stuff
     */
@@ -146,20 +153,7 @@ cell_create_context(struct pipe_screen *screen,
 
    cell_start_spus(cell);
 
-   /* init command, vertex/index buffer info */
-   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) {
-      cell->buffer_size[buf] = 0;
-
-      /* init batch buffer status values,
-       * mark 0th buffer as used, rest as free.
-       */
-      for (spu = 0; spu < cell->num_spus; spu++) {
-         if (buf == 0)
-            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
-         else
-            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_FREE;
-      }
-   }
+   cell_init_batch_buffers(cell);
 
    return &cell->pipe;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index f1d1ca89a9..8cec9f45b2 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -39,8 +39,13 @@
 #include "rtasm/rtasm_ppc_spe.h"
 #include "tgsi/tgsi_scan.h"
 
+
 struct cell_vbuf_render;
 
+
+/**
+ * Cell vertex shader state, subclass of pipe_shader_state.
+ */
 struct cell_vertex_shader_state
 {
    struct pipe_shader_state shader;
@@ -49,6 +54,9 @@ struct cell_vertex_shader_state
 };
 
 
+/**
+ * Cell fragment shader state, subclass of pipe_shader_state.
+ */
 struct cell_fragment_shader_state
 {
    struct pipe_shader_state shader;
@@ -57,7 +65,11 @@ struct cell_fragment_shader_state
 };
 
 
-struct cell_blend_state {
+/**
+ * Cell blend state atom, subclass of pipe_blend_state.
+ */
+struct cell_blend_state
+{
    struct pipe_blend_state base;
 
    /**
@@ -67,17 +79,24 @@ struct cell_blend_state {
 };
 
 
-struct cell_depth_stencil_alpha_state {
-   struct pipe_depth_stencil_alpha_state   base;
+/**
+ * Cell depth/stencil/alpha state atom, subclass of
+ * pipe_depth_stencil_alpha_state.
+ */
+struct cell_depth_stencil_alpha_state
+{
+   struct pipe_depth_stencil_alpha_state base;
 
    /**
     * Generated code to perform alpha, stencil, and depth testing on the SPE
     */
    struct spe_function code;
-
 };
 
 
+/**
+ * Per-context state, subclass of pipe_context.
+ */
 struct cell_context
 {
    struct pipe_context pipe;
@@ -144,6 +163,8 @@ struct cell_context
 
    struct spe_function attrib_fetch;
    unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
+
+   unsigned debug_flags;
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index f02dffe124..880d535320 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -34,6 +34,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
 
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
@@ -76,14 +77,6 @@ cell_unmap_constant_buffers(struct cell_context *sp)
 }
 
 
-boolean
-cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
-                     unsigned start, unsigned count)
-{
-   return cell_draw_elements(pipe, NULL, 0, mode, start, count);
-}
-
-
 
 /**
  * Draw vertex arrays, with optional indexing.
@@ -92,7 +85,7 @@ cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
  *
  * XXX should the element buffer be specified/bound with a separate function?
  */
-boolean
+static boolean
 cell_draw_range_elements(struct pipe_context *pipe,
                          struct pipe_buffer *indexBuffer,
                          unsigned indexSize,
@@ -116,7 +109,7 @@ cell_draw_range_elements(struct pipe_context *pipe,
     * Map vertex buffers
     */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
-      void *buf = pipe->winsys->buffer_map(pipe->winsys,
+      void *buf = pipe_buffer_map(pipe->screen,
                                            sp->vertex_buffer[i].buffer,
                                            PIPE_BUFFER_USAGE_CPU_READ);
       cell_flush_buffer_range(sp, buf, sp->vertex_buffer[i].buffer->size);
@@ -124,7 +117,7 @@ cell_draw_range_elements(struct pipe_context *pipe,
    }
    /* Map index buffer, if present */
    if (indexBuffer) {
-      void *mapped_indexes = pipe->winsys->buffer_map(pipe->winsys,
+      void *mapped_indexes = pipe_buffer_map(pipe->screen,
                                                       indexBuffer,
                                                       PIPE_BUFFER_USAGE_CPU_READ);
       draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes);
@@ -143,11 +136,11 @@ cell_draw_range_elements(struct pipe_context *pipe,
     */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
       draw_set_mapped_vertex_buffer(draw, i, NULL);
-      pipe->winsys->buffer_unmap(pipe->winsys, sp->vertex_buffer[i].buffer);
+      pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer);
    }
    if (indexBuffer) {
       draw_set_mapped_element_buffer(draw, 0, NULL);
-      pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+      pipe_buffer_unmap(pipe->screen, indexBuffer);
    }
 
    /* Note: leave drawing surfaces mapped */
@@ -157,7 +150,7 @@ cell_draw_range_elements(struct pipe_context *pipe,
 }
 
 
-boolean
+static boolean
 cell_draw_elements(struct pipe_context *pipe,
                    struct pipe_buffer *indexBuffer,
                    unsigned indexSize,
@@ -170,10 +163,29 @@ cell_draw_elements(struct pipe_context *pipe,
 }
 
 
+static boolean
+cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                     unsigned start, unsigned count)
+{
+   return cell_draw_elements(pipe, NULL, 0, mode, start, count);
+}
+
 
-void
+static void
 cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags)
 {
    struct cell_context *cell = cell_context(pipe);
    draw_set_edgeflags(cell->draw, edgeflags);
 }
+
+
+
+void
+cell_init_draw_functions(struct cell_context *cell)
+{
+   cell->pipe.draw_arrays = cell_draw_arrays;
+   cell->pipe.draw_elements = cell_draw_elements;
+   cell->pipe.draw_range_elements = cell_draw_range_elements;
+   cell->pipe.set_edgeflags = cell_set_edgeflags;
+}
+
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
index cd35ec17b4..148873aa67 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
@@ -29,26 +29,8 @@
 #define CELL_DRAW_ARRAYS_H
 
 
-extern boolean
-cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
-                 unsigned start, unsigned count);
-
-extern boolean
-cell_draw_elements(struct pipe_context *pipe,
-                   struct pipe_buffer *indexBuffer,
-                   unsigned indexSize,
-                   unsigned mode, unsigned start, unsigned count);
-
-extern boolean
-cell_draw_range_elements(struct pipe_context *pipe,
-                         struct pipe_buffer *indexBuffer,
-                         unsigned indexSize,
-                         unsigned min_index,
-                         unsigned max_index,
-                         unsigned mode, unsigned start, unsigned count);
-
 extern void
-cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags);
+cell_init_draw_functions(struct cell_context *cell);
 
 
 #endif /* CELL_DRAW_ARRAYS_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 3aaf3de668..6596b72010 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -34,6 +34,9 @@
 #include "draw/draw_context.h"
 
 
+/**
+ * Called via pipe->flush()
+ */
 void
 cell_flush(struct pipe_context *pipe, unsigned flags,
            struct pipe_fence_handle **fence)
@@ -50,16 +53,19 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
       flags |= CELL_FLUSH_WAIT;
 
    draw_flush( cell->draw );
-   cell_flush_int(pipe, flags);
+   cell_flush_int(cell, flags);
 }
 
 
-/** internal flush */
+/**
+ * Cell internal flush function.  Send the current batch buffer to all SPUs.
+ * If flags & CELL_FLUSH_WAIT, do not return until the SPUs are idle.
+ * \param flags  bitmask of flags CELL_FLUSH_WAIT, or zero
+ */
 void
-cell_flush_int(struct pipe_context *pipe, unsigned flags)
+cell_flush_int(struct cell_context *cell, unsigned flags)
 {
    static boolean flushing = FALSE;  /* recursion catcher */
-   struct cell_context *cell = cell_context(pipe);
    uint i;
 
    ASSERT(!flushing);
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.h b/src/gallium/drivers/cell/ppu/cell_flush.h
index 8f0645c429..509ae6239a 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.h
+++ b/src/gallium/drivers/cell/ppu/cell_flush.h
@@ -36,7 +36,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
            struct pipe_fence_handle **fence);
 
 extern void
-cell_flush_int(struct pipe_context *pipe, unsigned flags);
+cell_flush_int(struct cell_context *cell, unsigned flags);
 
 extern void
 cell_flush_buffer_range(struct cell_context *cell, void *ptr,
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
new file mode 100644
index 0000000000..79a82ef72b
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -0,0 +1,862 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU per-fragment code (actually per-quad code).
+ * \author Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "cell_context.h"
+#include "cell_gen_fragment.h"
+
+
+
+/** Do extra optimizations? */
+#define OPTIMIZATIONS 1
+
+
+/**
+ * Generate SPE code to perform Z/depth testing.
+ *
+ * \param dsa         Gallium depth/stencil/alpha state to gen code for
+ * \param f           SPE function to append instruction onto.
+ * \param mask_reg    register containing quad/pixel "alive" mask (in/out)
+ * \param ifragZ_reg  register containing integer fragment Z values (in)
+ * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
+ * \param zmask_reg   register containing result of Z test/comparison (out)
+ */
+static void
+gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f,
+               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
+{
+   ASSERT(dsa->depth.enabled);
+
+   switch (dsa->depth.func) {
+   case PIPE_FUNC_EQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* zmask = (ifragZ > ref) */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* zmask = (ref > ifragZ) */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* zmask = (ifragZ > ref) */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* zmask = (ref > ifragZ) */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */
+      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* mask unchanged */
+      spe_il(f, zmask_reg, ~0);  /* zmask = {~0,~0,~0,~0} */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+   if (dsa->depth.writemask) {
+      /*
+       * If (ztest passed) {
+       *    framebufferZ = fragmentZ;
+       * }
+       * OR,
+       * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
+       */
+      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+   }
+}
+
+
+/**
+ * Generate SPE code to perform alpha testing.
+ *
+ * \param dsa        Gallium depth/stencil/alpha state to gen code for
+ * \param f          SPE function to append instruction onto.
+ * \param mask_reg   register containing quad/pixel "alive" mask (in/out)
+ * \param fragA_reg  register containing four fragment alpha values (in)
+ */
+static void
+gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f, int mask_reg, int fragA_reg)
+{
+   int ref_reg = spe_allocate_available_register(f);
+   int amask_reg = spe_allocate_available_register(f);
+
+   ASSERT(dsa->alpha.enabled);
+
+   if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
+       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      /* load/splat the alpha reference float value */
+      spe_load_float(f, ref_reg, dsa->alpha.ref);
+   }
+
+   /* emit code to do the alpha comparison, updating 'mask' */
+   switch (dsa->alpha.func) {
+   case PIPE_FUNC_EQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = [0,0,0,0] */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* no-op, mask unchanged */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+#if OPTIMIZATIONS
+   /* if mask == {0,0,0,0} we're all done, return */
+   {
+      /* re-use amask reg here */
+      int tmp_reg = amask_reg;
+      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
+      spe_orx(f, tmp_reg, mask_reg);
+      /* if tmp[0] == 0 then return from function call */
+      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
+   }
+#endif
+
+   spe_release_register(f, ref_reg);
+   spe_release_register(f, amask_reg);
+}
+
+
+
+/**
+ * Generate SPE code to implement the given blend mode for a quad of pixels.
+ * \param f          SPE function to append instruction onto.
+ * \param fragR_reg  register with fragment red values (float) (in/out)
+ * \param fragG_reg  register with fragment green values (float) (in/out)
+ * \param fragB_reg  register with fragment blue values (float) (in/out)
+ * \param fragA_reg  register with fragment alpha values (float) (in/out)
+ * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
+ */
+static void
+gen_blend(const struct pipe_blend_state *blend,
+          struct spe_function *f,
+          enum pipe_format color_format,
+          int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
+          int fbRGBA_reg)
+{
+   int term1R_reg = spe_allocate_available_register(f);
+   int term1G_reg = spe_allocate_available_register(f);
+   int term1B_reg = spe_allocate_available_register(f);
+   int term1A_reg = spe_allocate_available_register(f);
+
+   int term2R_reg = spe_allocate_available_register(f);
+   int term2G_reg = spe_allocate_available_register(f);
+   int term2B_reg = spe_allocate_available_register(f);
+   int term2A_reg = spe_allocate_available_register(f);
+
+   int fbR_reg = spe_allocate_available_register(f);
+   int fbG_reg = spe_allocate_available_register(f);
+   int fbB_reg = spe_allocate_available_register(f);
+   int fbA_reg = spe_allocate_available_register(f);
+
+   int one_reg = spe_allocate_available_register(f);
+   int tmp_reg = spe_allocate_available_register(f);
+
+   ASSERT(blend->blend_enable);
+
+   /* Unpack/convert framebuffer colors from four 32-bit packed colors
+    * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
+    * Each 8-bit color component is expanded into a float in [0.0, 1.0].
+    */
+   {
+      int mask_reg = spe_allocate_available_register(f);
+
+      /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
+      spe_fsmbi(f, mask_reg, 0x1111);
+
+      /* XXX there may be more clever ways to implement the following code */
+      switch (color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         /* fbB = fbB & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 8 */
+         spe_roti(f, fbG_reg, fbG_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 16 */
+         spe_roti(f, fbR_reg, fbR_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbA = fbRGBA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* fbA = fbA >> 24 */
+         spe_roti(f, fbA_reg, fbA_reg, -24);
+         break;
+
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         /* fbA = fbA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 8 */
+         spe_roti(f, fbR_reg, fbR_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 16 */
+         spe_roti(f, fbG_reg, fbG_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbB = fbRGBA & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* fbB = fbB >> 24 */
+         spe_roti(f, fbB_reg, fbB_reg, -24);
+         break;
+
+      default:
+         ASSERT(0);
+      }
+
+      /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
+      spe_cuflt(f, fbR_reg, fbR_reg, 8);
+      spe_cuflt(f, fbG_reg, fbG_reg, 8);
+      spe_cuflt(f, fbB_reg, fbB_reg, 8);
+      spe_cuflt(f, fbA_reg, fbA_reg, 8);
+
+      spe_release_register(f, mask_reg);
+   }
+
+
+   /*
+    * Compute Src RGB terms
+    */
+   switch (blend->rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term1R_reg, fragR_reg);
+      spe_move(f, term1G_reg, fragG_reg);
+      spe_move(f, term1B_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term1R_reg);
+      spe_zero(f, term1G_reg);
+      spe_zero(f, term1B_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Src Alpha term
+    */
+   switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term1A_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest RGB terms
+    */
+   switch (blend->rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term2R_reg, fbR_reg);
+      spe_move(f, term2G_reg, fbG_reg);
+      spe_move(f, term2B_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term2R_reg);
+      spe_zero(f, term2G_reg);
+      spe_zero(f, term2B_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(f, one_reg, 1.0f);
+      /* tmp = one - fragA */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* term = fb * tmp */
+      spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
+      spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
+      spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest Alpha term
+    */
+   switch (blend->alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term2A_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term2A_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(f, one_reg, 1.0f);
+      /* tmp = one - fragA */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* termA = fbA * tmp */
+      spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest RGB terms
+    */
+   switch (blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest A term
+    */
+   switch (blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   spe_release_register(f, term1R_reg);
+   spe_release_register(f, term1G_reg);
+   spe_release_register(f, term1B_reg);
+   spe_release_register(f, term1A_reg);
+
+   spe_release_register(f, term2R_reg);
+   spe_release_register(f, term2G_reg);
+   spe_release_register(f, term2B_reg);
+   spe_release_register(f, term2A_reg);
+
+   spe_release_register(f, fbR_reg);
+   spe_release_register(f, fbG_reg);
+   spe_release_register(f, fbB_reg);
+   spe_release_register(f, fbA_reg);
+
+   spe_release_register(f, one_reg);
+   spe_release_register(f, tmp_reg);
+}
+
+
+static void
+gen_logicop(const struct pipe_blend_state *blend,
+            struct spe_function *f,
+            int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* XXX to-do */
+   /* operate on 32-bit packed pixels, not float colors */
+}
+
+
+static void
+gen_colormask(uint colormask,
+              struct spe_function *f,
+              int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* XXX to-do */
+   /* operate on 32-bit packed pixels, not float colors */
+}
+
+
+
+/**
+ * Generate code to pack a quad of float colors into a four 32-bit integers.
+ *
+ * \param f             SPE function to append instruction onto.
+ * \param color_format  the dest color packing format
+ * \param r_reg         register containing four red values (in/clobbered)
+ * \param g_reg         register containing four green values (in/clobbered)
+ * \param b_reg         register containing four blue values (in/clobbered)
+ * \param a_reg         register containing four alpha values (in/clobbered)
+ * \param rgba_reg      register to store the packed RGBA colors (out)
+ */
+static void
+gen_pack_colors(struct spe_function *f,
+                enum pipe_format color_format,
+                int r_reg, int g_reg, int b_reg, int a_reg,
+                int rgba_reg)
+{
+   /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
+   spe_cfltu(f, r_reg, r_reg, 32);
+   spe_cfltu(f, g_reg, g_reg, 32);
+   spe_cfltu(f, b_reg, b_reg, 32);
+   spe_cfltu(f, a_reg, a_reg, 32);
+
+   /* Shift the most significant bytes to least the significant positions.
+    * I.e.: reg = reg >> 24
+    */
+   spe_rotmi(f, r_reg, r_reg, -24);
+   spe_rotmi(f, g_reg, g_reg, -24);
+   spe_rotmi(f, b_reg, b_reg, -24);
+   spe_rotmi(f, a_reg, a_reg, -24);
+
+   /* Shift the color bytes according to the surface format */
+   if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
+      spe_roti(f, g_reg, g_reg, 8);   /* green <<= 8 */
+      spe_roti(f, r_reg, r_reg, 16);  /* red <<= 16 */
+      spe_roti(f, a_reg, a_reg, 24);  /* alpha <<= 24 */
+   }
+   else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+      spe_roti(f, r_reg, r_reg, 8);   /* red <<= 8 */
+      spe_roti(f, g_reg, g_reg, 16);  /* green <<= 16 */
+      spe_roti(f, b_reg, b_reg, 24);  /* blue <<= 24 */
+   }
+   else {
+      ASSERT(0);
+   }
+
+   /* Merge red, green, blue, alpha registers to make packed RGBA colors.
+    * Eg: after shifting according to color_format we might have:
+    *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
+    *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
+    *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
+    *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
+    * OR-ing all those together gives us four packed colors:
+    *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
+    */
+   spe_or(f, rgba_reg, r_reg, g_reg);
+   spe_or(f, rgba_reg, rgba_reg, b_reg);
+   spe_or(f, rgba_reg, rgba_reg, a_reg);
+}
+
+
+
+
+/**
+ * Generate SPE code to implement the fragment operations (alpha test,
+ * depth test, stencil test, blending, colormask, and final
+ * framebuffer write) as specified by the current context state.
+ *
+ * Logically, this code will be called after running the fragment
+ * shader.  But under some circumstances we could run some of this
+ * code before the fragment shader to cull fragments/quads that are
+ * totally occluded/discarded.
+ *
+ * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
+ *
+ * See the spu_default_fragment_ops() function to see how the per-fragment
+ * operations would be done with ordinary C code.
+ * The code we generate here though has no branches, is SIMD, etc and
+ * should be much faster.
+ *
+ * \param cell  the rendering context (in)
+ * \param f     the generated function (out)
+ */
+void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+{
+   const struct pipe_depth_stencil_alpha_state *dsa =
+      &cell->depth_stencil->base;
+   const struct pipe_blend_state *blend = &cell->blend->base;
+   const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   const int x_reg = 3;  /* uint */
+   const int y_reg = 4;  /* uint */
+   const int color_tile_reg = 5;  /* tile_t * */
+   const int depth_tile_reg = 6;  /* tile_t * */
+   const int fragZ_reg = 7;   /* vector float */
+   const int fragR_reg = 8;   /* vector float */
+   const int fragG_reg = 9;   /* vector float */
+   const int fragB_reg = 10;  /* vector float */
+   const int fragA_reg = 11;  /* vector float */
+   const int mask_reg = 12;   /* vector uint */
+
+   /* offset of quad from start of tile
+    * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
+    */
+   int quad_offset_reg;
+
+   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
+   int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, x_reg);
+   spe_allocate_register(f, y_reg);
+   spe_allocate_register(f, color_tile_reg);
+   spe_allocate_register(f, depth_tile_reg);
+   spe_allocate_register(f, fragZ_reg);
+   spe_allocate_register(f, fragR_reg);
+   spe_allocate_register(f, fragG_reg);
+   spe_allocate_register(f, fragB_reg);
+   spe_allocate_register(f, fragA_reg);
+   spe_allocate_register(f, mask_reg);
+
+   quad_offset_reg = spe_allocate_available_register(f);
+   fbRGBA_reg = spe_allocate_available_register(f);
+   fbZS_reg = spe_allocate_available_register(f);
+
+   /* compute offset of quad from start of tile, in bytes */
+   {
+      int x2_reg = spe_allocate_available_register(f);
+      int y2_reg = spe_allocate_available_register(f);
+
+      ASSERT(TILE_SIZE == 32);
+
+      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
+      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
+      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
+      spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */
+      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */
+
+      spe_release_register(f, x2_reg);
+      spe_release_register(f, y2_reg);
+   }
+
+
+   if (dsa->alpha.enabled) {
+      gen_alpha_test(dsa, f, mask_reg, fragA_reg);
+   }
+
+   if (dsa->depth.enabled || dsa->stencil[0].enabled) {
+      const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
+      boolean write_depth_stencil;
+
+      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
+      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+
+      /* fetch quad of depth/stencil values from tile at (x,y) */
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      if (dsa->depth.enabled) {
+         /* Extract Z bits from fbZS_reg into fbZ_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            int mask_reg = spe_allocate_available_register(f);
+            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
+            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
+            spe_release_register(f, mask_reg);
+            /* OK, fbZ_reg has four 24-bit Z values now */
+         }
+         else {
+            /* XXX handle other z/stencil formats */
+            ASSERT(0);
+         }
+
+         /* Convert fragZ values from float[4] to uint[4] */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* 24-bit Z values */
+            int scale_reg = spe_allocate_available_register(f);
+
+            /* scale_reg[0,1,2,3] = float(2^24-1) */
+            spe_load_float(f, scale_reg, (float) 0xffffff);
+
+            /* XXX these two instructions might be combined */
+            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);  /* fragZ = (int) fragZ */
+
+            spe_release_register(f, scale_reg);
+         }
+         else {
+            /* XXX handle 16-bit Z format */
+            ASSERT(0);
+         }
+      }
+
+      if (dsa->stencil[0].enabled) {
+         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            /* XXX extract with a shift */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* XXX extract with a mask */
+            ASSERT(0);
+         }
+      }
+
+
+      if (dsa->stencil[0].enabled) {
+         /* XXX this may involve depth testing too */
+         // gen_stencil_test(dsa, f, ... );
+         ASSERT(0);
+      }
+      else if (dsa->depth.enabled) {
+         int zmask_reg = spe_allocate_available_register(f);
+         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_release_register(f, zmask_reg);
+      }
+
+      /* do we need to write Z and/or Stencil back into framebuffer? */
+      write_depth_stencil = (dsa->depth.writemask |
+                             dsa->stencil[0].write_mask |
+                             dsa->stencil[1].write_mask);
+
+      if (write_depth_stencil) {
+         /* Merge latest Z and Stencil values into fbZS_reg.
+          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
+          * fbS_reg has four 8-bit Z values in bits [7..0].
+          */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+                  zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_S8_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else {
+            /* bad zs_format */
+            ASSERT(0);
+         }
+
+         /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
+         spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+      }
+
+      spe_release_register(f, fbZ_reg);
+      spe_release_register(f, fbS_reg);
+   }
+
+
+   /* Get framebuffer quad/colors.  We'll need these for blending,
+    * color masking, and to obey the quad/pixel mask.
+    * Load: fbRGBA_reg = memory[color_tile + quad_offset]
+    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
+    * we could skip this load.
+    */
+   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
+
+
+   if (blend->blend_enable) {
+      gen_blend(blend, f, color_format,
+                fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
+   }
+
+   /*
+    * Write fragment colors to framebuffer/tile.
+    * This involves converting the fragment colors from float[4] to the
+    * tile's specific format and obeying the quad/pixel mask.
+    */
+   {
+      int rgba_reg = spe_allocate_available_register(f);
+
+      /* Pack four float colors as four 32-bit int colors */
+      gen_pack_colors(f, color_format,
+                      fragR_reg, fragG_reg, fragB_reg, fragA_reg,
+                      rgba_reg);
+
+      if (blend->logicop_enable) {
+         gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
+      }
+
+      if (blend->colormask != 0xf) {
+         gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
+      }
+
+
+      /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
+       * if (mask[i])
+       *    rgba[i] = rgba[i];
+       * else
+       *    rgba[i] = framebuffer[i];
+       */
+      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
+
+      /* Store updated quad in tile:
+       * memory[color_tile + quad_offset] = rgba_reg;
+       */
+      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
+
+      spe_release_register(f, rgba_reg);
+   }
+
+   printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
+
+   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
+
+
+   spe_release_register(f, fbRGBA_reg);
+   spe_release_register(f, fbZS_reg);
+   spe_release_register(f, quad_offset_reg);
+}
+
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
new file mode 100644
index 0000000000..0ea0fc690c
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_GEN_FRAGMENT_H
+#define CELL_GEN_FRAGMENT_H
+
+
+extern void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+
+
+#endif /* CELL_GEN_FRAGMENT_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index fe5437023b..e04cf5f274 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -34,6 +34,7 @@
 #include "pipe/p_inlines.h"
 #include "draw/draw_context.h"
 #include "cell_context.h"
+#include "cell_flush.h"
 #include "cell_state.h"
 #include "cell_texture.h"
 #include "cell_state_per_fragment.h"
@@ -130,8 +131,9 @@ cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
 }
 
 
-static void cell_set_clip_state( struct pipe_context *pipe,
-			     const struct pipe_clip_state *clip )
+static void
+cell_set_clip_state(struct pipe_context *pipe,
+                    const struct pipe_clip_state *clip)
 {
    struct cell_context *cell = cell_context(pipe);
 
@@ -310,8 +312,21 @@ cell_set_framebuffer_state(struct pipe_context *pipe,
          cell->zsbuf_map = NULL;
       }
 
-      /* update my state */
-      cell->framebuffer = *fb;
+      /* Finish any pending rendering to the current surface before
+       * installing a new surface!
+       */
+      cell_flush_int(cell, CELL_FLUSH_WAIT);
+
+      /* update my state
+       * (this is also where old surfaces will finally get freed)
+       */
+      cell->framebuffer.width = fb->width;
+      cell->framebuffer.height = fb->height;
+      cell->framebuffer.num_cbufs = fb->num_cbufs;
+      for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+         pipe_surface_reference(&cell->framebuffer.cbufs[i], fb->cbufs[i]);
+      }
+      pipe_surface_reference(&cell->framebuffer.zsbuf, fb->zsbuf);
 
       /* map new surfaces */
       if (csurf)
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 973c0b1aa1..9508227e29 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -26,6 +26,11 @@
  **************************************************************************/
 
 
+/**
+ * Utility/wrappers for communicating with the SPUs.
+ */
+
+
 #include <pthread.h>
 
 #include "cell_spu.h"
@@ -40,6 +45,9 @@ helpful headers:
 */
 
 
+/**
+ * Cell/SPU info that's not per-context.
+ */
 struct cell_global_info cell_global;
 
 
@@ -74,7 +82,11 @@ wait_mbox_message(spe_context_ptr_t ctx)
 }
 
 
-static void *cell_thread_function(void *arg)
+/**
+ * Called by pthread_create() to spawn an SPU thread.
+ */
+static void *
+cell_thread_function(void *arg)
 {
    struct cell_init_info *init = (struct cell_init_info *) arg;
    unsigned entry = SPE_DEFAULT_ENTRY;
@@ -92,7 +104,10 @@ static void *cell_thread_function(void *arg)
 
 
 /**
- * Create the SPU threads
+ * Create the SPU threads.  This is done once during driver initialization.
+ * This involves setting the the "init" message which is sent to each SPU.
+ * The init message specifies an SPU id, total number of SPUs, location
+ * and number of batch buffers, etc.
  */
 void
 cell_start_spus(struct cell_context *cell)
@@ -100,7 +115,6 @@ cell_start_spus(struct cell_context *cell)
    static boolean one_time_init = FALSE;
    uint i, j;
 
-
    if (one_time_init) {
       fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
 	      "on Cell.\n");
@@ -120,6 +134,7 @@ cell_start_spus(struct cell_context *cell)
    for (i = 0; i < cell->num_spus; i++) {
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
+      cell_global.inits[i].debug_flags = cell->debug_flags;
       cell_global.inits[i].cmd = &cell_global.command[i];
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
@@ -137,14 +152,17 @@ cell_start_spus(struct cell_context *cell)
          exit(1);
       }
       
-      pthread_create(&cell_global.spe_threads[i], NULL, &cell_thread_function,
-		     &cell_global.inits[i]);
+      pthread_create(&cell_global.spe_threads[i], /* returned thread handle */
+                     NULL,                        /* pthread attribs */
+                     &cell_thread_function,       /* start routine */
+		     &cell_global.inits[i]);      /* thread argument */
    }
 }
 
 
 /**
  * Tell all the SPUs to stop/exit.
+ * This is done when the driver's exiting / cleaning up.
  */
 void
 cell_spu_exit(struct cell_context *cell)
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index 82580ea35a..a7771a55a3 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -48,19 +48,17 @@
 #define CELL_NEW_VERTEX_INFO   0x8000
 
 
-void cell_set_vertex_elements(struct pipe_context *,
-                              unsigned count,
-                              const struct pipe_vertex_element *);
+extern void
+cell_update_derived( struct cell_context *softpipe );
 
-void cell_set_vertex_buffers(struct pipe_context *,
-                             unsigned count,
-                             const struct pipe_vertex_buffer *);
 
-void cell_update_derived( struct cell_context *softpipe );
+extern void
+cell_init_shader_functions(struct cell_context *cell);
 
 
-void
-cell_init_shader_functions(struct cell_context *cell);
+extern void
+cell_init_vertex_functions(struct cell_context *cell);
+
 
 #endif /* CELL_STATE_H */
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
index 8ab938a02a..efc4f78364 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_derived.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -35,21 +35,6 @@
 #include "cell_state_emit.h"
 
 
-static int
-find_vs_output(const struct cell_vertex_shader_state *vs,
-               uint semantic_name,
-               uint semantic_index)
-{
-   uint i;
-   for (i = 0; i < vs->info.num_outputs; i++) {
-      if (vs->info.output_semantic_name[i] == semantic_name &&
-          vs->info.output_semantic_index[i] == semantic_index)
-         return i;
-   }
-   return -1;
-}
-
-
 /**
  * Determine how to map vertex program outputs to fragment program inputs.
  * Basically, this will be used when computing the triangle interpolation
@@ -58,7 +43,6 @@ find_vs_output(const struct cell_vertex_shader_state *vs,
 static void
 calculate_vertex_layout( struct cell_context *cell )
 {
-   const struct cell_vertex_shader_state *vs = cell->vs;
    const struct cell_fragment_shader_state *fs = cell->fs;
    const enum interp_mode colorInterp
       = cell->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
@@ -82,7 +66,7 @@ calculate_vertex_layout( struct cell_context *cell )
    vinfo->num_attribs = 0;
 
    /* we always want to emit vertex pos */
-   src = find_vs_output(vs, TGSI_SEMANTIC_POSITION, 0);
+   src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_POSITION, 0);
    assert(src >= 0);
    draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src);
 
@@ -98,14 +82,14 @@ calculate_vertex_layout( struct cell_context *cell )
          break;
 
       case TGSI_SEMANTIC_COLOR:
-         src = find_vs_output(vs, TGSI_SEMANTIC_COLOR, 
-                              fs->info.input_semantic_index[i]);
+         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_COLOR, 
+                                   fs->info.input_semantic_index[i]);
          assert(src >= 0);
          draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
          break;
 
       case TGSI_SEMANTIC_FOG:
-         src = find_vs_output(vs, TGSI_SEMANTIC_FOG, 0);
+         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_FOG, 0);
 #if 1
          if (src < 0) /* XXX temp hack, try demos/fogcoord.c with this */
             src = 0;
@@ -116,7 +100,7 @@ calculate_vertex_layout( struct cell_context *cell )
 
       case TGSI_SEMANTIC_GENERIC:
          /* this includes texcoords and varying vars */
-         src = find_vs_output(vs, TGSI_SEMANTIC_GENERIC,
+         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_GENERIC,
                               fs->info.input_semantic_index[i]);
          assert(src >= 0);
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
@@ -163,6 +147,9 @@ compute_cliprect(struct cell_context *sp)
 
 
 
+/**
+ * Update derived state, send current state to SPUs prior to rendering.
+ */
 void cell_update_derived( struct cell_context *cell )
 {
    if (cell->dirty & (CELL_NEW_RASTERIZER |
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 9d88c1cf3d..180b89c1f6 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -27,6 +27,7 @@
 
 #include "util/u_memory.h"
 #include "cell_context.h"
+#include "cell_gen_fragment.h"
 #include "cell_state.h"
 #include "cell_state_emit.h"
 #include "cell_state_per_fragment.h"
@@ -47,27 +48,13 @@ emit_state_cmd(struct cell_context *cell, uint cmd,
 }
 
 
-
+/**
+ * For state marked as 'dirty', construct a state-update command block
+ * and insert it into the current batch buffer.
+ */
 void
 cell_emit_state(struct cell_context *cell)
 {
-   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) {
-      struct cell_command_logicop logicop;
-
-      if (cell->logic_op.store != NULL) {
-	 spe_release_func(& cell->logic_op);
-      }
-
-      cell_generate_logic_op(& cell->logic_op,
-			     & cell->blend->base,
-			     cell->framebuffer.cbufs[0]);
-
-      logicop.base = (intptr_t) cell->logic_op.store;
-      logicop.size = 64 * 4;
-      emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop,
-		     sizeof(logicop));
-   }
-
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
@@ -80,44 +67,33 @@ cell_emit_state(struct cell_context *cell)
       fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE;
       fb->width = cell->framebuffer.width;
       fb->height = cell->framebuffer.height;
+#if 0
+      printf("EMIT color format %s\n", pf_name(fb->color_format));
+      printf("EMIT depth format %s\n", pf_name(fb->depth_format));
+#endif
    }
 
-   if (cell->dirty & CELL_NEW_BLEND) {
-      struct cell_command_blend blend;
 
-      if (cell->blend != NULL) {
-         blend.base = (intptr_t) cell->blend->code.store;
-         blend.size = (char *) cell->blend->code.csr
-             - (char *) cell->blend->code.store;
-         blend.read_fb = TRUE;
-      } else {
-         blend.base = 0;
-         blend.size = 0;
-         blend.read_fb = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend));
-   }
-
-   if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      struct cell_command_depth_stencil_alpha_test dsat;
-      
-
-      if (cell->depth_stencil != NULL) {
-	 dsat.base = (intptr_t) cell->depth_stencil->code.store;
-	 dsat.size = (char *) cell->depth_stencil->code.csr
-	     - (char *) cell->depth_stencil->code.store;
-	 dsat.read_depth = TRUE;
-	 dsat.read_stencil = FALSE;
-      } else {
-	 dsat.base = 0;
-	 dsat.size = 0;
-	 dsat.read_depth = FALSE;
-	 dsat.read_stencil = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat,
-		     sizeof(dsat));
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
+                      CELL_NEW_DEPTH_STENCIL |
+                      CELL_NEW_BLEND)) {
+      /* XXX we don't want to always do codegen here.  We should have
+       * a hash/lookup table to cache previous results...
+       */
+      struct cell_command_fragment_ops *fops
+            = cell_batch_alloc(cell, sizeof(*fops));
+      struct spe_function spe_code;
+
+      /* generate new code */
+      gen_fragment_function(cell, &spe_code);
+      /* put the new code into the batch buffer */
+      fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      memcpy(&fops->code, spe_code.store,
+             SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      fops->dsa = cell->depth_stencil->base;
+      fops->blend = cell->blend->base;
+      /* free codegen buffer */
+      spe_release_func(&spe_code);
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
@@ -157,7 +133,8 @@ cell_emit_state(struct cell_context *cell)
       emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
                      &cell->vertex_info, sizeof(struct vertex_info));
    }
-   
+
+#if 0
    if (cell->dirty & CELL_NEW_VS) {
       const struct draw_context *const draw = cell->draw;
       struct cell_shader_info info;
@@ -170,7 +147,7 @@ cell_emit_state(struct cell_context *cell)
       info.immediates = (uintptr_t) draw->vs.machine.Imms;
       info.num_immediates = draw->vs.machine.ImmLimit / 4;
 
-      emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS,
-		     & info, sizeof(info));
+      emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info));
    }
+#endif
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index 53ae3aa50e..78cb446c14 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -132,9 +132,9 @@ emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
 
 
 /**
+ * Generate code to perform Z testing.  Four Z values are tested at once.
  * \param dsa        Current depth-test state
  * \param f          Function to which code should be appended
- * \param m          Mask of allocated / free SPE registers
  * \param mask       Index of register to contain depth-pass mask
  * \param stored     Index of register containing values from depth buffer
  * \param calculated Index of register containing per-fragment depth values
@@ -198,6 +198,7 @@ emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
 
 
 /**
+ * Generate code to apply the stencil operation (after testing).
  * \note Emits a maximum of 5 instructions.
  *
  * \warning
@@ -222,9 +223,13 @@ emit_stencil_op(struct spe_function *f,
       spe_il(f, result, ref);
       break;
    case PIPE_STENCIL_OP_INCR:
+      /* clamp = [0xff, 0xff, 0xff, 0xff] */
       spe_il(f, clamp, 0x0ff);
+      /* result[i] = in[i] + 1 */
       spe_ai(f, result, in, 1);
+      /* clamp_mask[i] = (result[i] > 0xff) */
       spe_clgti(f, clamp_mask, result, 0x0ff);
+      /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */
       spe_selb(f, result, result, clamp, clamp_mask);
       break;
    case PIPE_STENCIL_OP_DECR:
@@ -259,10 +264,10 @@ emit_stencil_op(struct spe_function *f,
 
 
 /**
+ * Generate code to do stencil test.  Four pixels are tested at once.
  * \param dsa        Depth / stencil test state
  * \param face       0 for front face, 1 for back face
  * \param f          Function to append instructions to
- * \param reg_mask   Mask of allocated registers
  * \param mask       Register containing mask of fragments passing the
  *                   alpha test
  * \param depth_mask Register containing mask of fragments passing the
@@ -310,13 +315,14 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
 
    switch (dsa->stencil[face].func) {
    case PIPE_FUNC_NEVER:
-      spe_il(f, stencil_mask, 0);
+      spe_il(f, stencil_mask, 0);   /* stencil_mask[0..3] = [0,0,0,0] */
       break;
 
    case PIPE_FUNC_NOTEQUAL:
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_EQUAL:
+      /* stencil_mask[i] = (stored[i] == ref) */
       spe_ceqi(f, stencil_mask, stored, ref);
       break;
 
@@ -324,6 +330,8 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_GREATER:
+      complement = TRUE;
+      /* stencil_mask[i] = (stored[i] > ref) */
       spe_clgti(f, stencil_mask, stored, ref);
       break;
 
@@ -331,8 +339,11 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_GEQUAL:
+      /* stencil_mask[i] = (stored[i] > ref) */
       spe_clgti(f, stencil_mask, stored, ref);
+      /* tmp[i] = (stored[i] == ref) */
       spe_ceqi(f, tmp, stored, ref);
+      /* stencil_mask[i] = stencil_mask[i] | tmp[i] */
       spe_or(f, stencil_mask, stencil_mask, tmp);
       break;
 
@@ -461,7 +472,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
     * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round
     * up to 64 to make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    /* Allocate registers for the function's input parameters.  Cleverly (and
@@ -540,7 +551,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
          spe_selb(f, depth, depth, zvals, mask);
    }
 
-   spe_bi(f, 0, 0, 0);
+   spe_bi(f, 0, 0, 0);  /* return from function call */
 
 
 #if 0
@@ -956,7 +967,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
     * + 4 (fragment mask) + 1 (return) = 55 instlructions.  Round up to 64 to
     * make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    const int frag[4] = {
@@ -1144,9 +1155,10 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
 }
 
 
-int PC_OFFSET(const struct spe_function *f, const void *d)
+static int
+PC_OFFSET(const struct spe_function *f, const void *d)
 {
-   const intptr_t pc = (intptr_t) f->csr;
+   const intptr_t pc = (intptr_t) &f->store[f->num_inst];
    const intptr_t ea = ~0x0f & (intptr_t) d;
 
    return (ea - pc) >> 2;
@@ -1178,7 +1190,7 @@ cell_generate_logic_op(struct spe_function *f,
     * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
     * to 64 to make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    /* Pixel colors in framebuffer format in AoS layout.
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 86bcad05e9..97e44eeb1a 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -53,7 +53,10 @@ cell_vertex_shader_state(void *shader)
 }
 
 
-
+/**
+ * Create fragment shader state.
+ * Called via pipe->create_fs_state()
+ */
 static void *
 cell_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
@@ -77,6 +80,9 @@ cell_create_fs_state(struct pipe_context *pipe,
 }
 
 
+/**
+ * Called via pipe->bind_fs_state()
+ */
 static void
 cell_bind_fs_state(struct pipe_context *pipe, void *fs)
 {
@@ -88,6 +94,9 @@ cell_bind_fs_state(struct pipe_context *pipe, void *fs)
 }
 
 
+/**
+ * Called via pipe->delete_fs_state()
+ */
 static void
 cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
@@ -98,6 +107,10 @@ cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 }
 
 
+/**
+ * Create vertex shader state.
+ * Called via pipe->create_vs_state()
+ */
 static void *
 cell_create_vs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
@@ -128,6 +141,9 @@ cell_create_vs_state(struct pipe_context *pipe,
 }
 
 
+/**
+ * Called via pipe->bind_vs_state()
+ */
 static void
 cell_bind_vs_state(struct pipe_context *pipe, void *vs)
 {
@@ -142,6 +158,9 @@ cell_bind_vs_state(struct pipe_context *pipe, void *vs)
 }
 
 
+/**
+ * Called via pipe->delete_vs_state()
+ */
 static void
 cell_delete_vs_state(struct pipe_context *pipe, void *vs)
 {
@@ -154,6 +173,9 @@ cell_delete_vs_state(struct pipe_context *pipe, void *vs)
 }
 
 
+/**
+ * Called via pipe->set_constant_buffer()
+ */
 static void
 cell_set_constant_buffer(struct pipe_context *pipe,
                          uint shader, uint index,
@@ -166,7 +188,7 @@ cell_set_constant_buffer(struct pipe_context *pipe,
    assert(index == 0);
 
    /* note: reference counting */
-   pipe_buffer_reference(ws,
+   winsys_buffer_reference(ws,
                         &cell->constants[shader].buffer,
                         buf->buffer);
    cell->constants[shader].size = buf->size;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
index 114684c2a3..fbe55c8472 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_vertex.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
@@ -35,7 +35,7 @@
 #include "draw/draw_context.h"
 
 
-void
+static void
 cell_set_vertex_elements(struct pipe_context *pipe,
                          unsigned count,
                          const struct pipe_vertex_element *elements)
@@ -53,7 +53,7 @@ cell_set_vertex_elements(struct pipe_context *pipe,
 }
 
 
-void
+static void
 cell_set_vertex_buffers(struct pipe_context *pipe,
                         unsigned count,
                         const struct pipe_vertex_buffer *buffers)
@@ -69,3 +69,11 @@ cell_set_vertex_buffers(struct pipe_context *pipe,
 
    draw_set_vertex_buffers(cell->draw, count, buffers);
 }
+
+
+void
+cell_init_vertex_functions(struct cell_context *cell)
+{
+   cell->pipe.set_vertex_buffers = cell_set_vertex_buffers;
+   cell->pipe.set_vertex_elements = cell_set_vertex_elements;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
index d9e3b510dc..732c64082e 100644
--- a/src/gallium/drivers/cell/ppu/cell_surface.c
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -25,108 +25,13 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
-#include "pipe/p_winsys.h"
-#include "util/u_memory.h"
 #include "util/u_rect.h"
-#include "util/u_tile.h"
-
 #include "cell_context.h"
-#include "cell_surface.h"
-
-
-static void
-cell_surface_copy(struct pipe_context *pipe,
-                  boolean do_flip,
-                  struct pipe_surface *dst,
-                  unsigned dstx, unsigned dsty,
-                  struct pipe_surface *src,
-                  unsigned srcx, unsigned srcy,
-                  unsigned width, unsigned height)
-{
-   assert( dst->cpp == src->cpp );
-
-   pipe_copy_rect(pipe_surface_map(dst, PIPE_BUFFER_USAGE_CPU_WRITE),
-                  &dst->block,
-                  dst->stride,
-                  dstx, dsty,
-                  width, height,
-                  pipe_surface_map(src, PIPE_BUFFER_USAGE_CPU_READ),
-                  do_flip ? -src->stride : src->stride,
-                  srcx, do_flip ? height - 1 - srcy : srcy);
-
-   pipe_surface_unmap(src);
-   pipe_surface_unmap(dst);
-}
-
-
-static void *
-get_pointer(struct pipe_surface *dst, void *dst_map, unsigned x, unsigned y)
-{
-   return (char *)dst_map + y / dst->block.height * dst->stride + x / dst->block.width * dst->block.size;
-}
-
-
-#define UBYTE_TO_USHORT(B) ((B) | ((B) << 8))
-
-
-/**
- * Fill a rectangular sub-region.  Need better logic about when to
- * push buffers into AGP - will currently do so whenever possible.
- */
-static void
-cell_surface_fill(struct pipe_context *pipe,
-                  struct pipe_surface *dst,
-                  unsigned dstx, unsigned dsty,
-                  unsigned width, unsigned height, unsigned value)
-{
-   unsigned i, j;
-   void *dst_map = pipe_surface_map(dst, PIPE_BUFFER_USAGE_CPU_WRITE);
-
-   assert(dst->stride > 0);
-
-   switch (dst->block.size) {
-   case 1:
-   case 2:
-   case 4:
-      pipe_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value);
-      break;
-   case 8:
-      {
-         /* expand the 4-byte clear value to an 8-byte value */
-         ushort *row = (ushort *) get_pointer(dst, dst_map, dstx, dsty);
-         ushort val0 = UBYTE_TO_USHORT((value >>  0) & 0xff);
-         ushort val1 = UBYTE_TO_USHORT((value >>  8) & 0xff);
-         ushort val2 = UBYTE_TO_USHORT((value >> 16) & 0xff);
-         ushort val3 = UBYTE_TO_USHORT((value >> 24) & 0xff);
-         val0 = (val0 << 8) | val0;
-         val1 = (val1 << 8) | val1;
-         val2 = (val2 << 8) | val2;
-         val3 = (val3 << 8) | val3;
-         for (i = 0; i < height; i++) {
-            for (j = 0; j < width; j++) {
-               row[j*4+0] = val0;
-               row[j*4+1] = val1;
-               row[j*4+2] = val2;
-               row[j*4+3] = val3;
-            }
-            row += dst->stride/2;
-         }
-      }
-      break;
-   default:
-      assert(0);
-      break;
-   }
-
-   pipe_surface_unmap( dst );
-}
 
 
 void
 cell_init_surface_functions(struct cell_context *cell)
 {
-   cell->pipe.surface_copy = cell_surface_copy;
-   cell->pipe.surface_fill = cell_surface_fill;
+   cell->pipe.surface_copy = util_surface_copy;
+   cell->pipe.surface_fill = util_surface_fill;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 5a0942bbd6..b6590dfb86 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -63,19 +63,30 @@ cell_texture_layout(struct cell_texture * spt)
    spt->buffer_size = 0;
 
    for ( level = 0 ; level <= pt->last_level ; level++ ) {
+      unsigned size;
+      unsigned w_tile, h_tile;
+
+      /* width, height, rounded up to tile size */
+      w_tile = align(width, TILE_SIZE);
+      h_tile = align(height, TILE_SIZE);
+
       pt->width[level] = width;
       pt->height[level] = height;
       pt->depth[level] = depth;
-      pt->nblocksx[level] = pf_get_nblocksx(&pt->block, width);  
-      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height);  
+      pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);  
+      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);  
 
       spt->stride[level] = pt->nblocksx[level] * pt->block.size;
 
       spt->level_offset[level] = spt->buffer_size;
 
-      spt->buffer_size += (pt->nblocksy[level] *
-			  ((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) *
-                           pt->nblocksx[level] * pt->block.size);
+      size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
+      if (pt->target == PIPE_TEXTURE_CUBE)
+         size *= 6;
+      else
+         size *= depth;
+
+      spt->buffer_size += size;
 
       width  = minify(width);
       height = minify(height);
@@ -85,8 +96,8 @@ cell_texture_layout(struct cell_texture * spt)
 
 
 static struct pipe_texture *
-cell_texture_create_screen(struct pipe_screen *screen,
-                           const struct pipe_texture *templat)
+cell_texture_create(struct pipe_screen *screen,
+                    const struct pipe_texture *templat)
 {
    struct pipe_winsys *ws = screen->winsys;
    struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
@@ -113,8 +124,8 @@ cell_texture_create_screen(struct pipe_screen *screen,
 
 
 static void
-cell_texture_release_screen(struct pipe_screen *screen,
-                            struct pipe_texture **pt)
+cell_texture_release(struct pipe_screen *screen,
+                     struct pipe_texture **pt)
 {
    if (!*pt)
       return;
@@ -130,7 +141,7 @@ cell_texture_release_screen(struct pipe_screen *screen,
       DBG("%s deleting %p\n", __FUNCTION__, (void *) spt);
       */
 
-      pipe_buffer_reference(screen->winsys, &spt->buffer, NULL);
+      pipe_buffer_reference(screen, &spt->buffer, NULL);
 
       FREE(spt);
    }
@@ -138,6 +149,7 @@ cell_texture_release_screen(struct pipe_screen *screen,
 }
 
 
+#if 0
 static void
 cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
                     uint face, uint levelsMask)
@@ -145,13 +157,14 @@ cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
    /* XXX TO DO:  re-tile the texture data ... */
 
 }
+#endif
 
 
 static struct pipe_surface *
-cell_get_tex_surface_screen(struct pipe_screen *screen,
-                            struct pipe_texture *pt,
-                            unsigned face, unsigned level, unsigned zslice,
-                            unsigned usage)
+cell_get_tex_surface(struct pipe_screen *screen,
+                     struct pipe_texture *pt,
+                     unsigned face, unsigned level, unsigned zslice,
+                     unsigned usage)
 {
    struct pipe_winsys *ws = screen->winsys;
    struct cell_texture *spt = cell_texture(pt);
@@ -161,7 +174,7 @@ cell_get_tex_surface_screen(struct pipe_screen *screen,
    if (ps) {
       assert(ps->refcount);
       assert(ps->winsys);
-      pipe_buffer_reference(ws, &ps->buffer, spt->buffer);
+      winsys_buffer_reference(ws, &ps->buffer, spt->buffer);
       ps->format = pt->format;
       ps->block = pt->block;
       ps->width = pt->width[level];
@@ -174,12 +187,17 @@ cell_get_tex_surface_screen(struct pipe_screen *screen,
 
       /* XXX may need to override usage flags (see sp_texture.c) */
 
+      pipe_texture_reference(&ps->texture, pt); 
+      ps->face = face;
+      ps->level = level;
+      ps->zslice = zslice;
 
       if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
 	 ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
 		       ps->nblocksy *
 		       ps->stride;
-      } else {
+      }
+      else {
 	 assert(face == 0);
 	 assert(zslice == 0);
       }
@@ -189,6 +207,11 @@ cell_get_tex_surface_screen(struct pipe_screen *screen,
 
 
 
+/**
+ * Copy tile data from linear layout to tiled layout.
+ * XXX this should be rolled into the future surface-creation code.
+ * XXX also need "untile" code...
+ */
 static void
 tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
 {
@@ -219,6 +242,7 @@ tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
 
 /**
  * Convert linear texture image data to tiled format for SPU usage.
+ * XXX recast this in terms of pipe_surfaces (aka texture views).
  */
 static void
 cell_tile_texture(struct cell_context *cell,
@@ -285,6 +309,21 @@ cell_update_texture_mapping(struct cell_context *cell)
 }
 
 
+static void 
+cell_tex_surface_release(struct pipe_screen *screen, 
+                         struct pipe_surface **s)
+{
+   /* Effectively do the texture_update work here - if texture images
+    * needed post-processing to put them into hardware layout, this is
+    * where it would happen.  For softpipe, nothing to do.
+    */
+   assert ((*s)->texture);
+   pipe_texture_reference(&(*s)->texture, NULL); 
+
+   screen->winsys->surface_release(screen->winsys, s);
+}
+
+
 static void *
 cell_surface_map( struct pipe_screen *screen,
                   struct pipe_surface *surface,
@@ -297,7 +336,7 @@ cell_surface_map( struct pipe_screen *screen,
       return NULL;
    }
 
-   map = screen->winsys->buffer_map( screen->winsys, surface->buffer, flags );
+   map = pipe_buffer_map( screen, surface->buffer, flags );
    if (map == NULL)
       return NULL;
 
@@ -323,7 +362,7 @@ static void
 cell_surface_unmap(struct pipe_screen *screen,
                    struct pipe_surface *surface)
 {
-   screen->winsys->buffer_unmap( screen->winsys, surface->buffer );
+   pipe_buffer_unmap( screen, surface->buffer );
 }
 
 
@@ -333,12 +372,15 @@ cell_init_texture_functions(struct cell_context *cell)
    /*cell->pipe.texture_update = cell_texture_update;*/
 }
 
+
 void
 cell_init_screen_texture_funcs(struct pipe_screen *screen)
 {
-   screen->texture_create = cell_texture_create_screen;
-   screen->texture_release = cell_texture_release_screen;
-   screen->get_tex_surface = cell_get_tex_surface_screen;
+   screen->texture_create = cell_texture_create;
+   screen->texture_release = cell_texture_release;
+
+   screen->get_tex_surface = cell_get_tex_surface;
+   screen->tex_surface_release = cell_tex_surface_release;
 
    screen->surface_map = cell_surface_map;
    screen->surface_unmap = cell_surface_unmap;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index e4230c7a5f..aa63435b93 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -26,6 +26,11 @@
  **************************************************************************/
 
 /**
+ * Vertex buffer code.  The draw module transforms vertices to window
+ * coords, etc. and emits the vertices into buffer supplied by this module.
+ * When a vertex buffer is full, or we flush, we'll send the vertex data
+ * to the SPUs.
+ *
  * Authors
  *  Brian Paul
  */
@@ -113,7 +118,7 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
    }
 
    cvbr->vertex_buf = ~0;
-   cell_flush_int(&cell->pipe, 0x0);
+   cell_flush_int(cell, 0x0);
 
    assert(vertices == cvbr->vertex_buffer);
    cvbr->vertex_buffer = NULL;
@@ -121,12 +126,13 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
 
 
 
-static void
+static boolean
 cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    cvbr->prim = prim;
    /*printf("cell_set_prim %u\n", prim);*/
+   return TRUE;
 }
 
 
@@ -244,7 +250,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
 #if 0
    /* helpful for debug */
-   cell_flush_int(&cell->pipe, CELL_FLUSH_WAIT);
+   cell_flush_int(cell, CELL_FLUSH_WAIT);
 #endif
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 2ece0250f6..566df7f59e 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -297,10 +297,9 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 
 
    /* Each fetch function can be a maximum of 34 instructions (note: this is
-    * actually a slight over-estimate).  That means (34 * 4) = 136 bytes
-    * each maximum.
+    * actually a slight over-estimate).
     */
-   spe_init_func(p, 136 * unique_attr_formats);
+   spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats);
 
 
    /* Allocate registers for the function's input parameters.
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
index 3658947715..2b10c116fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -135,7 +135,7 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
       vs->num_elts = n;
       send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE);
 
-      cell_flush_int(& cell->pipe, CELL_FLUSH_WAIT);
+      cell_flush_int(cell, CELL_FLUSH_WAIT);
    }
 
    draw->vs.post_nr = draw->vs.queue_nr;
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 8e83610790..1ae0dfb8c1 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -5,7 +5,7 @@
 
 
 TOP = ../../../../..
-include $(TOP)/configs/linux-cell
+include $(TOP)/configs/current
 
 
 PROG = g3d
@@ -22,12 +22,15 @@ SOURCES = \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
-	spu_tri.c \
+	spu_tri.c
+
+OLD_SOURCES = \
 	spu_exec.c \
 	spu_util.c \
 	spu_vertex_fetch.c \
 	spu_vertex_shader.c
 
+
 SPU_OBJECTS = $(SOURCES:.c=.o) \
 
 SPU_ASM_OUT = $(SOURCES:.c=.s) \
@@ -43,7 +46,7 @@ INCLUDE_DIRS = \
 	$(SPU_CC) $(SPU_CFLAGS) -c $<
 
 .c.s:
-	$(SPU_CC) $(SPU_CFLAGS) -S $<
+	$(SPU_CC) $(SPU_CFLAGS) -O3 -S $<
 
 
 # The .a file will be linked into the main/PPU executable
diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index e9fee8a3a6..fd8dc6ded3 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -79,14 +79,14 @@ spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
 
 
 static INLINE vector float
-spu_unpack_color(uint color)
+spu_unpack_B8G8R8A8(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             0, 0, 0, 0,
-                             5, 5, 5, 5,
                              10, 10, 10, 10,
+                             5, 5, 5, 5,
+                             0, 0, 0, 0,
                              15, 15, 15, 15}) );
    return spu_convtf(color_u4, 32);
 }
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 89c61136a4..e27df2dfb3 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -382,10 +382,10 @@ fetch_src_file_channel(
          break;
 
       case TGSI_FILE_IMMEDIATE:
-         assert( index->i[0] < (int) mach->ImmLimit );
-         assert( index->i[1] < (int) mach->ImmLimit );
-         assert( index->i[2] < (int) mach->ImmLimit );
-         assert( index->i[3] < (int) mach->ImmLimit );
+         ASSERT( index->i[0] < (int) mach->ImmLimit );
+         ASSERT( index->i[1] < (int) mach->ImmLimit );
+         ASSERT( index->i[2] < (int) mach->ImmLimit );
+         ASSERT( index->i[3] < (int) mach->ImmLimit );
 
          chan->f[0] = mach->Imms[index->i[0]][swizzle];
          chan->f[1] = mach->Imms[index->i[1]][swizzle];
@@ -409,7 +409,7 @@ fetch_src_file_channel(
          break;
 
       default:
-         assert( 0 );
+         ASSERT( 0 );
       }
       break;
 
@@ -422,7 +422,7 @@ fetch_src_file_channel(
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
@@ -471,7 +471,7 @@ fetch_source(
          index.q = si_shli(index.q, 12);
          break;
       default:
-         assert( 0 );
+         ASSERT( 0 );
       }
 
       index.i[0] += reg->SrcRegisterDim.Index;
@@ -558,7 +558,7 @@ store_dest(
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
       return;
    }
 
@@ -582,11 +582,11 @@ store_dest(
       break;
 
    case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
@@ -769,7 +769,7 @@ exec_tex(struct spu_exec_machine *mach,
       break;
 
    default:
-      assert (0);
+      ASSERT (0);
    }
 
    FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -861,7 +861,7 @@ exec_declaration(struct spu_exec_machine *mach,
             break;
 
          default:
-            assert( 0 );
+            ASSERT( 0 );
          }
 
          if( mask == TGSI_WRITEMASK_XYZW ) {
@@ -971,11 +971,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_EXP:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_LOG:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_MUL:
@@ -1151,24 +1151,24 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_CND:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CND0:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_DOT2ADD:
       /* TGSI_OPCODE_DP2A */
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_INDEX:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_NEGATE:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_FRAC:
@@ -1181,7 +1181,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_CLAMP:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_FLOOR:
@@ -1276,7 +1276,7 @@ exec_instruction(
       break;
 
     case TGSI_OPCODE_MULTIPLYMATRIX:
-       assert (0);
+       ASSERT (0);
        break;
 
     case TGSI_OPCODE_ABS:
@@ -1290,7 +1290,7 @@ exec_instruction(
        break;
 
    case TGSI_OPCODE_RCC:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_DPH:
@@ -1353,23 +1353,23 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_PK2H:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_PK2US:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_PK4B:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_PK4UB:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_RFL:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_SEQ:
@@ -1384,7 +1384,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SFL:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_SGT:
@@ -1429,7 +1429,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_STR:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TEX:
@@ -1452,7 +1452,7 @@ exec_instruction(
       /* src[1] = d[strq]/dx */
       /* src[2] = d[strq]/dy */
       /* src[3] = sampler unit */
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TXL:
@@ -1470,35 +1470,35 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_UP2H:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_UP2US:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_UP4B:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_UP4UB:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_X2D:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_ARA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_ARR:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_BRA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CAL:
@@ -1507,14 +1507,14 @@ exec_instruction(
          /* do the call */
 
          /* push the Cond, Loop, Cont stacks */
-         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
-         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
-         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
 
-         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
 
          /* note that PC was already incremented above */
@@ -1538,13 +1538,13 @@ exec_instruction(
          *pc = mach->CallStack[--mach->CallStackTop];
 
          /* pop the Cond, Loop, Cont stacks */
-         assert(mach->CondStackTop > 0);
+         ASSERT(mach->CondStackTop > 0);
          mach->CondMask = mach->CondStack[--mach->CondStackTop];
-         assert(mach->LoopStackTop > 0);
+         ASSERT(mach->LoopStackTop > 0);
          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
-         assert(mach->ContStackTop > 0);
+         ASSERT(mach->ContStackTop > 0);
          mach->ContMask = mach->ContStack[--mach->ContStackTop];
-         assert(mach->FuncStackTop > 0);
+         ASSERT(mach->FuncStackTop > 0);
          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
 
          UPDATE_EXEC_MASK(mach);
@@ -1552,7 +1552,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CMP:
@@ -1592,11 +1592,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_NRM:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_DIV:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_DP2:
@@ -1615,7 +1615,7 @@ exec_instruction(
 
    case TGSI_OPCODE_IF:
       /* push CondMask */
-      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
       FETCH( &r[0], 0, CHAN_X );
       /* update CondMask */
@@ -1639,7 +1639,7 @@ exec_instruction(
       /* invert CondMask wrt previous mask */
       {
          uint prevMask;
-         assert(mach->CondStackTop > 0);
+         ASSERT(mach->CondStackTop > 0);
          prevMask = mach->CondStack[mach->CondStackTop - 1];
          mach->CondMask = ~mach->CondMask & prevMask;
          UPDATE_EXEC_MASK(mach);
@@ -1649,7 +1649,7 @@ exec_instruction(
 
    case TGSI_OPCODE_ENDIF:
       /* pop CondMask */
-      assert(mach->CondStackTop > 0);
+      ASSERT(mach->CondStackTop > 0);
       mach->CondMask = mach->CondStack[--mach->CondStackTop];
       UPDATE_EXEC_MASK(mach);
       break;
@@ -1660,19 +1660,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_REP:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_ENDREP:
-       assert (0);
+       ASSERT (0);
        break;
 
    case TGSI_OPCODE_PUSHA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_POPA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CEIL:
@@ -1746,7 +1746,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MOD:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_XOR:
@@ -1759,15 +1759,15 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SAD:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TXF:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TXQ:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_EMIT:
@@ -1784,9 +1784,9 @@ exec_instruction(
       /* fall-through (for now) */
    case TGSI_OPCODE_BGNLOOP2:
       /* push LoopMask and ContMasks */
-      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
-      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       break;
 
@@ -1794,7 +1794,7 @@ exec_instruction(
       /* fall-through (for now at least) */
    case TGSI_OPCODE_ENDLOOP2:
       /* Restore ContMask, but don't pop */
-      assert(mach->ContStackTop > 0);
+      ASSERT(mach->ContStackTop > 0);
       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
       if (mach->LoopMask) {
          /* repeat loop: jump to instruction just past BGNLOOP */
@@ -1802,10 +1802,10 @@ exec_instruction(
       }
       else {
          /* exit loop: pop LoopMask */
-         assert(mach->LoopStackTop > 0);
+         ASSERT(mach->LoopStackTop > 0);
          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
          /* pop ContMask */
-         assert(mach->ContStackTop > 0);
+         ASSERT(mach->ContStackTop > 0);
          mach->ContMask = mach->ContStack[--mach->ContStackTop];
       }
       UPDATE_EXEC_MASK(mach);
@@ -1834,26 +1834,26 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_NOISE1:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOISE2:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOISE3:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOISE4:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOP:
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
@@ -1874,11 +1874,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    mach->FuncMask = 0xf;
    mach->ExecMask = 0xf;
 
-   mach->CondStackTop = 0; /* temporarily subvert this assertion */
-   assert(mach->CondStackTop == 0);
-   assert(mach->LoopStackTop == 0);
-   assert(mach->ContStackTop == 0);
-   assert(mach->CallStackTop == 0);
+   mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */
+   ASSERT(mach->CondStackTop == 0);
+   ASSERT(mach->LoopStackTop == 0);
+   ASSERT(mach->ContStackTop == 0);
+   ASSERT(mach->CallStackTop == 0);
 
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index e04ffeb9b1..2a7cb75f59 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -34,6 +34,7 @@
 
 #include "spu_main.h"
 #include "spu_render.h"
+#include "spu_per_fragment_op.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
 //#include "spu_test.h"
@@ -46,7 +47,7 @@
 /*
 helpful headers:
 /usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h
-/opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
+/opt/cell/sdk/usr/include/libmisc.h
 */
 
 boolean Debug = FALSE;
@@ -55,17 +56,13 @@ struct spu_global spu;
 
 struct spu_vs_context draw;
 
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
-    ALIGN16_ATTRIB;
 
-static unsigned char depth_stencil_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
-
-static unsigned char fb_blend_code_buffer[4 * 64]
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
     ALIGN16_ATTRIB;
 
-static unsigned char logicop_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
 
 
 /**
@@ -136,54 +133,75 @@ really_clear_tiles(uint surfaceIndex)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   uint i;
-
    if (Debug)
       printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
              clear->surface, clear->value);
 
-#define CLEAR_OPT 1
-#if CLEAR_OPT
-   /* set all tile's status to CLEAR */
    if (clear->surface == 0) {
-      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
       spu.fb.color_clear_value = clear->value;
+      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+            (spu.init.id << 20) | (spu.init.id << 28);
+         spu.fb.color_clear_value ^= x;
+      }
    }
    else {
-      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
       spu.fb.depth_clear_value = clear->value;
    }
-   return;
-#endif
 
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+   /* Simply set all tiles' status to CLEAR.
+    * When we actually begin rendering into a tile, we'll initialize it to
+    * the clear value.  If any tiles go untouched during the frame,
+    * really_clear_tiles() will set them to the clear value.
+    */
    if (clear->surface == 0) {
-      spu.fb.color_clear_value = clear->value;
-      clear_c_tile(&spu.ctile);
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
    }
    else {
-      spu.fb.depth_clear_value = clear->value;
-      clear_z_tile(&spu.ztile);
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
    }
 
+#else
+
+   /*
+    * This path clears the whole framebuffer to the clear color right now.
+    */
+
    /*
    printf("SPU: %s num=%d w=%d h=%d\n",
           __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
    */
 
-   for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-      uint tx = i % spu.fb.width_tiles;
-      uint ty = i / spu.fb.width_tiles;
-      if (clear->surface == 0)
-         put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
-      else
-         put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
-      /* XXX we don't want this here, but it fixes bad tile results */
+   /* init a single tile to the clear value */
+   if (clear->surface == 0) {
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      clear_z_tile(&spu.ztile);
    }
 
-#if 0
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
-#endif
+   /* walk over my tiles, writing the 'clear' tile's data */
+   {
+      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+      uint i;
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (clear->surface == 0)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         else
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+      wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   }
+
+#endif /* CLEAR_OPT */
 
    if (Debug)
       printf("SPU %u: CLEAR SURF done\n", spu.init.id);
@@ -201,6 +219,31 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 }
 
 
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   if (Debug)
+      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Copy state info */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
+   /* Point function pointer at new code */
+   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+
+   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -227,87 +270,24 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 
    switch (spu.fb.depth_format) {
    case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
       break;
    case PIPE_FORMAT_Z16_UNORM:
       spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
       break;
    default:
       spu.fb.zsize = 0;
       break;
    }
-
-   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              12, 0, 4, 8, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              8, 4, 0, 12, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else
-      ASSERT(0);
-}
-
-
-static void
-cmd_state_blend(const struct cell_command_blend *state)
-{
-   if (Debug)
-      printf("SPU %u: BLEND: enabled %d\n",
-             spu.init.id,
-             (state->size != 0));
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(fb_blend_code_buffer,
-              (unsigned int) state->base,  /* src */
-              ROUNDUP16(state->size),
-              TAG_BATCH_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-      spu.blend = (blend_func) fb_blend_code_buffer;
-      spu.read_fb = state->read_fb;
-   } else {
-      spu.read_fb = FALSE;
-   }
-}
-
-
-static void
-cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state)
-{
-   if (Debug)
-      printf("SPU %u: DEPTH_STENCIL: ztest %d\n",
-             spu.init.id,
-             state->read_depth);
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(depth_stencil_code_buffer,
-	      (unsigned int) state->base,  /* src */
-	      ROUNDUP16(state->size),
-	      TAG_BATCH_BUFFER,
-	      0, /* tid */
-	      0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-   } else {
-      /* If there is no code, emit a return instruction.
-       */
-      depth_stencil_code_buffer[0] = 0x35;
-      depth_stencil_code_buffer[1] = 0x00;
-      depth_stencil_code_buffer[2] = 0x00;
-      depth_stencil_code_buffer[3] = 0x00;
-   }
-
-   spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
-   spu.read_depth = state->read_depth;
-   spu.read_stencil = state->read_stencil;
 }
 
 
@@ -381,6 +361,21 @@ cmd_state_vs_array_info(const struct cell_array_info *vs_info)
 
 
 static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
+static void
 cmd_finish(void)
 {
    if (Debug)
@@ -395,7 +390,9 @@ cmd_finish(void)
 
 
 /**
- * Execute a batch of commands
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
  * The opcode param encodes the location of the buffer and its size.
  */
 static void
@@ -432,16 +429,14 @@ cmd_batch(uint opcode)
       printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
    release_buffer(buf);
 
+   /*
+    * Loop over commands in the batch buffer
+    */
    for (pos = 0; pos < usize; /* no incr */) {
       switch (buffer[pos]) {
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         {
-            struct cell_command_framebuffer *fb
-               = (struct cell_command_framebuffer *) &buffer[pos];
-            cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
-         }
-         break;
+      /*
+       * rendering commands
+       */
       case CELL_CMD_CLEAR_SURFACE:
          {
             struct cell_command_clear_surface *clr
@@ -459,26 +454,24 @@ cmd_batch(uint opcode)
             pos += pos_incr;
          }
          break;
-      case CELL_CMD_RELEASE_VERTS:
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
          {
-            struct cell_command_release_verts *release
-               = (struct cell_command_release_verts *) &buffer[pos];
-            cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
          }
          break;
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         pos += 1;
-         break;
-      case CELL_CMD_STATE_BLEND:
-         cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8);
-         break;
-      case CELL_CMD_STATE_DEPTH_STENCIL:
-         cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *)
-                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8);
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            pos += sizeof(*fops) / 8;
+         }
          break;
       case CELL_CMD_STATE_SAMPLER:
          {
@@ -514,42 +507,32 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       case CELL_CMD_STATE_BIND_VS:
+#if 0
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
+#endif
          pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
          break;
-      case CELL_CMD_STATE_ATTRIB_FETCH: {
-         struct cell_attribute_fetch_code *code =
-             (struct cell_attribute_fetch_code *) &buffer[pos+1];
-
-              mfc_get(attribute_fetch_code_buffer,
-                      (unsigned int) code->base,  /* src */
-                      code->size,
-                      TAG_BATCH_BUFFER,
-                      0, /* tid */
-                      0  /* rid */);
-         wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-         draw.vertex_fetch.code = attribute_fetch_code_buffer;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
-      }
-      case CELL_CMD_STATE_LOGICOP: {
-         struct cell_command_logicop *code =
-             (struct cell_command_logicop *) &buffer[pos+1];
-
-              mfc_get(logicop_code_buffer,
-                      (unsigned int) code->base,  /* src */
-                      code->size,
-                      TAG_BATCH_BUFFER,
-                      0, /* tid */
-                      0  /* rid */);
-         wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-	 spu.logicop = (logicop_func) logicop_code_buffer;
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
          break;
-      }
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
@@ -618,7 +601,9 @@ main_loop(void)
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
+#if 0
          spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
          break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
@@ -643,6 +628,11 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
+
+   /* Install default/fallback fragment processing function.
+    * This will normally be overriden by a code-gen'd function.
+    */
+   spu.fragment_ops = spu_fallback_fragment_ops;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e962e1426c..d40539da83 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,10 @@
 #define MAX_HEIGHT 1024
 
 
+/**
+ * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
+ * The data may be addressed through several different types.
+ */
 typedef union {
    ushort us[TILE_SIZE][TILE_SIZE];
    uint   ui[TILE_SIZE][TILE_SIZE];
@@ -56,38 +60,23 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
-struct spu_frag_test_results {
-   qword mask;
-   qword depth;
-   qword stencil;
-};
-
-typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask,
-    qword pixel_depth, qword pixel_stencil, qword frag_depth,
-    qword frag_alpha, qword facing);
-
-
-struct spu_blend_results {
-   qword r;
-   qword g;
-   qword b;
-   qword a;
-};
+/** Function for sampling textures */
+typedef vector float (*spu_sample_texture_func)(uint unit,
+                                                vector float texcoord);
 
-typedef struct spu_blend_results (*blend_func)(
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword const_r, qword const_g, qword const_b, qword const_a);
+/** Function for performing per-fragment ops */
+typedef void (*spu_fragment_ops_func)(uint x, uint y,
+                                      tile_t *colorTile,
+                                      tile_t *depthStencilTile,
+                                      vector float fragZ,
+                                      vector float fragRed,
+                                      vector float fragGreen,
+                                      vector float fragBlue,
+                                      vector float fragAlpha,
+                                      vector unsigned int mask);
 
-typedef struct spu_blend_results (*logicop_func)(
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword frag_mask);
-
-
-typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
-
-struct spu_framebuffer {
+struct spu_framebuffer
+{
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
    enum pipe_format color_format;
@@ -99,6 +88,7 @@ struct spu_framebuffer {
    uint depth_clear_value;
 
    uint zsize;                     /**< 0, 2 or 4 bytes per Z */
+   float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
 } ALIGN16_ATTRIB;
 
 
@@ -115,35 +105,31 @@ struct spu_texture
 
 
 /**
- * All SPU global/context state will be in singleton object of this type:
+ * All SPU global/context state will be in a singleton object of this type:
  */
 struct spu_global
 {
+   /** One-time init/constant info */
    struct cell_init_info init;
 
+   /*
+    * Current state
+    */
    struct spu_framebuffer fb;
-   boolean read_depth;
-   boolean read_stencil;
-   frag_test_func frag_test;
-   
-   boolean read_fb;
-   blend_func blend;
-   qword const_blend_color[4] ALIGN16_ATTRIB;
-
-   logicop_func logicop;
-
+   struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+   struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-
    struct vertex_info vertex_info;
 
-   /* XXX more state to come */
-
-
-   /** current color and Z tiles */
+   /** Current color and Z tiles */
    tile_t ctile ALIGN16_ATTRIB;
    tile_t ztile ALIGN16_ATTRIB;
 
+   /** Read depth/stencil tiles? */
+   boolean read_depth;
+   boolean read_stencil;
+
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
 
@@ -151,11 +137,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+   /** Current fragment ops machine code */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   /** Current fragment ops function */
+   spu_fragment_ops_func fragment_ops;
 
-   /** for converting RGBA to PIPE_FORMAT_x colors */
-   vector unsigned char color_shuffle;
-
-   sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   /** Current texture sampler function */
+   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index b4cffeeb32..03dd547845 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -1,211 +1,475 @@
-/*
- * (C) Copyright IBM Corporation 2008
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 /**
- * \file spu_per_fragment_op.c
- * SPU implementation various per-fragment operations.
- *
- * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
+
+#include <transpose_matrix4x4.h>
 #include "pipe/p_format.h"
 #include "spu_main.h"
+#include "spu_colorpack.h"
 #include "spu_per_fragment_op.h"
 
-#define ZERO 0x80
-
-static void
-read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
-             enum pipe_format depth_format, qword *depth,
-             qword *stencil)
-{
-   const int ix = x / 2;
-   const int iy = y / 2;
-
-   switch (depth_format) {
-   case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
-
-      const qword shuf_vec = (qword) {
-         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
-         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7
-      };
 
+#define LINEAR_QUAD_LAYOUT 1
 
-      /* At even X values we want the first 4 shorts, and at odd X values we
-       * want the second 4 shorts.
-       */
-      qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3));
-      qword bias_mask = si_fsmbi(0x3333);
-      qword sv = si_a(shuf_vec, si_and(bias_mask, bias));
-
-      *depth = si_shufb(*ptr, *ptr, sv);
-      *stencil = si_il(0);
-      break;
-   }
-
-
-   case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-
-      *depth = *ptr;
-      *stencil = si_il(0);
-      break;
-   }
-      
 
-   case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      qword mask = si_fsmbi(0xEEEE);
-
-      *depth = si_rotmai(si_and(*ptr, mask), -8);
-      *stencil = si_andc(*ptr, mask);
-      break;
+/**
+ * Called by rasterizer for each quad after the shader has run.  Do
+ * all the per-fragment operations including alpha test, z test,
+ * stencil test, blend, colormask and logicops.  This is a
+ * fallback/debug function.  In reality we'll use a generated function
+ * produced by the PPU.  But this function is useful for
+ * debug/validation.
+ */
+void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragR,
+                          vector float fragG,
+                          vector float fragB,
+                          vector float fragA,
+                          vector unsigned int mask)
+{
+   vector float frag_aos[4];
+   unsigned int c0, c1, c2, c3;
+
+   /* do alpha test */
+   if (spu.depth_stencil_alpha.alpha.enabled) {
+      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
+      vector unsigned int amask;
+
+      switch (spu.depth_stencil_alpha.alpha.func) {
+      case PIPE_FUNC_LESS:
+         amask = spu_cmpgt(ref, fragA);  /* mask = (fragA < ref) */
+         break;
+      case PIPE_FUNC_GREATER:
+         amask = spu_cmpgt(fragA, ref);  /* mask = (fragA > ref) */
+         break;
+      case PIPE_FUNC_GEQUAL:
+         amask = spu_cmpgt(ref, fragA);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_LEQUAL:
+         amask = spu_cmpgt(fragA, ref);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_EQUAL:
+         amask = spu_cmpeq(ref, fragA);
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         amask = spu_cmpeq(ref, fragA);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_ALWAYS:
+         amask = spu_splats(0xffffffffU);
+         break;
+      case PIPE_FUNC_NEVER:
+         amask = spu_splats( 0x0U);
+         break;
+      default:
+         ;
+      }
+
+      mask = spu_and(mask, amask);
    }
 
-
-   case PIPE_FORMAT_S8Z24_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-
-      *depth = si_and(*ptr, si_fsmbi(0x7777));
-      *stencil = si_andi(si_roti(*ptr, 8), 0x0ff);
-      break;
+   /* Z and/or stencil testing... */
+   if (spu.depth_stencil_alpha.depth.enabled ||
+       spu.depth_stencil_alpha.stencil[0].enabled) {
+
+      /* get four Z/Stencil values from tile */
+      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU);
+      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2];
+      vector unsigned int ifbZ = spu_and(ifbZS, mask24);
+      vector unsigned int ifbS = spu_andc(ifbZS, mask24);
+
+      if (spu.depth_stencil_alpha.stencil[0].enabled) {
+         /* do stencil test */
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM);
+
+      }
+      else if (spu.depth_stencil_alpha.depth.enabled) {
+         /* do depth test */
+
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM ||
+                spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM);
+
+         vector unsigned int ifragZ;
+         vector unsigned int zmask;
+
+         /* convert four fragZ from float to uint */
+         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff));
+         ifragZ = spu_convtu(fragZ, 0);
+
+         /* do depth comparison, setting zmask with results */
+         switch (spu.depth_stencil_alpha.depth.func) {
+         case PIPE_FUNC_LESS:
+            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */
+            break;
+         case PIPE_FUNC_GREATER:
+            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifbZ > ifragZ) */
+            break;
+         case PIPE_FUNC_GEQUAL:
+            zmask = spu_cmpgt(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_LEQUAL:
+            zmask = spu_cmpgt(ifragZ, ifbZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_EQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            break;
+         case PIPE_FUNC_NOTEQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_ALWAYS:
+            zmask = spu_splats(0xffffffffU);
+            break;
+         case PIPE_FUNC_NEVER:
+            zmask = spu_splats( 0x0U);
+            break;
+         default:
+            ;
+         }
+
+         mask = spu_and(mask, zmask);
+
+         /* merge framebuffer Z and fragment Z according to the mask */
+         ifbZ = spu_or(spu_and(ifragZ, mask),
+                       spu_andc(ifbZ, mask));
+      }
+
+      if (spu_extract(spu_orx(mask), 0)) {
+         /* put new fragment Z/Stencil values back into Z/Stencil tile */
+         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS);
+
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
    }
 
-
-   default:
-      assert(0);
-      break;
+   if (spu.blend.blend_enable) {
+      /* blending terms, misc regs */
+      vector float term1r, term1g, term1b, term1a;
+      vector float term2r, term2g, term2b, term2a;
+      vector float one, tmp;
+
+      vector float fbRGBA[4];  /* current framebuffer colors */
+
+      /* get colors from framebuffer/tile */
+      {
+         vector float fc[4];
+         uint c0, c1, c2, c3;
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+         c0 = colorTile->ui[y][x*2+0];
+         c1 = colorTile->ui[y][x*2+1];
+         c2 = colorTile->ui[y][x*2+2];
+         c3 = colorTile->ui[y][x*2+3];
+#else
+         c0 = colorTile->ui[y+0][x+0];
+         c1 = colorTile->ui[y+0][x+1];
+         c2 = colorTile->ui[y+1][x+0];
+         c3 = colorTile->ui[y+1][x+1];
+#endif
+         switch (spu.fb.color_format) {
+         case PIPE_FORMAT_B8G8R8A8_UNORM:
+            fc[0] = spu_unpack_B8G8R8A8(c0);
+            fc[1] = spu_unpack_B8G8R8A8(c1);
+            fc[2] = spu_unpack_B8G8R8A8(c2);
+            fc[3] = spu_unpack_B8G8R8A8(c3);
+            break;
+         case PIPE_FORMAT_A8R8G8B8_UNORM:
+            fc[0] = spu_unpack_A8R8G8B8(c0);
+            fc[1] = spu_unpack_A8R8G8B8(c1);
+            fc[2] = spu_unpack_A8R8G8B8(c2);
+            fc[3] = spu_unpack_A8R8G8B8(c3);
+            break;
+         default:
+            ASSERT(0);
+         }
+         _transpose_matrix4x4(fbRGBA, fc);
+      }
+
+      /*
+       * Compute Src RGB terms
+       */
+      switch (spu.blend.rgb_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1r = fragR;
+         term1g = fragG;
+         term1b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term1r =
+         term1g =
+         term1b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1r = spu_mul(fragR, fragR);
+         term1g = spu_mul(fragG, fragG);
+         term1b = spu_mul(fragB, fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1r = spu_mul(fragR, fragA);
+         term1g = spu_mul(fragG, fragA);
+         term1b = spu_mul(fragB, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Src Alpha term
+       */
+      switch (spu.blend.alpha_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1a = spu_mul(fragA, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest RGB terms
+       */
+      switch (spu.blend.rgb_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2r = fragR;
+         term2g = fragG;
+         term2b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term2r =
+         term2g =
+         term2b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2r = spu_mul(fbRGBA[0], fragR);
+         term2g = spu_mul(fbRGBA[1], fragG);
+         term2b = spu_mul(fbRGBA[2], fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fragA);
+         term2g = spu_mul(fbRGBA[1], fragA);
+         term2b = spu_mul(fbRGBA[2], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2r = spu_mul(fbRGBA[0], tmp);
+         term2g = spu_mul(fbRGBA[1], tmp);
+         term2b = spu_mul(fbRGBA[2], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest Alpha term
+       */
+      switch (spu.blend.alpha_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2a = spu_mul(fbRGBA[3], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest RGB terms
+       */
+      switch (spu.blend.rgb_func) {
+      case PIPE_BLEND_ADD:
+         fragR = spu_add(term1r, term2r);
+         fragG = spu_add(term1g, term2g);
+         fragB = spu_add(term1b, term2b);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragR = spu_sub(term1r, term2r);
+         fragG = spu_sub(term1g, term2g);
+         fragB = spu_sub(term1b, term2b);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest A term
+       */
+      switch (spu.blend.alpha_func) {
+      case PIPE_BLEND_ADD:
+         fragA = spu_add(term1a, term2a);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragA = spu_sub(term1a, term2a);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
    }
-}
-
-
-static void
-write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
-              enum pipe_format depth_format,
-              qword depth, qword stencil)
-{
-   const int ix = x / 2;
-   const int iy = y / 2;
-
-   (void) stencil;
 
-   switch (depth_format) {
-   case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
 
-      qword sv = ((ix & 0x01) == 0) 
-          ? (qword) { 2, 3, 6, 7, 10, 11, 14, 15,
-                      24, 25, 26, 27, 28, 29, 30, 31 }
-          : (qword) { 16, 17, 18, 19, 20 , 21, 22, 23,
-                      2, 3, 6, 7, 10, 11, 14, 15 };
-      *ptr = si_shufb(depth, *ptr, sv);
-      break;
+   /*
+    * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA.
+    */
+#if 0
+   /* original code */
+   {
+      vector float frag_soa[4];
+      frag_soa[0] = fragR;
+      frag_soa[1] = fragG;
+      frag_soa[2] = fragB;
+      frag_soa[3] = fragA;
+      _transpose_matrix4x4(frag_aos, frag_soa);
    }
-
-
-   case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      *ptr = depth;
+#else
+   /* short-cut relying on function parameter layout: */
+   _transpose_matrix4x4(frag_aos, &fragR);
+   (void) fragG;
+   (void) fragB;
+#endif
+
+   /*
+    * Pack float colors into 32-bit RGBA words.
+    */
+   switch (spu.fb.color_format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      c0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      c1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      c2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      c3 = spu_pack_A8R8G8B8(frag_aos[3]);
       break;
-   }
-
 
-   case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      qword mask = si_fsmbi(0xEEEE);
-
-      depth = si_shli(depth, 8);
-      *ptr = si_selb(stencil, depth, mask);
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      c0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      c1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      c2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      c3 = spu_pack_B8G8R8A8(frag_aos[3]);
       break;
+   default:
+      fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
+      ASSERT(0);
    }
 
 
-   case PIPE_FORMAT_S8Z24_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      qword mask = si_fsmbi(0x7777);
-
-      stencil = si_shli(stencil, 24);
-      *ptr = si_selb(stencil, depth, mask);
-      break;
+   /*
+    * Color masking
+    */
+   if (spu.blend.colormask != 0xf) {
+      /* XXX to do */
+      /* apply color mask to 32-bit packed colors */
    }
 
 
-   default:
-      assert(0);
-      break;
+   /*
+    * Logic Ops
+    */
+   if (spu.blend.logicop_enable) {
+      /* XXX to do */
+      /* apply logicop to 32-bit packed colors */
    }
-}
 
 
-qword
-spu_do_depth_stencil(int x, int y,
-                     qword frag_mask, qword frag_depth, qword frag_alpha,
-                     qword facing)
-{
-   struct spu_frag_test_results  result;
-   qword pixel_depth;
-   qword pixel_stencil;
-
-   /* All of this preable code (everthing before the call to frag_test) should
-    * be generated on the PPU and upload to the SPU.
+   /*
+    * If mask is non-zero, mark tile as dirty.
     */
-   if (spu.read_depth || spu.read_stencil) {
-      read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
-                   &pixel_depth, &pixel_stencil);
+   if (spu_extract(spu_orx(mask), 0)) {
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
    }
-   
-   switch (spu.fb.depth_format) {
-   case PIPE_FORMAT_Z16_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z32_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_S8Z24_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   default:
-      ASSERT(0);
-      break;
+   else {
+      return;
    }
 
-   result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
-                             frag_depth, frag_alpha, facing);
-
 
-   /* This code (everthing after the call to frag_test) should
-    * be generated on the PPU and upload to the SPU.
+   /*
+    * Write new quad colors to the framebuffer/tile.
+    * Only write pixels where the corresponding mask word is set.
     */
-   if (spu.read_depth || spu.read_stencil) {
-      write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
-                    result.depth, result.stencil);
-   }
-
-   return result.mask;
+#if LINEAR_QUAD_LAYOUT
+   /*
+    * Quad layout:
+    *  +--+--+--+--+
+    *  |p0|p1|p2|p3|
+    *  +--+--+--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y][x*2] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y][x*2+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y][x*2+2] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y][x*2+3] = c3;
+#else
+   /*
+    * Quad layout:
+    *  +--+--+
+    *  |p0|p1|
+    *  +--+--+
+    *  |p2|p3|
+    *  +--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y+0][x+0] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y+0][x+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y+1][x+0] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y+1][x+1] = c3;
+#endif
 }
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index 6571258699..f817abf046 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -1,32 +1,44 @@
-/*
- * (C) Copyright IBM Corporation 2008
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 #ifndef SPU_PER_FRAGMENT_OP
 #define SPU_PER_FRAGMENT_OP
 
-extern qword
-spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth,
-		     qword frag_alpha, qword facing);
+
+extern void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask);
+
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 5051774f00..117b8a36f8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -97,7 +97,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
    const qword offset_y = si_andi((qword) y, 0x1f);
 
    const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
-   const qword tile_size = (qword) spu_splats(sizeof(tile_t));
+   const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
    tile_offset = si_mpy((qword) tile_offset, tile_size);
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
index 12dc246328..216a33126b 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.c
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -31,6 +31,9 @@
 #include "spu_main.h"
 
 
+/**
+ * Get tile of color or Z values from main memory, put into SPU memory.
+ */
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
 {
@@ -56,6 +59,9 @@ get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
 }
 
 
+/**
+ * Move tile of color or Z values from SPU memory to main memory.
+ */
 void
 put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
 {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 2a4e0b423c..f02cdd1f76 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -38,7 +38,6 @@
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
-#include "spu_per_fragment_op.h"
 
 
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
@@ -209,7 +208,7 @@ clip_emit_quad(struct setup_stage *setup)
 /**
  * Evaluate attribute coefficients (plane equations) to compute
  * attribute values for the four fragments in a quad.
- * Eg: four colors will be compute.
+ * Eg: four colors will be computed (in AoS format).
  */
 static INLINE void
 eval_coeff(uint slot, float x, float y, vector float result[4])
@@ -255,31 +254,6 @@ eval_z(float x, float y)
 }
 
 
-static INLINE mask_t
-do_depth_test(int x, int y, mask_t quadmask)
-{
-   float4 zvals;
-   mask_t mask;
-
-   if (spu.fb.depth_format == PIPE_FORMAT_NONE)
-      return quadmask;
-
-   zvals.v = eval_z((float) x, (float) y);
-
-   mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx,
-					y - setup.cliprect_miny,
-					(qword) quadmask, 
-					(qword) zvals.v,
-					(qword) spu_splats((unsigned char) 0x0ffu),
-					(qword) spu_splats((unsigned int) 0x01u));
-
-   if (spu_extract(spu_orx(mask), 0))
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
-
-   return mask;
-}
-
-
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -289,18 +263,6 @@ do_depth_test(int x, int y, mask_t quadmask)
 static INLINE void
 emit_quad( int x, int y, mask_t mask )
 {
-#if 0
-   struct softpipe_context *sp = setup.softpipe;
-   setup.quad.x0 = x;
-   setup.quad.y0 = y;
-   setup.quad.mask = mask;
-   sp->quad.first->run(sp->quad.first, &setup.quad);
-#else
-
-   if (spu.read_depth) {
-      mask = do_depth_test(x, y, mask);
-   }
-
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
@@ -308,6 +270,7 @@ emit_quad( int x, int y, mask_t mask )
       vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture[0].start) {
          /* texture mapping */
@@ -355,55 +318,29 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
-      /* Convert fragment data from AoS to SoA format.
-       */
-      qword soa_frag[4];
-      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+      {
+         /* Convert fragment data from AoS to SoA format.
+          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+          * This is temporary!
+          */
+         vector float soa_frag[4];
+         _transpose_matrix4x4(soa_frag, colors);
 
-      /* Read the current framebuffer values.
-       */
-      const qword pix[4] = {
-         (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]),
-         (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]),
-         (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]),
-         (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]),
-      };
+         float4 fragZ;
 
-      qword soa_pix[4];
+         fragZ.v = eval_z((float) x, (float) y);
 
-      if (spu.read_fb) {
-         /* Convert pixel data from AoS to SoA format.
+         /* Do all per-fragment/quad operations here, including:
+          *  alpha test, z test, stencil test, blend and framebuffer writing.
           */
-         vec_float4 aos_pix[4] = {
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
-         };
-
-         _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
+         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                          fragZ.v,
+                          soa_frag[0], soa_frag[1],
+                          soa_frag[2], soa_frag[3],
+                          mask);
       }
 
-
-      struct spu_blend_results result =
-          (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
-                       soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
-                       spu.const_blend_color[0], spu.const_blend_color[1],
-                       spu.const_blend_color[2], spu.const_blend_color[3]);
-
-
-      /* Convert final pixel data from SoA to AoS format.
-       */
-      result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
-                              result.r, result.g, result.b, result.a,
-                              (qword) mask);
-
-      spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0);
-      spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
-      spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
-      spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
    }
-#endif
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index b25ca4eafc..b8a0d4a265 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -1,4 +1,5 @@
 
+#include "cell/common.h"
 #include "pipe/p_shader_tokens.h"
 #include "pipe/p_debug.h"
 #include "tgsi/tgsi_parse.h"
@@ -20,7 +21,7 @@ tgsi_util_get_src_register_swizzle(
    case 3:
       return reg->SwizzleW;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
    return 0;
 }
@@ -40,7 +41,7 @@ tgsi_util_get_src_register_extswizzle(
    case 3:
       return reg->ExtSwizzleW;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
    return 0;
 }
@@ -60,12 +61,12 @@ tgsi_util_get_full_src_register_extswizzle(
       &reg->SrcRegisterExtSwz,
       component );
 
-   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
-   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
-   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
-   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+   ASSERT (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   ASSERT (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   ASSERT (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   ASSERT (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   ASSERT (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   ASSERT (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
 
    /*
     * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
@@ -95,7 +96,7 @@ tgsi_util_get_src_register_extnegate(
    case 3:
       return reg->NegateW;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
    return 0;
 }
@@ -120,7 +121,7 @@ tgsi_util_set_src_register_extnegate(
       reg->NegateW = negate;
       break;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 26f2363749..03375d84a5 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -92,7 +92,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
 
-   assert(count <= 4);
+   ASSERT(count <= 4);
 
 #if DRAW_DBG
    printf("SPU: %s count = %u, nr_attrs = %u\n", 
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index f81d19fea1..fbe5b34d39 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -112,7 +112,7 @@ run_vertex_program(struct spu_vs_context *draw,
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
-   assert(count <= 4);
+   ASSERT(count <= 4);
 
    machine->Processor = TGSI_PROCESSOR_VERTEX;