From bb5becf1e289b2c9240d98299e9447a9673da9fc Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 5 Sep 2008 13:54:14 -0600
Subject: gallium: comments, assertions, etc

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 36 +++++++++++++++++++++++++----
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 20 +++++++++-------
 2 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 285ddc0e3f..fe5beba456 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -300,7 +300,9 @@ void _name (struct spe_function *p, int imm) \
 #include "rtasm_ppc_spe.h"
 
 
-/*
+/**
+ * Initialize an spe_function.
+ * \param code_size  size of instruction buffer to allocate, in bytes.
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
@@ -324,10 +326,14 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+/**
+ * Allocate an SPE register.
+ * \return register index or -1 if none left.
+ */
 int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
-   for (i = 0; i < 128; i++) {
+   for (i = 0; i < SPE_NUM_REGS; i++) {
       const uint64_t mask = (1ULL << (i % 64));
       const unsigned idx = i / 64;
 
@@ -341,11 +347,15 @@ int spe_allocate_available_register(struct spe_function *p)
 }
 
 
+/**
+ * Mark the given SPE register as "allocated".
+ */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) != 0);
 
    p->regs[idx] &= ~(1ULL << bit);
@@ -353,57 +363,73 @@ int spe_allocate_register(struct spe_function *p, int reg)
 }
 
 
+/**
+ * Mark the given SPE register as "unallocated".
+ */
 void spe_release_register(struct spe_function *p, int reg)
 {
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) == 0);
 
    p->regs[idx] |= (1ULL << bit);
 }
 
 
+/**
+ * For branch instructions:
+ * \param d  if 1, disable interrupts if branch is taken
+ * \param e  if 1, enable interrupts if branch is taken
+ * If d and e are both zero, don't change interrupt status (right?)
+ */
 
-
+/** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
 }
 
+/** Interrupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect and set link if external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
     emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
     emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
 }
 
-void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d,
-		int e)
+/** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
+void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 1cacc717b1..7dd754ba77 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -32,13 +32,17 @@
 #ifndef RTASM_PPC_SPE_H
 #define RTASM_PPC_SPE_H
 
-struct spe_function {
-    /**
-     *
-     */
-    uint32_t *store;
-    uint32_t *csr;
-    const char *fn;
+/** 4 bytes per instruction */
+#define SPE_INST_SIZE 4
+
+/** number of general-purpose SIMD registers */
+#define SPE_NUM_REGS  128
+
+struct spe_function
+{
+    uint32_t *store;  /**< instruction buffer */
+    uint32_t *csr;    /**< next free pos in instruction buffer */
+    const char *fn;   /**< unused */
 
     /**
      * Mask of used / unused registers
@@ -50,7 +54,7 @@ struct spe_function {
      * spe_allocate_register, spe_allocate_available_register,
      * spe_release_register
      */
-    uint64_t regs[2];
+    uint64_t regs[SPE_NUM_REGS / 64];
 };
 
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
-- 
cgit v1.2.3


From 0e79e474de164a765b9759398c83b6bfa16a0012 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 5 Sep 2008 13:55:02 -0600
Subject: cell: comments, etc.

---
 .../drivers/cell/ppu/cell_state_per_fragment.c     | 28 ++++++++++----
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c   |  5 +--
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 44 +++++++++++++++-------
 3 files changed, 52 insertions(+), 25 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index 53ae3aa50e..705867107b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -132,9 +132,9 @@ emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
 
 
 /**
+ * Generate code to perform Z testing.  Four Z values are tested at once.
  * \param dsa        Current depth-test state
  * \param f          Function to which code should be appended
- * \param m          Mask of allocated / free SPE registers
  * \param mask       Index of register to contain depth-pass mask
  * \param stored     Index of register containing values from depth buffer
  * \param calculated Index of register containing per-fragment depth values
@@ -198,6 +198,7 @@ emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
 
 
 /**
+ * Generate code to apply the stencil operation (after testing).
  * \note Emits a maximum of 5 instructions.
  *
  * \warning
@@ -222,9 +223,13 @@ emit_stencil_op(struct spe_function *f,
       spe_il(f, result, ref);
       break;
    case PIPE_STENCIL_OP_INCR:
+      /* clamp = [0xff, 0xff, 0xff, 0xff] */
       spe_il(f, clamp, 0x0ff);
+      /* result[i] = in[i] + 1 */
       spe_ai(f, result, in, 1);
+      /* clamp_mask[i] = (result[i] > 0xff) */
       spe_clgti(f, clamp_mask, result, 0x0ff);
+      /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */
       spe_selb(f, result, result, clamp, clamp_mask);
       break;
    case PIPE_STENCIL_OP_DECR:
@@ -259,10 +264,10 @@ emit_stencil_op(struct spe_function *f,
 
 
 /**
+ * Generate code to do stencil test.  Four pixels are tested at once.
  * \param dsa        Depth / stencil test state
  * \param face       0 for front face, 1 for back face
  * \param f          Function to append instructions to
- * \param reg_mask   Mask of allocated registers
  * \param mask       Register containing mask of fragments passing the
  *                   alpha test
  * \param depth_mask Register containing mask of fragments passing the
@@ -310,13 +315,14 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
 
    switch (dsa->stencil[face].func) {
    case PIPE_FUNC_NEVER:
-      spe_il(f, stencil_mask, 0);
+      spe_il(f, stencil_mask, 0);   /* stencil_mask[0..3] = [0,0,0,0] */
       break;
 
    case PIPE_FUNC_NOTEQUAL:
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_EQUAL:
+      /* stencil_mask[i] = (stored[i] == ref) */
       spe_ceqi(f, stencil_mask, stored, ref);
       break;
 
@@ -324,6 +330,8 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_GREATER:
+      complement = TRUE;
+      /* stencil_mask[i] = (stored[i] > ref) */
       spe_clgti(f, stencil_mask, stored, ref);
       break;
 
@@ -331,8 +339,11 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_GEQUAL:
+      /* stencil_mask[i] = (stored[i] > ref) */
       spe_clgti(f, stencil_mask, stored, ref);
+      /* tmp[i] = (stored[i] == ref) */
       spe_ceqi(f, tmp, stored, ref);
+      /* stencil_mask[i] = stencil_mask[i] | tmp[i] */
       spe_or(f, stencil_mask, stencil_mask, tmp);
       break;
 
@@ -461,7 +472,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
     * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round
     * up to 64 to make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    /* Allocate registers for the function's input parameters.  Cleverly (and
@@ -540,7 +551,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
          spe_selb(f, depth, depth, zvals, mask);
    }
 
-   spe_bi(f, 0, 0, 0);
+   spe_bi(f, 0, 0, 0);  /* return from function call */
 
 
 #if 0
@@ -956,7 +967,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
     * + 4 (fragment mask) + 1 (return) = 55 instlructions.  Round up to 64 to
     * make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    const int frag[4] = {
@@ -1144,7 +1155,8 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
 }
 
 
-int PC_OFFSET(const struct spe_function *f, const void *d)
+static int
+PC_OFFSET(const struct spe_function *f, const void *d)
 {
    const intptr_t pc = (intptr_t) f->csr;
    const intptr_t ea = ~0x0f & (intptr_t) d;
@@ -1178,7 +1190,7 @@ cell_generate_logic_op(struct spe_function *f,
     * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
     * to 64 to make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    /* Pixel colors in framebuffer format in AoS layout.
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 2ece0250f6..566df7f59e 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -297,10 +297,9 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 
 
    /* Each fetch function can be a maximum of 34 instructions (note: this is
-    * actually a slight over-estimate).  That means (34 * 4) = 136 bytes
-    * each maximum.
+    * actually a slight over-estimate).
     */
-   spe_init_func(p, 136 * unique_attr_formats);
+   spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats);
 
 
    /* Allocate registers for the function's input parameters.
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index c0a729b3d2..db88735226 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -35,8 +35,17 @@
 
 #define ZERO 0x80
 
+
+/**
+ * Get a "quad" of four fragment Z/stencil values from the given tile.
+ * \param tile  the tile of Z/stencil values
+ * \param x, y  location of the quad in the tile, in pixels
+ * \param depth_format  format of the tile's data
+ * \param depth  returns four depth values
+ * \param stencil  returns four stencil values
+ */
 static void
-read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
+read_ds_quad(tile_t *tile, unsigned x, unsigned y,
              enum pipe_format depth_format, qword *depth,
              qword *stencil)
 {
@@ -45,14 +54,13 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 
    switch (depth_format) {
    case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
+      qword *ptr = (qword *) &tile->us8[iy][ix / 2];
 
       const qword shuf_vec = (qword) {
          ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
          ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7
       };
 
-
       /* At even X values we want the first 4 shorts, and at odd X values we
        * want the second 4 shorts.
        */
@@ -65,18 +73,16 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword *ptr = (qword *) &tile->ui4[iy][ix];
 
       *depth = *ptr;
       *stencil = si_il(0);
       break;
    }
-      
 
    case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword *ptr = (qword *) &tile->ui4[iy][ix];
       qword mask = si_fsmbi(0xEEEE);
 
       *depth = si_rotmai(si_and(*ptr, mask), -8);
@@ -84,16 +90,14 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    case PIPE_FORMAT_S8Z24_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword *ptr = (qword *) &tile->ui4[iy][ix];
 
       *depth = si_and(*ptr, si_fsmbi(0x7777));
       *stencil = si_andi(si_roti(*ptr, 8), 0x0ff);
       break;
    }
 
-
    default:
       ASSERT(0);
       break;
@@ -101,6 +105,14 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 }
 
 
+/**
+ * Put a quad of Z/stencil values into a tile.
+ * \param buffer  the tile of Z/stencil values to write into
+ * \param x, y  location of the quad in the tile, in pixels
+ * \param depth_format  format of the tile's data
+ * \param depth  depth values to store
+ * \param stencil  stencil values to store
+ */
 static void
 write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
               enum pipe_format depth_format,
@@ -124,14 +136,12 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    case PIPE_FORMAT_Z32_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       *ptr = depth;
       break;
    }
 
-
    case PIPE_FORMAT_Z24S8_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       qword mask = si_fsmbi(0xEEEE);
@@ -141,7 +151,6 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    case PIPE_FORMAT_S8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       qword mask = si_fsmbi(0x7777);
@@ -151,7 +160,6 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    default:
       ASSERT(0);
       break;
@@ -159,6 +167,14 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 }
 
 
+/**
+ * Do depth/stencil/alpha test for a "quad" of 4 fragments.
+ * \param x,y  location of quad within tile
+ * \param frag_mask  indicates which fragments are "alive"
+ * \param frag_depth  four fragment depth values
+ * \param frag_alpha  four fragment alpha values
+ * \param facing  front/back facing for four fragments (1=front, 0=back)
+ */
 qword
 spu_do_depth_stencil(int x, int y,
                      qword frag_mask, qword frag_depth, qword frag_alpha,
-- 
cgit v1.2.3


From cd9722dcddcb41af3196860280d23542dc673700 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 8 Sep 2008 11:50:13 -0600
Subject: cell: comments

---
 src/gallium/drivers/cell/spu/spu_tri.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 2a4e0b423c..a3ea0a3e69 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -209,7 +209,7 @@ clip_emit_quad(struct setup_stage *setup)
 /**
  * Evaluate attribute coefficients (plane equations) to compute
  * attribute values for the four fragments in a quad.
- * Eg: four colors will be compute.
+ * Eg: four colors will be computed (in AoS format).
  */
 static INLINE void
 eval_coeff(uint slot, float x, float y, vector float result[4])
@@ -356,6 +356,7 @@ emit_quad( int x, int y, mask_t mask )
 
 
       /* Convert fragment data from AoS to SoA format.
+       * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
        */
       qword soa_frag[4];
       _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
@@ -373,6 +374,7 @@ emit_quad( int x, int y, mask_t mask )
 
       if (spu.read_fb) {
          /* Convert pixel data from AoS to SoA format.
+          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
           */
          vec_float4 aos_pix[4] = {
             spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
@@ -393,6 +395,7 @@ emit_quad( int x, int y, mask_t mask )
 
 
       /* Convert final pixel data from SoA to AoS format.
+       * I.e. (RRRR,GGGG,BBBB,AAAA) -> (RGBA,RGBA,RGBA,RGBA)
        */
       result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
                               result.r, result.g, result.b, result.a,
-- 
cgit v1.2.3


From 04ae4fba3c0a656cf2747fc994b99f99576d0e2b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 8 Sep 2008 11:53:14 -0600
Subject: cell: minor change to Z float/int conversion code (avoid switch)

---
 src/gallium/drivers/cell/spu/spu_main.c            |  5 ++++
 src/gallium/drivers/cell/spu/spu_main.h            |  5 ++++
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 34 +++++++++-------------
 3 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index d223f32d94..c4236817a9 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -252,12 +252,17 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 
    switch (spu.fb.depth_format) {
    case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_S8Z24_UNORM:
       spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
       break;
    case PIPE_FORMAT_Z16_UNORM:
       spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
       break;
    default:
       spu.fb.zsize = 0;
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 4879f8c9c8..c2a53c9dcf 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,10 @@
 #define MAX_HEIGHT 1024
 
 
+/**
+ * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
+ * The data may be addressed through several different types.
+ */
 typedef union {
    ushort us[TILE_SIZE][TILE_SIZE];
    uint   ui[TILE_SIZE][TILE_SIZE];
@@ -99,6 +103,7 @@ struct spu_framebuffer {
    uint depth_clear_value;
 
    uint zsize;                     /**< 0, 2 or 4 bytes per Z */
+   float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index db88735226..29dc07a2e8 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -144,18 +144,22 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 
    case PIPE_FORMAT_Z24S8_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      /* form select mask = 1110,1110,1110,1110 */
       qword mask = si_fsmbi(0xEEEE);
-
+      /* depth[i] = depth[i] << 8 */
       depth = si_shli(depth, 8);
+      /* *ptr[i] = depth[i][31:8] | stencil[i][7:0] */
       *ptr = si_selb(stencil, depth, mask);
       break;
    }
 
    case PIPE_FORMAT_S8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      /* form select mask = 0111,0111,0111,0111 */
       qword mask = si_fsmbi(0x7777);
-
+      /* stencil[i] = stencil[i] << 24 */
       stencil = si_shli(stencil, 24);
+      /* *ptr[i] = stencil[i][31:24] | depth[i][23:0] */
       *ptr = si_selb(stencil, depth, mask);
       break;
    }
@@ -191,25 +195,13 @@ spu_do_depth_stencil(int x, int y,
       read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
                    &pixel_depth, &pixel_stencil);
    }
-   
-   switch (spu.fb.depth_format) {
-   case PIPE_FORMAT_Z16_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z32_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_S8Z24_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   default:
-      ASSERT(0);
-      break;
-   }
+
+   /* convert floating point Z values to 32-bit uint */
+
+   /* frag_depth *= spu.fb.zscale */
+   frag_depth = si_fm(frag_depth, (qword)spu_splats(spu.fb.zscale));
+   /* frag_depth = uint(frag_depth) */
+   frag_depth = si_cfltu(frag_depth, 0);
 
    result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
                              frag_depth, frag_alpha, facing);
-- 
cgit v1.2.3


From ee582fd3a7a9ddbcb5595249201cf213a6c6f014 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 10 Sep 2008 17:11:48 -0600
Subject: gallium: assorted additions and fixes to Cell SPE rtasm code

Fix incorrect opcode for fsmbi.
Added "macro" functions for loading floats/ints, register complement, zero, move.
Added #defines for return address and stack pointer registers.
Added assertions to check that the instruction buffer doesn't overflow.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 88 +++++++++++++++++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 38 +++++++++++--
 2 files changed, 105 insertions(+), 21 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index fe5beba456..61010e4333 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -151,8 +151,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -165,8 +165,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rC = rC;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -178,8 +178,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i7 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -192,8 +192,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i8 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -206,8 +206,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i10 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -218,8 +218,8 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i16 = imm;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -230,8 +230,8 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i18 = imm;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -307,8 +307,9 @@ void _name (struct spe_function *p, int imm) \
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
     p->store = align_malloc(code_size, 16);
-    p->csr = p->store;
-    
+    p->num_inst = 0;
+    p->max_inst = code_size / SPE_INST_SIZE;
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
     p->regs[0] = ~7;
@@ -318,11 +319,11 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
 
 void spe_release_func(struct spe_function *p)
 {
+    assert(p->num_inst <= p->max_inst);
     if (p->store != NULL) {
         align_free(p->store);
     }
     p->store = NULL;
-    p->csr = NULL;
 }
 
 
@@ -337,6 +338,7 @@ int spe_allocate_available_register(struct spe_function *p)
       const uint64_t mask = (1ULL << (i % 64));
       const unsigned idx = i / 64;
 
+      assert(idx < 2);
       if ((p->regs[idx] & mask) != 0) {
          p->regs[idx] &= ~mask;
          return i;
@@ -371,6 +373,8 @@ void spe_release_register(struct spe_function *p, int reg)
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(idx < 2);
+
    assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) == 0);
 
@@ -458,4 +462,54 @@ EMIT_R   (spe_mfspr, 0x00c);
 EMIT_R   (spe_mtspr, 0x10c);
 #endif
 
+
+/**
+ ** Helper / "macro" instructions.
+ ** Use somewhat verbose names as a reminder that these aren't native
+ ** SPE instructions.
+ **/
+
+
+void
+spe_load_float(struct spe_function *p, unsigned rT, float x)
+{
+   union {
+      float f;
+      unsigned u;
+   } bits;
+   bits.f = x;
+   spe_ilhu(p, rT, bits.u >> 16);
+   spe_iohl(p, rT, bits.u & 0xffff);
+}
+
+
+void
+spe_load_int(struct spe_function *p, unsigned rT, int i)
+{
+   spe_ilhu(p, rT, i >> 16);
+   spe_iohl(p, rT, i & 0xffff);
+}
+
+
+void
+spe_complement(struct spe_function *p, unsigned rT)
+{
+   spe_nor(p, rT, rT, rT);
+}
+
+
+void
+spe_move(struct spe_function *p, unsigned rT, unsigned rA)
+{
+   spe_ori(p, rT, rA, 0);
+}
+
+
+void
+spe_zero(struct spe_function *p, unsigned rT)
+{
+   spe_xor(p, rT, rT, rT);
+}
+
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7dd754ba77..dee8c55c4a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -25,6 +25,7 @@
 /**
  * \file
  * Real-time assembly generation interface for Cell B.E. SPEs.
+ * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <idr@us.ibm.com>
  */
@@ -38,11 +39,18 @@
 /** number of general-purpose SIMD registers */
 #define SPE_NUM_REGS  128
 
+/** Return Address register */
+#define SPE_REG_RA  0
+
+/** Stack Pointer register */
+#define SPE_REG_SP  1
+
+
 struct spe_function
 {
-    uint32_t *store;  /**< instruction buffer */
-    uint32_t *csr;    /**< next free pos in instruction buffer */
-    const char *fn;   /**< unused */
+   uint32_t *store;  /**< instruction buffer */
+   uint num_inst;
+   uint max_inst;
 
     /**
      * Mask of used / unused registers
@@ -123,7 +131,8 @@ EMIT_RI16(spe_ilhu,  0x082);
 EMIT_RI16(spe_il,    0x081);
 EMIT_RI18(spe_ila,   0x021);
 EMIT_RI16(spe_iohl,  0x0c1);
-EMIT_RI16(spe_fsmbi, 0x0c5);
+EMIT_RI16(spe_fsmbi, 0x065);
+
 
 
 /* Integer and logical instructions
@@ -275,6 +284,27 @@ extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA,
     int d, int e);
 
 
+/** Load/splat immediate float into rT. */
+extern void
+spe_load_float(struct spe_function *p, unsigned rT, float x);
+
+/** Load/splat immediate int into rT. */
+extern void
+spe_load_int(struct spe_function *p, unsigned rT, int i);
+
+/** Complement/invert all bits in rT. */
+extern void
+spe_complement(struct spe_function *p, unsigned rT);
+
+/** rT = rA. */
+extern void
+spe_move(struct spe_function *p, unsigned rT, unsigned rA);
+
+/** rT = {0,0,0,0}. */
+extern void
+spe_zero(struct spe_function *p, unsigned rT);
+
+
 /* Floating-point instructions
  */
 EMIT_RR  (spe_fa,         0x2c4);
-- 
cgit v1.2.3


From 284ab5a6127f8b452acaa0e10ac1d9ebc87fac3e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 10 Sep 2008 18:22:00 -0600
Subject: cell: checkpoint commit of new per-fragment processing

Do code generation for alpha test, z test, stencil, blend, colormask
and framebuffer/tile read/write as a single code block.
Ian's previous blend/z/stencil test code is still there but mostly disabled
and will be removed soon.
---
 src/gallium/drivers/cell/common.h                  |  20 +-
 src/gallium/drivers/cell/ppu/Makefile              |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 530 +++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_gen_fragment.h   |  38 ++
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  31 +-
 .../drivers/cell/ppu/cell_state_per_fragment.c     |   2 +-
 src/gallium/drivers/cell/spu/Makefile              |   2 +-
 src/gallium/drivers/cell/spu/spu_main.c            |  53 ++-
 src/gallium/drivers/cell/spu/spu_main.h            |  23 +
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 231 ++++++++-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |  11 +
 src/gallium/drivers/cell/spu/spu_tri.c             |  30 ++
 src/gallium/winsys/xlib/xm_api.c                   |   7 +-
 src/gallium/winsys/xlib/xm_winsys.c                |  35 ++
 14 files changed, 998 insertions(+), 16 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.h

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c0ca201e1d..a62530c64d 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -97,6 +97,7 @@
 #define CELL_CMD_STATE_LOGICOP       21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
+#define CELL_CMD_STATE_FRAGMENT_OPS  24
 
 
 #define CELL_NUM_BUFFERS 4
@@ -112,30 +113,43 @@
 
 /**
  */
-struct cell_command_depth_stencil_alpha_test {
+struct cell_command_depth_stencil_alpha_test
+{
    uint64_t base;               /**< Effective address of code start. */
    unsigned size;               /**< Size in bytes of SPE code. */
    unsigned read_depth;         /**< Flag: should depth be read? */
    unsigned read_stencil;       /**< Flag: should stencil be read? */
+   struct pipe_depth_stencil_alpha_state state;
 };
 
 
 /**
  * Upload code to perform framebuffer blend operation
  */
-struct cell_command_blend {
+struct cell_command_blend
+{
    uint64_t base;               /**< Effective address of code start. */
    unsigned size;               /**< Size in bytes of SPE code. */
    unsigned read_fb;            /**< Flag: should framebuffer be read? */
 };
 
 
-struct cell_command_logicop {
+struct cell_command_logicop
+{
    uint64_t base;               /**< Effective address of code start. */
    unsigned size;               /**< Size in bytes of SPE code. */
 };
 
 
+#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+
+struct cell_command_fragment_ops
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 25473e200c..b5a6fcb8de 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -25,6 +25,7 @@ SOURCES = \
 	cell_context.c \
 	cell_draw_arrays.c \
 	cell_flush.c \
+	cell_gen_fragment.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
 	cell_state_per_fragment.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
new file mode 100644
index 0000000000..df29476be6
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -0,0 +1,530 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU per-fragment code (actually per-quad code).
+ * \author Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "cell_context.h"
+#include "cell_gen_fragment.h"
+
+
+
+/** Do extra optimizations? */
+#define OPTIMIZATIONS 1
+
+
+/**
+ * Generate SPE code to perform Z/depth testing.
+ *
+ * \param dsa         Gallium depth/stencil/alpha state to gen code for
+ * \param f           SPE function to append instructions onto.
+ * \param mask_reg    register containing quad/pixel "alive" mask (in/out)
+ * \param ifragZ_reg  register containing integer fragment Z values (in)
+ * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
+ * \param zmask_reg   raw comparison result (out); for NOTEQUAL/LEQUAL/GEQUAL it is the complement of "pass"
+ */
+static void
+gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f,
+               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
+{
+   ASSERT(dsa->depth.enabled);
+
+   switch (dsa->depth.func) {
+   case PIPE_FUNC_EQUAL:
+      /* zmask = (ifragZ == ifbZ) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* zmask = (ifragZ == ifbZ) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* zmask = (ifragZ > ifbZ) */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* zmask = (ifbZ > ifragZ) */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* zmask = (ifragZ > ifbZ), i.e. the fragments that FAIL the test */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* zmask = (ifbZ > ifragZ), i.e. the fragments that FAIL the test */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */
+      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* mask unchanged */
+      spe_il(f, zmask_reg, ~0);  /* zmask = {~0,~0,~0,~0} */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+   if (dsa->depth.writemask) {
+      /*
+       * If (pixel alive and ztest passed) {   -- i.e. mask_reg bit set
+       *    framebufferZ = fragmentZ;
+       * }
+       * OR,
+       * framebufferZ = (mask ? fragmentZ : framebufferZ);
+       */
+      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+   }
+}
+
+
+/**
+ * Generate SPE code to perform alpha testing.
+ * When OPTIMIZATIONS is set, also emits an early return if the mask ends up all zero.
+ * \param dsa        Gallium depth/stencil/alpha state to gen code for
+ * \param f          SPE function to append instructions onto.
+ * \param mask_reg   register containing quad/pixel "alive" mask (in/out)
+ * \param fragA_reg  register containing four fragment alpha values (in)
+ */
+static void
+gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f, int mask_reg, int fragA_reg)
+{
+   int ref_reg = spe_allocate_available_register(f);
+   int amask_reg = spe_allocate_available_register(f);
+
+   ASSERT(dsa->alpha.enabled);
+
+   if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
+       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      /* load/splat the alpha reference float value (NEVER/ALWAYS don't compare) */
+      spe_load_float(f, ref_reg, dsa->alpha.ref);
+   }
+
+   /* emit code to do the alpha comparison, updating 'mask' */
+   switch (dsa->alpha.func) {
+   case PIPE_FUNC_EQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* amask = (fragA > ref), i.e. the fragments that FAIL the test */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* amask = (ref > fragA), i.e. the fragments that FAIL the test */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = [0,0,0,0] */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* no-op, mask unchanged */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+#if OPTIMIZATIONS
+   /* if mask == {0,0,0,0} we're all done, return */
+   {
+      /* re-use amask reg here; its comparison result is no longer needed */
+      int tmp_reg = amask_reg;
+      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
+      spe_orx(f, tmp_reg, mask_reg);
+      /* if tmp[0] == 0 then return from function call */
+      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
+   }
+#endif
+
+   spe_release_register(f, ref_reg);
+   spe_release_register(f, amask_reg);
+}
+
+
+
+/**
+ * Generate SPE code to implement the fragment operations (alpha test,
+ * depth test, stencil test, blending, colormask, and final
+ * framebuffer write) as specified by the current context state.
+ *
+ * Logically, this code will be called after running the fragment
+ * shader.  But under some circumstances we could run some of this
+ * code before the fragment shader to cull fragments/quads that are
+ * totally occluded/discarded.
+ *
+ * XXX only PIPE_FORMAT_S8Z24_UNORM / X8Z24_UNORM z/stencil buffers work right
+ * now; stencil testing and blending are still stubs.
+ * See the spu_fallback_fragment_ops() function to see how the per-fragment
+ * operations would be done with ordinary C code.
+ * The code we generate here though has almost no branches, is SIMD, etc and
+ * should be much faster.
+ *
+ * \param cell  the rendering context (in)
+ * \param f     the generated function (out); initialized here via spe_init_func()
+ */
+void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+{
+   const struct pipe_depth_stencil_alpha_state *dsa =
+      &cell->depth_stencil->base;
+   const struct pipe_blend_state *blend = &cell->blend->base;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   const int x_reg = 3;  /* uint */
+   const int y_reg = 4;  /* uint */
+   const int color_tile_reg = 5;  /* tile_t * */
+   const int depth_tile_reg = 6;  /* tile_t * */
+   const int fragZ_reg = 7;   /* vector float */
+   const int fragR_reg = 8;   /* vector float */
+   const int fragG_reg = 9;   /* vector float */
+   const int fragB_reg = 10;  /* vector float */
+   const int fragA_reg = 11;  /* vector float */
+   const int mask_reg = 12;   /* vector uint */
+
+   /* offset of quad from start of tile
+    * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
+    */
+   int quad_offset_reg;
+
+   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
+   int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, x_reg);
+   spe_allocate_register(f, y_reg);
+   spe_allocate_register(f, color_tile_reg);
+   spe_allocate_register(f, depth_tile_reg);
+   spe_allocate_register(f, fragZ_reg);
+   spe_allocate_register(f, fragR_reg);
+   spe_allocate_register(f, fragG_reg);
+   spe_allocate_register(f, fragB_reg);
+   spe_allocate_register(f, fragA_reg);
+   spe_allocate_register(f, mask_reg);
+
+   quad_offset_reg = spe_allocate_available_register(f);
+   fbRGBA_reg = spe_allocate_available_register(f);
+   fbZS_reg = spe_allocate_available_register(f);
+
+   /* compute offset of quad from start of tile, in bytes */
+   {
+      int x2_reg = spe_allocate_available_register(f);
+      int y2_reg = spe_allocate_available_register(f);
+
+      ASSERT(TILE_SIZE == 32);
+
+      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
+      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
+      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
+      spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */
+      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */
+
+      spe_release_register(f, x2_reg);
+      spe_release_register(f, y2_reg);
+   }
+
+
+   if (dsa->alpha.enabled) {
+      gen_alpha_test(dsa, f, mask_reg, fragA_reg);
+   }
+
+   if (dsa->depth.enabled || dsa->stencil[0].enabled) {
+      const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
+      boolean write_depth_stencil;
+
+      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
+      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+
+      /* fetch quad of depth/stencil values from tile at (x,y) */
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      if (dsa->depth.enabled) {
+         /* Extract Z bits from fbZS_reg into fbZ_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            int zbits_reg = spe_allocate_available_register(f); /* not the pixel mask */
+            spe_fsmbi(f, zbits_reg, 0x7777);  /* zbits[0,1,2,3] = 0x00ffffff */
+            spe_and(f, fbZ_reg, fbZS_reg, zbits_reg);  /* fbZ = fbZS & zbits */
+            spe_release_register(f, zbits_reg);
+            /* OK, fbZ_reg has four 24-bit Z values now */
+         }
+         else {
+            /* XXX handle other z/stencil formats */
+            ASSERT(0);
+         }
+
+         /* Convert fragZ values from float[4] to uint[4] */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* 24-bit Z values */
+            int scale_reg = spe_allocate_available_register(f);
+
+            /* scale_reg[0,1,2,3] = float(2^24-1) */
+            spe_load_float(f, scale_reg, (float) 0xffffff);
+
+            /* XXX these two instructions might be combined */
+            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);  /* fragZ = (int) fragZ */
+
+            spe_release_register(f, scale_reg);
+         }
+         else {
+            /* XXX handle 16-bit Z format */
+            ASSERT(0);
+         }
+      }
+
+      if (dsa->stencil[0].enabled) {
+         /* Extract Stencil bits from fbZS_reg into fbS_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            /* XXX extract with a shift */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* XXX extract with a mask */
+            ASSERT(0);
+         }
+      }
+
+
+      if (dsa->stencil[0].enabled) {
+         /* XXX this may involve depth testing too */
+         // gen_stencil_test(dsa, f, ... );
+         ASSERT(0);
+      }
+      else if (dsa->depth.enabled) {
+         int zmask_reg = spe_allocate_available_register(f);
+         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_release_register(f, zmask_reg);
+      }
+
+      /* do we need to write Z and/or Stencil back into framebuffer? */
+      write_depth_stencil = (dsa->depth.writemask |
+                             dsa->stencil[0].write_mask |
+                             dsa->stencil[1].write_mask);
+
+      if (write_depth_stencil) {
+         /* Merge latest Z and Stencil values into fbZS_reg.
+          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
+          * fbS_reg has four 8-bit stencil values in bits [7..0].
+          * NOTE(review): no code above fills fbS_reg yet (stencil is a stub). */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_S8_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else {
+            /* bad zs_format */
+            ASSERT(0);
+         }
+
+         /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
+         spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+      }
+
+      spe_release_register(f, fbZ_reg);
+      spe_release_register(f, fbS_reg);
+   }
+
+
+   /* Get framebuffer quad/colors.  We'll need these for blending,
+    * color masking, and to obey the quad/pixel mask.
+    * Load: fbRGBA_reg = memory[color_tile + quad_offset]
+    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
+    * we could skip this load.
+    */
+   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
+
+
+   if (blend->blend_enable) {
+      /* convert packed tile colors in fbRGBA_reg to float[4] vectors */
+
+      // gen_blend_code(blend, f, mask_reg, ... );
+
+   }
+
+
+
+   /*
+    * Write fragment colors to framebuffer/tile.
+    * This involves converting the fragment colors from float[4] to the
+    * tile's specific format and obeying the quad/pixel mask.
+    */
+   {
+      const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
+      int rgba_reg = spe_allocate_available_register(f);
+
+      /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
+      spe_cfltu(f, fragR_reg, fragR_reg, 32);
+      spe_cfltu(f, fragG_reg, fragG_reg, 32);
+      spe_cfltu(f, fragB_reg, fragB_reg, 32);
+      spe_cfltu(f, fragA_reg, fragA_reg, 32);
+
+      /* Shift the most significant bytes to the least significant positions.
+       * I.e.: reg = reg >> 24
+       */
+      spe_rotmi(f, fragR_reg, fragR_reg, -24);
+      spe_rotmi(f, fragG_reg, fragG_reg, -24);
+      spe_rotmi(f, fragB_reg, fragB_reg, -24);
+      spe_rotmi(f, fragA_reg, fragA_reg, -24);
+
+      /* Shift the color bytes according to the surface format */
+      if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
+         spe_roti(f, fragG_reg, fragG_reg, 8);   /* green <<= 8 */
+         spe_roti(f, fragR_reg, fragR_reg, 16);  /* red <<= 16 */
+         spe_roti(f, fragA_reg, fragA_reg, 24);  /* alpha <<= 24 */
+      }
+      else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+         spe_roti(f, fragR_reg, fragR_reg, 8);   /* red <<= 8 */
+         spe_roti(f, fragG_reg, fragG_reg, 16);  /* green <<= 16 */
+         spe_roti(f, fragB_reg, fragB_reg, 24);  /* blue <<= 24 */
+      }
+      else {
+         ASSERT(0);
+      }
+
+      /* Merge red, green, blue, alpha registers to make packed RGBA colors.
+       * Eg: after shifting according to color_format we might have:
+       *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
+       *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
+       *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
+       *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
+       * OR-ing all those together gives us four packed colors:
+       *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
+       */
+      spe_or(f, rgba_reg, fragR_reg, fragG_reg);
+      spe_or(f, rgba_reg, rgba_reg, fragB_reg);
+      spe_or(f, rgba_reg, rgba_reg, fragA_reg);
+
+      /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
+       * if (mask[i])
+       *    rgba[i] = rgba[i];
+       * else
+       *    rgba[i] = framebuffer[i];
+       */
+      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
+
+      /* Store updated quad in tile:
+       * memory[color_tile + quad_offset] = rgba_reg;
+       */
+      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
+
+      spe_release_register(f, rgba_reg);
+   }
+
+   printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
+
+   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
+
+   ASSERT(f->num_inst <= SPU_MAX_FRAGMENT_OPS_INSTS);  /* must fit the command's code[] */
+   spe_release_register(f, fbRGBA_reg);
+   spe_release_register(f, fbZS_reg);
+   spe_release_register(f, quad_offset_reg);
+}
+
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
new file mode 100644
index 0000000000..0ea0fc690c
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_GEN_FRAGMENT_H
+#define CELL_GEN_FRAGMENT_H
+
+struct cell_context;   /* forward declarations: only pointers are used here, */
+struct spe_function;   /* so this header does not depend on include order    */
+/** Generate SPE code for the fragment operations of cell's current state into f. */
+extern void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+
+#endif /* CELL_GEN_FRAGMENT_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index f2feaa329a..06777aac14 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -27,6 +27,7 @@
 
 #include "util/u_memory.h"
 #include "cell_context.h"
+#include "cell_gen_fragment.h"
 #include "cell_state.h"
 #include "cell_state_emit.h"
 #include "cell_state_per_fragment.h"
@@ -83,6 +84,29 @@ cell_emit_state(struct cell_context *cell)
       fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE;
       fb->width = cell->framebuffer.width;
       fb->height = cell->framebuffer.height;
+#if 0
+      printf("EMIT color format %s\n", pf_name(fb->color_format));
+      printf("EMIT depth format %s\n", pf_name(fb->depth_format));
+#endif
+   }
+
+
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) {
+      /* XXX we don't want to always do codegen here.  We should have
+       * a hash/lookup table to cache previous results...
+       */
+      struct cell_command_fragment_ops *fops
+            = cell_batch_alloc(cell, sizeof(*fops));
+      struct spe_function spe_code;
+
+      /* generate new code */
+      gen_fragment_function(cell, &spe_code);
+      /* put the new code into the batch buffer */
+      fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      memcpy(&fops->code, spe_code.store,
+             SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      /* free codegen buffer */
+      spe_release_func(&spe_code);
    }
 
    if (cell->dirty & CELL_NEW_BLEND) {
@@ -90,8 +114,7 @@ cell_emit_state(struct cell_context *cell)
 
       if (cell->blend != NULL) {
          blend.base = (intptr_t) cell->blend->code.store;
-         blend.size = (char *) cell->blend->code.csr
-             - (char *) cell->blend->code.store;
+         blend.size = cell->blend->code.num_inst * SPE_INST_SIZE;
          blend.read_fb = TRUE;
       }
       else {
@@ -108,10 +131,10 @@ cell_emit_state(struct cell_context *cell)
 
       if (cell->depth_stencil != NULL) {
 	 dsat.base = (intptr_t) cell->depth_stencil->code.store;
-	 dsat.size = (char *) cell->depth_stencil->code.csr
-	     - (char *) cell->depth_stencil->code.store;
+	 dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE;
 	 dsat.read_depth = TRUE;
 	 dsat.read_stencil = FALSE;
+         dsat.state = cell->depth_stencil->base;
       }
       else {
 	 dsat.base = 0;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index 705867107b..78cb446c14 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -1158,7 +1158,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
 static int
 PC_OFFSET(const struct spe_function *f, const void *d)
 {
-   const intptr_t pc = (intptr_t) f->csr;
+   const intptr_t pc = (intptr_t) &f->store[f->num_inst];
    const intptr_t ea = ~0x0f & (intptr_t) d;
 
    return (ea - pc) >> 2;
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index d49abb2e82..e285ae9fdb 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -43,7 +43,7 @@ INCLUDE_DIRS = \
 	$(SPU_CC) $(SPU_CFLAGS) -c $<
 
 .c.s:
-	$(SPU_CC) $(SPU_CFLAGS) -S $<
+	$(SPU_CC) $(SPU_CFLAGS) -O3 -S $<
 
 
 # The .a file will be linked into the main/PPU executable
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index c4236817a9..4e0ec15925 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -34,6 +34,7 @@
 
 #include "spu_main.h"
 #include "spu_render.h"
+#include "spu_per_fragment_op.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
 //#include "spu_test.h"
@@ -46,7 +47,7 @@
 /*
 helpful headers:
 /usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h
-/opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
+/opt/cell/sdk/usr/include/libmisc.h
 */
 
 boolean Debug = FALSE;
@@ -226,6 +227,24 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 }
 
 
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   if (Debug)
+      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
+   /* Copy SPU code from batch buffer to spu buffer; size follows the destination array */
+   memcpy(spu.fragment_ops.code, fops->code, sizeof(spu.fragment_ops.code));
+   /* Point function pointer at new code */
+   spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code;
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -257,6 +276,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       spu.fb.zsize = 4;
       spu.fb.zscale = (float) 0x00ffffffu;
       break;
@@ -282,6 +303,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+#define NEW_FRAGMENT_FUNCTION 01  /* nonzero: cmd_state_logicop() skips its logicop code upload */
+
 static void
 cmd_state_blend(const struct cell_command_blend *state)
 {
@@ -302,7 +325,9 @@ cmd_state_blend(const struct cell_command_blend *state)
       wait_on_mask(1 << TAG_BATCH_BUFFER);
       spu.blend = (blend_func) fb_blend_code_buffer;
       spu.read_fb = state->read_fb;
-   } else {
+   }
+   else
+   {
       spu.read_fb = FALSE;
    }
 }
@@ -326,7 +351,9 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
 	      0, /* tid */
 	      0  /* rid */);
       wait_on_mask(1 << TAG_BATCH_BUFFER);
-   } else {
+   }
+   else
+   {
       /* If there is no code, emit a return instruction.
        */
       depth_stencil_code_buffer[0] = 0x35;
@@ -338,12 +365,14 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
    spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
    spu.read_depth = state->read_depth;
    spu.read_stencil = state->read_stencil;
+   spu.depth_stencil_alpha = state->state;
 }
 
 
 static void
 cmd_state_logicop(const struct cell_command_logicop * code)
 {
+#if !NEW_FRAGMENT_FUNCTION
    mfc_get(logicop_code_buffer,
            (unsigned int) code->base,  /* src */
            code->size,
@@ -353,6 +382,7 @@ cmd_state_logicop(const struct cell_command_logicop * code)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    spu.logicop = (logicop_func) logicop_code_buffer;
+#endif
 }
 
 
@@ -455,7 +485,9 @@ cmd_finish(void)
 
 
 /**
- * Execute a batch of commands
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See cell_emit_state() in cell_state_emit.c to see where the commands come from.
+ *
  * The opcode param encodes the location of the buffer and its size.
  */
 static void
@@ -519,6 +551,14 @@ cmd_batch(uint opcode)
             pos += pos_incr;
          }
          break;
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            pos += sizeof(*fops) / 8;
+         }
+         break;
       case CELL_CMD_RELEASE_VERTS:
          {
             struct cell_command_release_verts *release
@@ -680,6 +720,11 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
+
+   /* Install default/fallback fragment processing function.
+    * This will normally be overridden by a code-gen'd function.
+    */
+   spu.fragment_ops.func = spu_fallback_fragment_ops;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index c2a53c9dcf..7ab34f5222 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -91,6 +91,24 @@ typedef struct spu_blend_results (*logicop_func)(
 
 typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
 
+
+typedef void (*spu_fragment_ops_func)(uint x, uint y,  /* quad position; presumably tile-relative pixels -- TODO confirm */
+                                      tile_t *colorTile,         /* color tile the quad is written to */
+                                      tile_t *depthStencilTile,  /* z/stencil tile for the quad */
+                                      vector float fragZ,        /* four fragment Z values */
+                                      vector float fragRed,
+                                      vector float fragGreen,
+                                      vector float fragBlue,
+                                      vector float fragAlpha,
+                                      vector unsigned int mask); /* quad/pixel "alive" mask */
+
+struct spu_fragment_ops  /* fragment-ops code buffer plus the entry point currently in use */
+{
+   uint code[SPU_MAX_FRAGMENT_OPS_INSTS];  /**< generated code, filled from a CELL_CMD_STATE_FRAGMENT_OPS command */
+   spu_fragment_ops_func func;  /**< Current fragment ops function */
+} ALIGN16_ATTRIB;
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -127,6 +145,9 @@ struct spu_global
    struct cell_init_info init;
 
    struct spu_framebuffer fb;
+
+   struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+
    boolean read_depth;
    boolean read_stencil;
    frag_test_func frag_test;  /**< Current depth/stencil test code */
@@ -142,6 +163,8 @@ struct spu_global
 
    struct vertex_info vertex_info;
 
+   struct spu_fragment_ops fragment_ops;
+
    /* XXX more state to come */
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 29dc07a2e8..ffc596aa62 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -29,8 +29,11 @@
  * \author Ian Romanick <idr@us.ibm.com>
  */
 
+
+#include <transpose_matrix4x4.h>
 #include "pipe/p_format.h"
 #include "spu_main.h"
+#include "spu_colorpack.h"
 #include "spu_per_fragment_op.h"
 
 #define ZERO 0x80
@@ -90,7 +93,8 @@ read_ds_quad(tile_t *tile, unsigned x, unsigned y,
       break;
    }
 
-   case PIPE_FORMAT_S8Z24_UNORM: {
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM: {
       qword *ptr = (qword *) &tile->ui4[iy][ix];
 
       *depth = si_and(*ptr, si_fsmbi(0x7777));
@@ -153,7 +157,8 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-   case PIPE_FORMAT_S8Z24_UNORM: {
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       /* form select mask = 0111,0111,0111,0111 */
       qword mask = si_fsmbi(0x7777);
@@ -217,3 +222,225 @@ spu_do_depth_stencil(int x, int y,
 
    return result.mask;
 }
+
+
+
+
+/**
+ * Fallback/debug implementation of the per-quad fragment operations.
+ * Called by the rasterizer for each quad after the shader has run.  The
+ * intent is to normally use a function generated by the PPU instead, but
+ * this one is useful for debug/validation.
+ *
+ * Applies the alpha test and Z test to \p mask, updates the Z/stencil
+ * tile, then packs the fragment colors and stores the ones still enabled
+ * in the mask.  Stencil test, blending and colormask are not done yet.
+ *
+ * \param x  quad X position; used to index both tiles
+ * \param y  quad Y position; used to index both tiles
+ * \param colorTile  color tile to store packed colors into
+ * \param depthStencilTile  Z/stencil tile to test against and update
+ * \param fragZ  four fragment Z values (float)
+ * \param fragRed  four fragment red values (float)
+ * \param fragGreen  four fragment green values (float)
+ * \param fragBlue  four fragment blue values (float)
+ * \param fragAlpha  four fragment alpha values (float)
+ * \param mask  per-fragment enable mask; a zero element is never stored
+ */
+void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask)
+{
+   vector float frag_soa[4], frag_aos[4];
+   unsigned int c0, c1, c2, c3;
+
+   /* do alpha test */
+   if (spu.depth_stencil_alpha.alpha.enabled) {
+      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
+      vector unsigned int amask;
+
+      switch (spu.depth_stencil_alpha.alpha.func) {
+      case PIPE_FUNC_LESS:
+         amask = spu_cmpgt(ref, fragAlpha);  /* mask = (fragAlpha < ref) */
+         break;
+      case PIPE_FUNC_GREATER:
+         amask = spu_cmpgt(fragAlpha, ref);  /* mask = (fragAlpha > ref) */
+         break;
+      case PIPE_FUNC_GEQUAL:
+         amask = spu_cmpgt(ref, fragAlpha);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_LEQUAL:
+         amask = spu_cmpgt(fragAlpha, ref);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_EQUAL:
+         amask = spu_cmpeq(ref, fragAlpha);
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         amask = spu_cmpeq(ref, fragAlpha);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_NEVER:
+         amask = spu_splats(0x0U);
+         break;
+      case PIPE_FUNC_ALWAYS:
+      default:
+         /* unknown func: pass all rather than AND with an unset mask */
+         amask = spu_splats(0xffffffffU);
+         break;
+      }
+
+      mask = spu_and(mask, amask);
+   }
+
+   /* Z and/or stencil testing... */
+   if (spu.depth_stencil_alpha.depth.enabled ||
+       spu.depth_stencil_alpha.stencil[0].enabled) {
+
+      /* get four Z/Stencil values from tile: Z is taken from the low
+       * 24 bits of each word and stencil from the remaining top 8 bits
+       */
+      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU);
+      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2];
+      vector unsigned int ifbZ = spu_and(ifbZS, mask24);
+      vector unsigned int ifbS = spu_andc(ifbZS, mask24);
+
+      if (spu.depth_stencil_alpha.stencil[0].enabled) {
+         /* XXX stencil test not implemented yet; note that the depth
+          * test below is skipped whenever stencil is enabled
+          */
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM);
+      }
+      else if (spu.depth_stencil_alpha.depth.enabled) {
+         /* do depth test */
+
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM ||
+                spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM);
+
+         vector unsigned int ifragZ;
+         vector unsigned int zmask;
+
+         /* convert four fragZ from float to uint */
+         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff));
+         ifragZ = spu_convtu(fragZ, 0);
+
+         /* do depth comparison, setting zmask with results */
+         switch (spu.depth_stencil_alpha.depth.func) {
+         case PIPE_FUNC_LESS:
+            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */
+            break;
+         case PIPE_FUNC_GREATER:
+            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifragZ > ifbZ) */
+            break;
+         case PIPE_FUNC_GEQUAL:
+            zmask = spu_cmpgt(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_LEQUAL:
+            zmask = spu_cmpgt(ifragZ, ifbZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_EQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            break;
+         case PIPE_FUNC_NOTEQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_NEVER:
+            zmask = spu_splats(0x0U);
+            break;
+         case PIPE_FUNC_ALWAYS:
+         default:
+            /* unknown func: pass all rather than AND with an unset mask */
+            zmask = spu_splats(0xffffffffU);
+            break;
+         }
+
+         mask = spu_and(mask, zmask);
+
+         /* merge framebuffer Z and fragment Z according to the mask */
+         ifbZ = spu_or(spu_and(ifragZ, mask),
+                       spu_andc(ifbZ, mask));
+      }
+
+      if (spu_extract(spu_orx(mask), 0)) {
+         /* put new fragment Z/Stencil values back into Z/Stencil tile */
+         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS);
+
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+   }
+
+   /* XXX do blending here */
+
+   /* XXX do colormask test here */
+
+
+   if (spu_extract(spu_orx(mask), 0)) {
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   else {
+      /* no fragment is enabled any more: nothing to pack or store */
+      return;
+   }
+
+   /* convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA.
+    * Copy the channels into a real array first: separate parameters are
+    * not guaranteed to be laid out contiguously, so &fragRed must not be
+    * treated as the start of a four-element array.
+    */
+   frag_soa[0] = fragRed;
+   frag_soa[1] = fragGreen;
+   frag_soa[2] = fragBlue;
+   frag_soa[3] = fragAlpha;
+   _transpose_matrix4x4(frag_aos, frag_soa);
+
+   switch (spu.fb.color_format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      c0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      c1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      c2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+      break;
+
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      c0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      c1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      c2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+      break;
+   default:
+      fprintf(stderr, "SPU: Bad pixel format in spu_fallback_fragment_ops\n");
+      ASSERT(0);
+      return;  /* c0..c3 are unset: never store them */
+   }
+
+   /*
+    * Store the quad as four consecutive pixels of one tile row
+    * (p0..p3 correspond to mask elements 0..3):
+    *  +--+--+--+--+
+    *  |p0|p1|p2|p3|
+    *  +--+--+--+--+
+    */
+   if (spu_extract(mask, 0)) {
+      colorTile->ui[y][x*2] = c0;
+   }
+   if (spu_extract(mask, 1)) {
+      colorTile->ui[y][x*2+1] = c1;
+   }
+   if (spu_extract(mask, 2)) {
+      colorTile->ui[y][x*2+2] = c2;
+   }
+   if (spu_extract(mask, 3)) {
+      colorTile->ui[y][x*2+3] = c3;
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index 6571258699..ffadf0661c 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -29,4 +29,15 @@ extern qword
 spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth,
 		     qword frag_alpha, qword facing);
 
+extern void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask);
+
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a3ea0a3e69..71ef6ca24f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -297,9 +297,12 @@ emit_quad( int x, int y, mask_t mask )
    sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
 
+#define NEW_FRAGMENT_FUNCTION 01
+#if !NEW_FRAGMENT_FUNCTION
    if (spu.read_depth) {
       mask = do_depth_test(x, y, mask);
    }
+#endif
 
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -308,6 +311,7 @@ emit_quad( int x, int y, mask_t mask )
       vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture[0].start) {
          /* texture mapping */
@@ -355,6 +359,29 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
+#if NEW_FRAGMENT_FUNCTION
+      {
+         /* Convert fragment data from AoS to SoA format.
+          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+          * This is temporary!
+          */
+         vector float soa_frag[4];
+         _transpose_matrix4x4(soa_frag, colors);
+
+         float4 fragZ;
+
+         fragZ.v = eval_z((float) x, (float) y);
+
+         /* Do all per-fragment/quad operations here, including:
+          *  alpha test, z test, stencil test, blend and framebuffer writing.
+          */
+         spu.fragment_ops.func(ix, iy, &spu.ctile, &spu.ztile,
+                               fragZ.v,
+                               soa_frag[0], soa_frag[1],
+                               soa_frag[2], soa_frag[3],
+                               mask);
+      }
+#else
       /* Convert fragment data from AoS to SoA format.
        * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
        */
@@ -405,6 +432,9 @@ emit_quad( int x, int y, mask_t mask )
       spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
       spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
       spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
+
+#endif /* NEW_FRAGMENT_FUNCTION */
+
    }
 #endif
 }
diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c
index b010513107..28bd6ceab4 100644
--- a/src/gallium/winsys/xlib/xm_api.c
+++ b/src/gallium/winsys/xlib/xm_api.c
@@ -349,12 +349,17 @@ create_xmesa_buffer(XMesaDrawable d, BufferType type,
 
    if (vis->mesa_visual.depthBits == 0)
       depthFormat = PIPE_FORMAT_NONE;
+#ifdef GALLIUM_CELL /* XXX temporary for Cell! */
+   else
+      depthFormat = PIPE_FORMAT_S8Z24_UNORM;
+#else
    else if (vis->mesa_visual.depthBits <= 16)
       depthFormat = PIPE_FORMAT_Z16_UNORM;
    else if (vis->mesa_visual.depthBits <= 24)
       depthFormat = PIPE_FORMAT_S8Z24_UNORM;
    else
       depthFormat = PIPE_FORMAT_Z32_UNORM;
+#endif
 
    if (vis->mesa_visual.stencilBits == 8) {
       if (depthFormat == PIPE_FORMAT_S8Z24_UNORM)
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 5e9a1f92f1..c4a30d3702 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -275,6 +275,39 @@ xm_buffer_destroy(struct pipe_winsys *pws,
 }
 
 
+/**
+ * For Cell.  Rearrange a tile in place.  On input every four consecutive
+ * pixels form one quad:
+ *  +--+--+--+--+
+ *  |p0|p1|p2|p3|....
+ *  +--+--+--+--+
+ * On output the same quad occupies a 2x2 block of the row-major tile:
+ *  +--+--+
+ *  |p0|p1|....
+ *  +--+--+
+ *  |p2|p3|
+ *  +--+--+
+ */
+static void
+twiddle_tile(uint *tile)
+{
+   uint linear[TILE_SIZE * TILE_SIZE];
+   int row, col;
+   for (row = 0; row < TILE_SIZE; row += 2) {
+      for (col = 0; col < TILE_SIZE; col += 2) {
+         const uint *quad = tile + 4 * (row/2 * TILE_SIZE/2 + col/2);
+         uint *dst = linear + row * TILE_SIZE + col;
+         dst[0] = quad[0];
+         dst[1] = quad[1];
+         dst[TILE_SIZE] = quad[2];
+         dst[TILE_SIZE + 1] = quad[3];
+      }
+   }
+   memcpy(tile, linear, sizeof(linear));
+}
+
+
+
 /**
  * Display a surface that's in a tiled configuration.  That is, all the
  * pixels for a TILE_SIZExTILE_SIZE block are contiguous in memory.
@@ -321,6 +354,8 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
 
          ximage->data = (char *) xm_buf->data + offset;
 
+         twiddle_tile((uint *) ximage->data);
+
          if (XSHM_ENABLED(xm_buf)) {
 #if defined(USE_XSHM) && !defined(XFree86Server)
             XShmPutImage(b->xm_visual->display, b->drawable, b->gc,
-- 
cgit v1.2.3


From 701fcee65db6b72f98e926d838956bbcc54f1cc6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 10 Sep 2008 18:51:43 -0600
Subject: cell: remove old per-fragment code, replace with all new code

---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 236 +++------------------
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |  47 ++--
 src/gallium/drivers/cell/spu/spu_tri.c             |  96 ---------
 3 files changed, 48 insertions(+), 331 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index ffc596aa62..9ed5fc50cd 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -1,32 +1,32 @@
-/*
- * (C) Copyright IBM Corporation 2008
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 /**
- * \file spu_per_fragment_op.c
- * SPU implementation various per-fragment operations.
- *
- * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
 
@@ -36,194 +36,6 @@
 #include "spu_colorpack.h"
 #include "spu_per_fragment_op.h"
 
-#define ZERO 0x80
-
-
-/**
- * Get a "quad" of four fragment Z/stencil values from the given tile.
- * \param tile  the tile of Z/stencil values
- * \param x, y  location of the quad in the tile, in pixels
- * \param depth_format  format of the tile's data
- * \param detph  returns four depth values
- * \param stencil  returns four stencil values
- */
-static void
-read_ds_quad(tile_t *tile, unsigned x, unsigned y,
-             enum pipe_format depth_format, qword *depth,
-             qword *stencil)
-{
-   const int ix = x / 2;
-   const int iy = y / 2;
-
-   switch (depth_format) {
-   case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &tile->us8[iy][ix / 2];
-
-      const qword shuf_vec = (qword) {
-         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
-         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7
-      };
-
-      /* At even X values we want the first 4 shorts, and at odd X values we
-       * want the second 4 shorts.
-       */
-      qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3));
-      qword bias_mask = si_fsmbi(0x3333);
-      qword sv = si_a(shuf_vec, si_and(bias_mask, bias));
-
-      *depth = si_shufb(*ptr, *ptr, sv);
-      *stencil = si_il(0);
-      break;
-   }
-
-   case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &tile->ui4[iy][ix];
-
-      *depth = *ptr;
-      *stencil = si_il(0);
-      break;
-   }
-
-   case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &tile->ui4[iy][ix];
-      qword mask = si_fsmbi(0xEEEE);
-
-      *depth = si_rotmai(si_and(*ptr, mask), -8);
-      *stencil = si_andc(*ptr, mask);
-      break;
-   }
-
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM: {
-      qword *ptr = (qword *) &tile->ui4[iy][ix];
-
-      *depth = si_and(*ptr, si_fsmbi(0x7777));
-      *stencil = si_andi(si_roti(*ptr, 8), 0x0ff);
-      break;
-   }
-
-   default:
-      ASSERT(0);
-      break;
-   }
-}
-
-
-/**
- * Put a quad of Z/stencil values into a tile.
- * \param tile  the tile of Z/stencil values to write into
- * \param x, y  location of the quad in the tile, in pixels
- * \param depth_format  format of the tile's data
- * \param detph  depth values to store
- * \param stencil  stencil values to store
- */
-static void
-write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
-              enum pipe_format depth_format,
-              qword depth, qword stencil)
-{
-   const int ix = x / 2;
-   const int iy = y / 2;
-
-   (void) stencil;
-
-   switch (depth_format) {
-   case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
-
-      qword sv = ((ix & 0x01) == 0) 
-          ? (qword) { 2, 3, 6, 7, 10, 11, 14, 15,
-                      24, 25, 26, 27, 28, 29, 30, 31 }
-          : (qword) { 16, 17, 18, 19, 20 , 21, 22, 23,
-                      2, 3, 6, 7, 10, 11, 14, 15 };
-      *ptr = si_shufb(depth, *ptr, sv);
-      break;
-   }
-
-   case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      *ptr = depth;
-      break;
-   }
-
-   case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      /* form select mask = 1110,1110,1110,1110 */
-      qword mask = si_fsmbi(0xEEEE);
-      /* depth[i] = depth[i] << 8 */
-      depth = si_shli(depth, 8);
-      /* *ptr[i] = depth[i][31:8] | stencil[i][7:0] */
-      *ptr = si_selb(stencil, depth, mask);
-      break;
-   }
-
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      /* form select mask = 0111,0111,0111,0111 */
-      qword mask = si_fsmbi(0x7777);
-      /* stencil[i] = stencil[i] << 24 */
-      stencil = si_shli(stencil, 24);
-      /* *ptr[i] = stencil[i][31:24] | depth[i][23:0] */
-      *ptr = si_selb(stencil, depth, mask);
-      break;
-   }
-
-   default:
-      ASSERT(0);
-      break;
-   }
-}
-
-
-/**
- * Do depth/stencil/alpha test for a "quad" of 4 fragments.
- * \param x,y  location of quad within tile
- * \param frag_mask  indicates which fragments are "alive"
- * \param frag_depth  four fragment depth values
- * \param frag_alpha  four fragment alpha values
- * \param facing  front/back facing for four fragments (1=front, 0=back)
- */
-qword
-spu_do_depth_stencil(int x, int y,
-                     qword frag_mask, qword frag_depth, qword frag_alpha,
-                     qword facing)
-{
-   struct spu_frag_test_results  result;
-   qword pixel_depth;
-   qword pixel_stencil;
-
-   /* All of this preable code (everthing before the call to frag_test) should
-    * be generated on the PPU and upload to the SPU.
-    */
-   if (spu.read_depth || spu.read_stencil) {
-      read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
-                   &pixel_depth, &pixel_stencil);
-   }
-
-   /* convert floating point Z values to 32-bit uint */
-
-   /* frag_depth *= spu.fb.zscale */
-   frag_depth = si_fm(frag_depth, (qword)spu_splats(spu.fb.zscale));
-   /* frag_depth = uint(frag_depth) */
-   frag_depth = si_cfltu(frag_depth, 0);
-
-   result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
-                             frag_depth, frag_alpha, facing);
-
-
-   /* This code (everthing after the call to frag_test) should
-    * be generated on the PPU and upload to the SPU.
-    */
-   if (spu.read_depth || spu.read_stencil) {
-      write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
-                    result.depth, result.stencil);
-   }
-
-   return result.mask;
-}
-
-
 
 
 /**
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index ffadf0661c..f817abf046 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -1,33 +1,33 @@
-/*
- * (C) Copyright IBM Corporation 2008
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 #ifndef SPU_PER_FRAGMENT_OP
 #define SPU_PER_FRAGMENT_OP
 
-extern qword
-spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth,
-		     qword frag_alpha, qword facing);
 
 extern void
 spu_fallback_fragment_ops(uint x, uint y,
@@ -40,4 +40,5 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragAlpha,
                           vector unsigned int mask);
 
+
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 71ef6ca24f..a5bf3270c7 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -38,7 +38,6 @@
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
-#include "spu_per_fragment_op.h"
 
 
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
@@ -255,31 +254,6 @@ eval_z(float x, float y)
 }
 
 
-static INLINE mask_t
-do_depth_test(int x, int y, mask_t quadmask)
-{
-   float4 zvals;
-   mask_t mask;
-
-   if (spu.fb.depth_format == PIPE_FORMAT_NONE)
-      return quadmask;
-
-   zvals.v = eval_z((float) x, (float) y);
-
-   mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx,
-					y - setup.cliprect_miny,
-					(qword) quadmask, 
-					(qword) zvals.v,
-					(qword) spu_splats((unsigned char) 0x0ffu),
-					(qword) spu_splats((unsigned int) 0x01u));
-
-   if (spu_extract(spu_orx(mask), 0))
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
-
-   return mask;
-}
-
-
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -289,21 +263,6 @@ do_depth_test(int x, int y, mask_t quadmask)
 static INLINE void
 emit_quad( int x, int y, mask_t mask )
 {
-#if 0
-   struct softpipe_context *sp = setup.softpipe;
-   setup.quad.x0 = x;
-   setup.quad.y0 = y;
-   setup.quad.mask = mask;
-   sp->quad.first->run(sp->quad.first, &setup.quad);
-#else
-
-#define NEW_FRAGMENT_FUNCTION 01
-#if !NEW_FRAGMENT_FUNCTION
-   if (spu.read_depth) {
-      mask = do_depth_test(x, y, mask);
-   }
-#endif
-
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
@@ -359,7 +318,6 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
-#if NEW_FRAGMENT_FUNCTION
       {
          /* Convert fragment data from AoS to SoA format.
           * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
@@ -381,62 +339,8 @@ emit_quad( int x, int y, mask_t mask )
                                soa_frag[2], soa_frag[3],
                                mask);
       }
-#else
-      /* Convert fragment data from AoS to SoA format.
-       * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-       */
-      qword soa_frag[4];
-      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
-
-      /* Read the current framebuffer values.
-       */
-      const qword pix[4] = {
-         (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]),
-         (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]),
-         (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]),
-         (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]),
-      };
-
-      qword soa_pix[4];
-
-      if (spu.read_fb) {
-         /* Convert pixel data from AoS to SoA format.
-          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-          */
-         vec_float4 aos_pix[4] = {
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
-         };
-
-         _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
-      }
-
-
-      struct spu_blend_results result =
-          (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
-                       soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
-                       spu.const_blend_color[0], spu.const_blend_color[1],
-                       spu.const_blend_color[2], spu.const_blend_color[3]);
-
-
-      /* Convert final pixel data from SoA to AoS format.
-       * I.e. (RRRR,GGGG,BBBB,AAAA) -> (RGBA,RGBA,RGBA,RGBA)
-       */
-      result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
-                              result.r, result.g, result.b, result.a,
-                              (qword) mask);
-
-      spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0);
-      spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
-      spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
-      spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
-
-#endif /* NEW_FRAGMENT_FUNCTION */
 
    }
-#endif
 }
 
 
-- 
cgit v1.2.3


From 5336e758a483d15d579ffe7cad536be95637d904 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 08:44:54 -0600
Subject: cell: added cast in spu_splats() call

---
 src/gallium/drivers/cell/spu/spu_texture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 5051774f00..117b8a36f8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -97,7 +97,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
    const qword offset_y = si_andi((qword) y, 0x1f);
 
    const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
-   const qword tile_size = (qword) spu_splats(sizeof(tile_t));
+   const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
    tile_offset = si_mpy((qword) tile_offset, tile_size);
-- 
cgit v1.2.3


From 6092a057042c9f7a4cae0f0eb9e95307f5f850a1 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 09:55:39 -0600
Subject: cell: fix shuffle in spu_unpack_B8G8R8A8()

---
 src/gallium/drivers/cell/spu/spu_colorpack.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index e9fee8a3a6..fd8dc6ded3 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -79,14 +79,14 @@ spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
 
 
 static INLINE vector float
-spu_unpack_color(uint color)
+spu_unpack_B8G8R8A8(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             0, 0, 0, 0,
-                             5, 5, 5, 5,
                              10, 10, 10, 10,
+                             5, 5, 5, 5,
+                             0, 0, 0, 0,
                              15, 15, 15, 15}) );
    return spu_convtf(color_u4, 32);
 }
-- 
cgit v1.2.3


From add86031db757b0e3abe48bd8fdea40d4e380e05 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:08:06 -0600
Subject: cell: begin new blending code (both codegen and fallback paths)

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 420 ++++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 232 ++++++++++--
 2 files changed, 584 insertions(+), 68 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index df29476be6..7966c0916c 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -231,6 +231,370 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
 
 
+/**
+ * Generate SPE code to implement the given blend mode for a quad of pixels.
+ * \param blend         blend state (factors/functions) to generate code for
+ * \param f             SPE function to append instructions onto.
+ * \param color_format  packing format of the framebuffer colors in fbRGBA_reg
+ * \param fragR_reg, fragG_reg, fragB_reg, fragA_reg  registers with fragment
+ *                      red/green/blue/alpha values (float) (in/out)
+ * \param fbRGBA_reg    register with packed framebuffer colors (integer) (in)
+ */
+static void
+gen_blend(const struct pipe_blend_state *blend,
+          struct spe_function *f,
+          enum pipe_format color_format,
+          int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
+          int fbRGBA_reg)
+{
+   int term1R_reg = spe_allocate_available_register(f);
+   int term1G_reg = spe_allocate_available_register(f);
+   int term1B_reg = spe_allocate_available_register(f);
+   int term1A_reg = spe_allocate_available_register(f);
+
+   int term2R_reg = spe_allocate_available_register(f);
+   int term2G_reg = spe_allocate_available_register(f);
+   int term2B_reg = spe_allocate_available_register(f);
+   int term2A_reg = spe_allocate_available_register(f);
+
+   int fbR_reg = spe_allocate_available_register(f);
+   int fbG_reg = spe_allocate_available_register(f);
+   int fbB_reg = spe_allocate_available_register(f);
+   int fbA_reg = spe_allocate_available_register(f);
+
+   int one_reg = spe_allocate_available_register(f);
+   int tmp_reg = spe_allocate_available_register(f);
+
+   ASSERT(blend->blend_enable);
+
+   /* Unpack/convert framebuffer colors from four 32-bit packed colors
+    * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
+    * Each 8-bit color component is expanded into a float in [0.0, 1.0].
+    */
+   {
+      int mask_reg = spe_allocate_available_register(f);
+
+      /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
+      spe_fsmbi(f, mask_reg, 0x1111);
+
+      /* XXX there may be more clever ways to implement the following code */
+      switch (color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         /* fbB = fbRGBA & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 8 */
+         spe_roti(f, fbG_reg, fbG_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 16 */
+         spe_roti(f, fbR_reg, fbR_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbA = fbRGBA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* fbA = fbA >> 24 */
+         spe_roti(f, fbA_reg, fbA_reg, -24);
+         break;
+
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         /* fbA = fbRGBA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 8 */
+         spe_roti(f, fbR_reg, fbR_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 16 */
+         spe_roti(f, fbG_reg, fbG_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbB = fbRGBA & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* fbB = fbB >> 24 */
+         spe_roti(f, fbB_reg, fbB_reg, -24);
+         break;
+
+      default:
+         ASSERT(0);
+      }
+
+      /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
+      spe_cuflt(f, fbR_reg, fbR_reg, 8);
+      spe_cuflt(f, fbG_reg, fbG_reg, 8);
+      spe_cuflt(f, fbB_reg, fbB_reg, 8);
+      spe_cuflt(f, fbA_reg, fbA_reg, 8);
+
+      spe_release_register(f, mask_reg);
+   }
+
+
+   /*
+    * Compute Src RGB terms
+    */
+   switch (blend->rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term1R_reg, fragR_reg);
+      spe_move(f, term1G_reg, fragG_reg);
+      spe_move(f, term1B_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term1R_reg);
+      spe_zero(f, term1G_reg);
+      spe_zero(f, term1B_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Src Alpha term
+    */
+   switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term1A_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest RGB terms
+    */
+   switch (blend->rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term2R_reg, fbR_reg);
+      spe_move(f, term2G_reg, fbG_reg);
+      spe_move(f, term2B_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term2R_reg);
+      spe_zero(f, term2G_reg);
+      spe_zero(f, term2B_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(f, one_reg, 1.0f);
+      /* tmp = one - fragA */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* term = fb * tmp */
+      spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
+      spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
+      spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest Alpha term
+    */
+   switch (blend->alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term2A_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term2A_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(f, one_reg, 1.0f);
+      /* tmp = one - fragA */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* termA = fbA * tmp */
+      spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest RGB terms
+    */
+   switch (blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest A term
+    */
+   switch (blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   spe_release_register(f, term1R_reg);
+   spe_release_register(f, term1G_reg);
+   spe_release_register(f, term1B_reg);
+   spe_release_register(f, term1A_reg);
+
+   spe_release_register(f, term2R_reg);
+   spe_release_register(f, term2G_reg);
+   spe_release_register(f, term2B_reg);
+   spe_release_register(f, term2A_reg);
+
+   spe_release_register(f, fbR_reg);
+   spe_release_register(f, fbG_reg);
+   spe_release_register(f, fbB_reg);
+   spe_release_register(f, fbA_reg);
+
+   spe_release_register(f, one_reg);
+   spe_release_register(f, tmp_reg);
+}
+
+
+static void
+gen_logicop(const struct pipe_blend_state *blend,
+            struct spe_function *f,
+            int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* XXX to-do: placeholder, no SPE instructions are emitted yet. */
+   /* Intended to operate on 32-bit packed pixels, not float colors. */
+}
+
+
+static void
+gen_colormask(uint colormask,
+              struct spe_function *f,
+              int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* XXX to-do: placeholder, no SPE instructions are emitted yet. */
+   /* Intended to operate on 32-bit packed pixels, not float colors. */
+}
+
+
+
+/**
+ * Generate code to pack a quad of float colors into four 32-bit integers.
+ *
+ * \param f             SPE function to append instruction onto.
+ * \param color_format  the dest color packing format
+ * \param r_reg         register containing four red values (in/clobbered)
+ * \param g_reg         register containing four green values (in/clobbered)
+ * \param b_reg         register containing four blue values (in/clobbered)
+ * \param a_reg         register containing four alpha values (in/clobbered)
+ * \param rgba_reg      register to store the packed RGBA colors (out)
+ */
+static void
+gen_pack_colors(struct spe_function *f,
+                enum pipe_format color_format,
+                int r_reg, int g_reg, int b_reg, int a_reg,
+                int rgba_reg)
+{
+   /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
+   spe_cfltu(f, r_reg, r_reg, 32);
+   spe_cfltu(f, g_reg, g_reg, 32);
+   spe_cfltu(f, b_reg, b_reg, 32);
+   spe_cfltu(f, a_reg, a_reg, 32);
+
+   /* Shift the most significant bytes to the least significant positions.
+    * I.e.: reg = reg >> 24
+    */
+   spe_rotmi(f, r_reg, r_reg, -24);
+   spe_rotmi(f, g_reg, g_reg, -24);
+   spe_rotmi(f, b_reg, b_reg, -24);
+   spe_rotmi(f, a_reg, a_reg, -24);
+
+   /* Shift the color bytes according to the surface format */
+   if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
+      spe_roti(f, g_reg, g_reg, 8);   /* green <<= 8 */
+      spe_roti(f, r_reg, r_reg, 16);  /* red <<= 16 */
+      spe_roti(f, a_reg, a_reg, 24);  /* alpha <<= 24 */
+   }
+   else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+      spe_roti(f, r_reg, r_reg, 8);   /* red <<= 8 */
+      spe_roti(f, g_reg, g_reg, 16);  /* green <<= 16 */
+      spe_roti(f, b_reg, b_reg, 24);  /* blue <<= 24 */
+   }
+   else {
+      ASSERT(0);
+   }
+
+   /* Merge red, green, blue, alpha registers to make packed RGBA colors.
+    * Eg: for A8R8G8B8, after the shifts above we might have:
+    *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
+    *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
+    *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
+    *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
+    * OR-ing all those together gives us four packed colors:
+    *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
+    */
+   spe_or(f, rgba_reg, r_reg, g_reg);
+   spe_or(f, rgba_reg, rgba_reg, b_reg);
+   spe_or(f, rgba_reg, rgba_reg, a_reg);
+}
+
+
+
+
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
@@ -257,6 +621,7 @@ gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const struct pipe_depth_stencil_alpha_state *dsa =
       &cell->depth_stencil->base;
    const struct pipe_blend_state *blend = &cell->blend->base;
+   const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
 
    /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
    const int x_reg = 3;  /* uint */
@@ -443,64 +808,31 @@ gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
 
    if (blend->blend_enable) {
-      /* convert packed tile colors in fbRGBA_reg to float[4] vectors */
-
-      // gen_blend_code(blend, f, mask_reg, ... );
-
+      gen_blend(blend, f, color_format,
+                fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
 
-
-
    /*
     * Write fragment colors to framebuffer/tile.
     * This involves converting the fragment colors from float[4] to the
     * tile's specific format and obeying the quad/pixel mask.
     */
    {
-      const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
       int rgba_reg = spe_allocate_available_register(f);
 
-      /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
-      spe_cfltu(f, fragR_reg, fragR_reg, 32);
-      spe_cfltu(f, fragG_reg, fragG_reg, 32);
-      spe_cfltu(f, fragB_reg, fragB_reg, 32);
-      spe_cfltu(f, fragA_reg, fragA_reg, 32);
+      /* Pack four float colors as four 32-bit int colors */
+      gen_pack_colors(f, color_format,
+                      fragR_reg, fragG_reg, fragB_reg, fragA_reg,
+                      rgba_reg);
 
-      /* Shift most the significant bytes to least the significant positions.
-       * I.e.: reg = reg >> 24
-       */
-      spe_rotmi(f, fragR_reg, fragR_reg, -24);
-      spe_rotmi(f, fragG_reg, fragG_reg, -24);
-      spe_rotmi(f, fragB_reg, fragB_reg, -24);
-      spe_rotmi(f, fragA_reg, fragA_reg, -24);
-
-      /* Shift the color bytes according to the surface format */
-      if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
-         spe_roti(f, fragG_reg, fragG_reg, 8);   /* green <<= 8 */
-         spe_roti(f, fragR_reg, fragR_reg, 16);  /* red <<= 16 */
-         spe_roti(f, fragA_reg, fragA_reg, 24);  /* alpha <<= 24 */
-      }
-      else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
-         spe_roti(f, fragR_reg, fragR_reg, 8);   /* red <<= 8 */
-         spe_roti(f, fragG_reg, fragG_reg, 16);  /* green <<= 16 */
-         spe_roti(f, fragB_reg, fragB_reg, 24);  /* blue <<= 24 */
+      if (blend->logicop_enable) {
+         gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
-      else {
-         ASSERT(0);
+
+      if (blend->colormask != 0xf) {
+         gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
       }
 
-      /* Merge red, green, blue, alpha registers to make packed RGBA colors.
-       * Eg: after shifting according to color_format we might have:
-       *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
-       *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
-       *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
-       *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
-       * OR-ing all those together gives us four packed colors:
-       *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
-       */
-      spe_or(f, rgba_reg, fragR_reg, fragG_reg);
-      spe_or(f, rgba_reg, rgba_reg, fragB_reg);
-      spe_or(f, rgba_reg, rgba_reg, fragA_reg);
 
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 9ed5fc50cd..3f0eabaa05 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -39,9 +39,11 @@
 
 
 /**
- * Called by rasterizer for each quad after the shader has run.  This
- * is a fallback/debug function.  In reality we'll use a generated
- * function produced by the PPU.  But this function is useful for
+ * Called by rasterizer for each quad after the shader has run.  Do
+ * all the per-fragment operations including alpha test, z test,
+ * stencil test, blend, colormask and logicops.  This is a
+ * fallback/debug function.  In reality we'll use a generated function
+ * produced by the PPU.  But this function is useful for
  * debug/validation.
  */
 void
@@ -49,13 +51,13 @@ spu_fallback_fragment_ops(uint x, uint y,
                           tile_t *colorTile,
                           tile_t *depthStencilTile,
                           vector float fragZ,
-                          vector float fragRed,
-                          vector float fragGreen,
-                          vector float fragBlue,
-                          vector float fragAlpha,
+                          vector float fragR,
+                          vector float fragG,
+                          vector float fragB,
+                          vector float fragA,
                           vector unsigned int mask)
 {
-   vector float frag_soa[4], frag_aos[4];
+   vector float frag_aos[4];
    unsigned int c0, c1, c2, c3;
 
    /* do alpha test */
@@ -65,24 +67,24 @@ spu_fallback_fragment_ops(uint x, uint y,
 
       switch (spu.depth_stencil_alpha.alpha.func) {
       case PIPE_FUNC_LESS:
-         amask = spu_cmpgt(ref, fragAlpha);  /* mask = (fragAlpha < ref) */
+         amask = spu_cmpgt(ref, fragA);  /* mask = (fragA < ref) */
          break;
       case PIPE_FUNC_GREATER:
-         amask = spu_cmpgt(fragAlpha, ref);  /* mask = (fragAlpha > ref) */
+         amask = spu_cmpgt(fragA, ref);  /* mask = (fragA > ref) */
          break;
       case PIPE_FUNC_GEQUAL:
-         amask = spu_cmpgt(ref, fragAlpha);
+         amask = spu_cmpgt(ref, fragA);
          amask = spu_nor(amask, amask);
          break;
       case PIPE_FUNC_LEQUAL:
-         amask = spu_cmpgt(fragAlpha, ref);
+         amask = spu_cmpgt(fragA, ref);
          amask = spu_nor(amask, amask);
          break;
       case PIPE_FUNC_EQUAL:
-         amask = spu_cmpeq(ref, fragAlpha);
+         amask = spu_cmpeq(ref, fragA);
          break;
       case PIPE_FUNC_NOTEQUAL:
-         amask = spu_cmpeq(ref, fragAlpha);
+         amask = spu_cmpeq(ref, fragA);
          amask = spu_nor(amask, amask);
          break;
       case PIPE_FUNC_ALWAYS:
@@ -174,7 +176,189 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
    }
 
-   /* XXX do blending here */
+   if (spu.blend.blend_enable) {
+      vector float term1r, term1g, term1b, term1a;
+      vector float term2r, term2g, term2b, term2a;
+
+      vector float fbRGBA[4];
+
+      vector float one, tmp;
+
+      /* fetch the quad's framebuffer colors: fbRGBA[0..3] = R, G, B, A */
+      {
+         vector float fc[4];
+         uint c0, c1, c2, c3;
+#if 0
+         c0 = colorTile->ui[y+0][x+0];
+         c1 = colorTile->ui[y+0][x+1];
+         c2 = colorTile->ui[y+1][x+0];
+         c3 = colorTile->ui[y+1][x+1];
+#else
+         c0 = colorTile->ui[y][x*2+0];
+         c1 = colorTile->ui[y][x*2+1];
+         c2 = colorTile->ui[y][x*2+2];
+         c3 = colorTile->ui[y][x*2+3];
+#endif
+         switch (spu.fb.color_format) {
+         case PIPE_FORMAT_B8G8R8A8_UNORM:
+            fc[0] = spu_unpack_B8G8R8A8(c0);
+            fc[1] = spu_unpack_B8G8R8A8(c1);
+            fc[2] = spu_unpack_B8G8R8A8(c2);
+            fc[3] = spu_unpack_B8G8R8A8(c3);
+            break;
+         case PIPE_FORMAT_A8R8G8B8_UNORM:
+            fc[0] = spu_unpack_A8R8G8B8(c0);
+            fc[1] = spu_unpack_A8R8G8B8(c1);
+            fc[2] = spu_unpack_A8R8G8B8(c2);
+            fc[3] = spu_unpack_A8R8G8B8(c3);
+            break;
+         default:
+            ASSERT(0);
+         }
+         _transpose_matrix4x4(fbRGBA, fc);
+      }
+
+      /*
+       * Compute Src RGB terms
+       */
+      switch (spu.blend.rgb_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1r = fragR;
+         term1g = fragG;
+         term1b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term1r =
+         term1g =
+         term1b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1r = spu_mul(fragR, fragR);
+         term1g = spu_mul(fragG, fragG);
+         term1b = spu_mul(fragB, fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1r = spu_mul(fragR, fragA);
+         term1g = spu_mul(fragG, fragA);
+         term1b = spu_mul(fragB, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Src Alpha term
+       */
+      switch (spu.blend.alpha_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1a = spu_mul(fragA, fragA);  /* alpha of src color is fragA */
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1a = spu_mul(fragA, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest RGB terms
+       */
+      switch (spu.blend.rgb_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2r = fbRGBA[0];
+         term2g = fbRGBA[1];
+         term2b = fbRGBA[2];
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term2r =
+         term2g =
+         term2b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2r = spu_mul(fbRGBA[0], fragR);
+         term2g = spu_mul(fbRGBA[1], fragG);
+         term2b = spu_mul(fbRGBA[2], fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fragA);
+         term2g = spu_mul(fbRGBA[1], fragA);
+         term2b = spu_mul(fbRGBA[2], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2r = spu_mul(fbRGBA[0], tmp);
+         term2g = spu_mul(fbRGBA[1], tmp);
+         term2b = spu_mul(fbRGBA[2], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest Alpha term
+       */
+      switch (spu.blend.alpha_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2a = fbRGBA[3];
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term2a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2a = spu_mul(fbRGBA[3], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest RGB terms
+       */
+      switch (spu.blend.rgb_func) {
+      case PIPE_BLEND_ADD:
+         fragR = spu_add(term1r, term2r);
+         fragG = spu_add(term1g, term2g);
+         fragB = spu_add(term1b, term2b);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragR = spu_sub(term1r, term2r);
+         fragG = spu_sub(term1g, term2g);
+         fragB = spu_sub(term1b, term2b);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest A term
+       */
+      switch (spu.blend.alpha_func) {
+      case PIPE_BLEND_ADD:
+         fragA = spu_add(term1a, term2a);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragA = spu_sub(term1a, term2a);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+   }
+
 
    /* XXX do colormask test here */
 
@@ -190,17 +374,17 @@ spu_fallback_fragment_ops(uint x, uint y,
 #if 0
    {
       vector float frag_soa[4];
-      frag_soa[0] = fragRed;
-      frag_soa[1] = fragGreen;
-      frag_soa[2] = fragBlue;
-      frag_soa[3] = fragAlpha;
+      frag_soa[0] = fragR;
+      frag_soa[1] = fragG;
+      frag_soa[2] = fragB;
+      frag_soa[3] = fragA;
       _transpose_matrix4x4(frag_aos, frag_soa);
    }
 #else
    /* short-cut relying on function parameter layout: */
-   _transpose_matrix4x4(frag_aos, &fragRed);
-   (void) fragGreen;
-   (void) fragBlue;
+   _transpose_matrix4x4(frag_aos, &fragR);
+   (void) fragG;
+   (void) fragB;
 #endif
 
    switch (spu.fb.color_format) {
@@ -238,7 +422,7 @@ spu_fallback_fragment_ops(uint x, uint y,
    if (spu_extract(mask, 2))
       colorTile->ui[y+1][x+0] = c2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y+1][x+1] = c3;   
+      colorTile->ui[y+1][x+1] = c3;
 #else
    /*
     * Quad layout:
@@ -253,6 +437,6 @@ spu_fallback_fragment_ops(uint x, uint y,
    if (spu_extract(mask, 2))
       colorTile->ui[y][x*2+2] = c2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y][x*2+3] = c3;   
+      colorTile->ui[y][x*2+3] = c3;
 #endif
 }
-- 
cgit v1.2.3


From 283ffdf99605c536d00e03ad6ec91a6f8e006fc2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:13:20 -0600
Subject: cell: checkpoint: remove more of the old per-fragment code

---
 src/gallium/drivers/cell/common.h              |   2 +
 src/gallium/drivers/cell/ppu/Makefile          |   1 -
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  60 ++-----------
 src/gallium/drivers/cell/spu/spu_main.c        | 115 +++----------------------
 src/gallium/drivers/cell/spu/spu_main.h        |  37 +-------
 5 files changed, 19 insertions(+), 196 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index a62530c64d..61d2b7d1ae 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -146,6 +146,8 @@ struct cell_command_logicop
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend;
    unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b5a6fcb8de..8699f3f8ec 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -28,7 +28,6 @@ SOURCES = \
 	cell_gen_fragment.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
-	cell_state_per_fragment.c \
 	cell_state_shader.c \
 	cell_pipe_state.c \
 	cell_screen.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 06777aac14..2bfb976c59 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -55,23 +55,6 @@ emit_state_cmd(struct cell_context *cell, uint cmd,
 void
 cell_emit_state(struct cell_context *cell)
 {
-   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) {
-      struct cell_command_logicop logicop;
-
-      if (cell->logic_op.store != NULL) {
-	 spe_release_func(& cell->logic_op);
-      }
-
-      cell_generate_logic_op(& cell->logic_op,
-			     & cell->blend->base,
-			     cell->framebuffer.cbufs[0]);
-
-      logicop.base = (intptr_t) cell->logic_op.store;
-      logicop.size = 64 * 4;
-      emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop,
-		     sizeof(logicop));
-   }
-
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
@@ -91,7 +74,9 @@ cell_emit_state(struct cell_context *cell)
    }
 
 
-   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) {
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
+                      CELL_NEW_DEPTH_STENCIL |
+                      CELL_NEW_BLEND)) {
       /* XXX we don't want to always do codegen here.  We should have
        * a hash/lookup table to cache previous results...
        */
@@ -105,47 +90,12 @@ cell_emit_state(struct cell_context *cell)
       fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
       memcpy(&fops->code, spe_code.store,
              SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      fops->dsa = cell->depth_stencil->base;
+      fops->blend = cell->blend->base;
       /* free codegen buffer */
       spe_release_func(&spe_code);
    }
 
-   if (cell->dirty & CELL_NEW_BLEND) {
-      struct cell_command_blend blend;
-
-      if (cell->blend != NULL) {
-         blend.base = (intptr_t) cell->blend->code.store;
-         blend.size = cell->blend->code.num_inst * SPE_INST_SIZE;
-         blend.read_fb = TRUE;
-      }
-      else {
-         blend.base = 0;
-         blend.size = 0;
-         blend.read_fb = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend));
-   }
-
-   if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      struct cell_command_depth_stencil_alpha_test dsat;
-
-      if (cell->depth_stencil != NULL) {
-	 dsat.base = (intptr_t) cell->depth_stencil->code.store;
-	 dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE;
-	 dsat.read_depth = TRUE;
-	 dsat.read_stencil = FALSE;
-         dsat.state = cell->depth_stencil->base;
-      }
-      else {
-	 dsat.base = 0;
-	 dsat.size = 0;
-	 dsat.read_depth = FALSE;
-	 dsat.read_stencil = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat, sizeof(dsat));
-   }
-
    if (cell->dirty & CELL_NEW_SAMPLER) {
       uint i;
       for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 4e0ec15925..6afca19dfd 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -63,14 +63,6 @@ struct spu_vs_context draw;
 static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
     ALIGN16_ATTRIB;
 
-static unsigned char depth_stencil_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
-
-static unsigned char fb_blend_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
-
-static unsigned char logicop_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
 
 
 /**
@@ -240,8 +232,15 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Copy state info */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
    /* Point function pointer at new code */
    spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code;
+
+   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
 }
 
 
@@ -303,89 +302,6 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
-#define NEW_FRAGMENT_FUNCTION 01
-
-static void
-cmd_state_blend(const struct cell_command_blend *state)
-{
-   if (Debug)
-      printf("SPU %u: BLEND: enabled %d\n",
-             spu.init.id,
-             (state->size != 0));
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(fb_blend_code_buffer,
-              (unsigned int) state->base,  /* src */
-              ROUNDUP16(state->size),
-              TAG_BATCH_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-      spu.blend = (blend_func) fb_blend_code_buffer;
-      spu.read_fb = state->read_fb;
-   }
-   else
-   {
-      spu.read_fb = FALSE;
-   }
-}
-
-
-static void
-cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state)
-{
-   if (Debug)
-      printf("SPU %u: DEPTH_STENCIL: ztest %d\n",
-             spu.init.id,
-             state->read_depth);
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(depth_stencil_code_buffer,
-	      (unsigned int) state->base,  /* src */
-	      ROUNDUP16(state->size),
-	      TAG_BATCH_BUFFER,
-	      0, /* tid */
-	      0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-   }
-   else
-   {
-      /* If there is no code, emit a return instruction.
-       */
-      depth_stencil_code_buffer[0] = 0x35;
-      depth_stencil_code_buffer[1] = 0x00;
-      depth_stencil_code_buffer[2] = 0x00;
-      depth_stencil_code_buffer[3] = 0x00;
-   }
-
-   spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
-   spu.read_depth = state->read_depth;
-   spu.read_stencil = state->read_stencil;
-   spu.depth_stencil_alpha = state->state;
-}
-
-
-static void
-cmd_state_logicop(const struct cell_command_logicop * code)
-{
-#if !NEW_FRAGMENT_FUNCTION
-   mfc_get(logicop_code_buffer,
-           (unsigned int) code->base,  /* src */
-           code->size,
-           TAG_BATCH_BUFFER,
-           0, /* tid */
-           0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-   spu.logicop = (logicop_func) logicop_code_buffer;
-#endif
-}
-
-
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
@@ -571,15 +487,6 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
-      case CELL_CMD_STATE_BLEND:
-         cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8);
-         break;
-      case CELL_CMD_STATE_DEPTH_STENCIL:
-         cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *)
-                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8);
-         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
@@ -614,19 +521,17 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       case CELL_CMD_STATE_BIND_VS:
+#if 01
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+#endif
          break;
       case CELL_CMD_STATE_ATTRIB_FETCH:
          cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
                                 &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
-      case CELL_CMD_STATE_LOGICOP:
-         cmd_state_logicop((struct cell_command_logicop *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
-         break;
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
@@ -695,7 +600,9 @@ main_loop(void)
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
+#if 01
          spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
          break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 7ab34f5222..f0f8be47db 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -60,35 +60,6 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
-struct spu_frag_test_results {
-   qword mask;
-   qword depth;
-   qword stencil;
-};
-
-typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask,
-    qword pixel_depth, qword pixel_stencil, qword frag_depth,
-    qword frag_alpha, qword facing);
-
-
-struct spu_blend_results {
-   qword r;
-   qword g;
-   qword b;
-   qword a;
-};
-
-typedef struct spu_blend_results (*blend_func)(
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword const_r, qword const_g, qword const_b, qword const_a);
-
-typedef struct spu_blend_results (*logicop_func)(
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword frag_mask);
-
-
 typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
 
 
@@ -147,16 +118,10 @@ struct spu_global
    struct spu_framebuffer fb;
 
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+   struct pipe_blend_state blend;
 
    boolean read_depth;
    boolean read_stencil;
-   frag_test_func frag_test;  /**< Current depth/stencil test code */
-   
-   boolean read_fb;   /**< Does current blend mode require framebuffer read? */
-   blend_func blend;  /**< Current blend code */
-   qword const_blend_color[4] ALIGN16_ATTRIB;
-
-   logicop_func logicop;  /**< Current logicop code **/
 
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-- 
cgit v1.2.3


From aa4a08d429712fa516342ec02253c2591794ea5f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:25:38 -0600
Subject: cell: asst. clean-up

---
 src/gallium/drivers/cell/spu/spu_main.c | 23 +++++-----------
 src/gallium/drivers/cell/spu/spu_main.h | 47 +++++++++++++++------------------
 src/gallium/drivers/cell/spu/spu_tri.c  | 10 +++----
 3 files changed, 32 insertions(+), 48 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 6afca19dfd..29686964d2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -231,13 +231,13 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
    if (Debug)
       printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
    /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
 
    /* Point function pointer at new code */
-   spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code;
+   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
 
    spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
    spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
@@ -288,17 +288,6 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 0;
       break;
    }
-
-   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              12, 0, 4, 8, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              8, 4, 0, 12, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else
-      ASSERT(0);
 }
 
 
@@ -521,11 +510,11 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       case CELL_CMD_STATE_BIND_VS:
-#if 01
+#if 0
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
 #endif
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
          break;
       case CELL_CMD_STATE_ATTRIB_FETCH:
          cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
@@ -600,7 +589,7 @@ main_loop(void)
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
-#if 01
+#if 0
          spu_execute_vertex_shader(&draw, &cmd.vs);
 #endif
          break;
@@ -631,7 +620,7 @@ one_time_init(void)
    /* Install default/fallback fragment processing function.
     * This will normally be overriden by a code-gen'd function.
     */
-   spu.fragment_ops.func = spu_fallback_fragment_ops;
+   spu.fragment_ops = spu_fallback_fragment_ops;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index f0f8be47db..d40539da83 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -60,9 +60,11 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
-typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
-
+/** Function for sampling textures */
+typedef vector float (*spu_sample_texture_func)(uint unit,
+                                                vector float texcoord);
 
+/** Function for performing per-fragment ops */
 typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       tile_t *colorTile,
                                       tile_t *depthStencilTile,
@@ -73,14 +75,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragAlpha,
                                       vector unsigned int mask);
 
-struct spu_fragment_ops
+struct spu_framebuffer
 {
-   uint code[SPU_MAX_FRAGMENT_OPS_INSTS];
-   spu_fragment_ops_func func;  /**< Current fragment ops function */
-} ALIGN16_ATTRIB;
-
-
-struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
    enum pipe_format color_format;
@@ -109,34 +105,31 @@ struct spu_texture
 
 
 /**
- * All SPU global/context state will be in singleton object of this type:
+ * All SPU global/context state will be in a singleton object of this type:
  */
 struct spu_global
 {
+   /** One-time init/constant info */
    struct cell_init_info init;
 
+   /*
+    * Current state
+    */
    struct spu_framebuffer fb;
-
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
    struct pipe_blend_state blend;
-
-   boolean read_depth;
-   boolean read_stencil;
-
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-
    struct vertex_info vertex_info;
 
-   struct spu_fragment_ops fragment_ops;
-
-   /* XXX more state to come */
-
-
-   /** current color and Z tiles */
+   /** Current color and Z tiles */
    tile_t ctile ALIGN16_ATTRIB;
    tile_t ztile ALIGN16_ATTRIB;
 
+   /** Read depth/stencil tiles? */
+   boolean read_depth;
+   boolean read_stencil;
+
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
 
@@ -144,11 +137,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+   /** Current fragment ops machine code */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   /** Current fragment ops function */
+   spu_fragment_ops_func fragment_ops;
 
-   /** for converting RGBA to PIPE_FORMAT_x colors */
-   vector unsigned char color_shuffle;
-
-   sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   /** Current texture sampler function */
+   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a5bf3270c7..f02cdd1f76 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -333,11 +333,11 @@ emit_quad( int x, int y, mask_t mask )
          /* Do all per-fragment/quad operations here, including:
           *  alpha test, z test, stencil test, blend and framebuffer writing.
           */
-         spu.fragment_ops.func(ix, iy, &spu.ctile, &spu.ztile,
-                               fragZ.v,
-                               soa_frag[0], soa_frag[1],
-                               soa_frag[2], soa_frag[3],
-                               mask);
+         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                          fragZ.v,
+                          soa_frag[0], soa_frag[1],
+                          soa_frag[2], soa_frag[3],
+                          mask);
       }
 
    }
-- 
cgit v1.2.3


From f19903aa83e9b6e18930cbda14cfec3cca2e1bf2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:26:00 -0600
Subject: cell: remove old blend/depth/stencil/logicop structs

---
 src/gallium/drivers/cell/common.h | 29 -----------------------------
 1 file changed, 29 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 61d2b7d1ae..8aa2b23ec0 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -111,35 +111,6 @@
 #define CELL_DEBUG_SYNC     (1 << 1)
 
 
-/**
- */
-struct cell_command_depth_stencil_alpha_test
-{
-   uint64_t base;               /**< Effective address of code start. */
-   unsigned size;               /**< Size in bytes of SPE code. */
-   unsigned read_depth;         /**< Flag: should depth be read? */
-   unsigned read_stencil;       /**< Flag: should stencil be read? */
-   struct pipe_depth_stencil_alpha_state state;
-};
-
-
-/**
- * Upload code to perform framebuffer blend operation
- */
-struct cell_command_blend
-{
-   uint64_t base;               /**< Effective address of code start. */
-   unsigned size;               /**< Size in bytes of SPE code. */
-   unsigned read_fb;            /**< Flag: should framebuffer be read? */
-};
-
-
-struct cell_command_logicop
-{
-   uint64_t base;               /**< Effective address of code start. */
-   unsigned size;               /**< Size in bytes of SPE code. */
-};
-
 
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
 
-- 
cgit v1.2.3


From 924653e37db4501d0f03721e9d74abffe46a3c72 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:27:17 -0600
Subject: cell: don't build unused sources

---
 src/gallium/drivers/cell/spu/Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index e285ae9fdb..1ae0dfb8c1 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -22,12 +22,15 @@ SOURCES = \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
-	spu_tri.c \
+	spu_tri.c
+
+OLD_SOURCES = \
 	spu_exec.c \
 	spu_util.c \
 	spu_vertex_fetch.c \
 	spu_vertex_shader.c
 
+
 SPU_OBJECTS = $(SOURCES:.c=.o) \
 
 SPU_ASM_OUT = $(SOURCES:.c=.s) \
-- 
cgit v1.2.3


From a558369ec66e3d9e2b88f4df9a3b5a3704b19ef3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:33:13 -0600
Subject: cell: disable NEW_VS emit

---
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 2bfb976c59..180b89c1f6 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -133,7 +133,8 @@ cell_emit_state(struct cell_context *cell)
       emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
                      &cell->vertex_info, sizeof(struct vertex_info));
    }
-   
+
+#if 0
    if (cell->dirty & CELL_NEW_VS) {
       const struct draw_context *const draw = cell->draw;
       struct cell_shader_info info;
@@ -148,4 +149,5 @@ cell_emit_state(struct cell_context *cell)
 
       emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info));
    }
+#endif
 }
-- 
cgit v1.2.3


From f6bf8d9d410d94372b72f4f6ede6196ae5a4a67f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:33:24 -0600
Subject: cell: clean-up, comments

---
 src/gallium/drivers/cell/spu/spu_main.c | 52 ++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 20 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 29686964d2..2a7cb75f59 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -429,16 +429,14 @@ cmd_batch(uint opcode)
       printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
    release_buffer(buf);
 
+   /*
+    * Loop over commands in the batch buffer
+    */
    for (pos = 0; pos < usize; /* no incr */) {
       switch (buffer[pos]) {
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         {
-            struct cell_command_framebuffer *fb
-               = (struct cell_command_framebuffer *) &buffer[pos];
-            cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
-         }
-         break;
+      /*
+       * rendering commands
+       */
       case CELL_CMD_CLEAR_SURFACE:
          {
             struct cell_command_clear_surface *clr
@@ -456,6 +454,17 @@ cmd_batch(uint opcode)
             pos += pos_incr;
          }
          break;
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
+         {
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
+         }
+         break;
       case CELL_CMD_STATE_FRAGMENT_OPS:
          {
             struct cell_command_fragment_ops *fops
@@ -464,18 +473,6 @@ cmd_batch(uint opcode)
             pos += sizeof(*fops) / 8;
          }
          break;
-      case CELL_CMD_RELEASE_VERTS:
-         {
-            struct cell_command_release_verts *release
-               = (struct cell_command_release_verts *) &buffer[pos];
-            cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
-         }
-         break;
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         pos += 1;
-         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
@@ -521,6 +518,21 @@ cmd_batch(uint opcode)
                                 &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
+         break;
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
-- 
cgit v1.2.3


From 73c6ae98c1c60635883a733f36d59d246e74aa2a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:38:37 -0600
Subject: cell: remove old state CMDs, added comments

---
 src/gallium/drivers/cell/common.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 8aa2b23ec0..e989d8c2e5 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -84,7 +84,7 @@
 #define CELL_CMD_BATCH                5
 #define CELL_CMD_RELEASE_VERTS        6
 #define CELL_CMD_STATE_FRAMEBUFFER   10
-#define CELL_CMD_STATE_DEPTH_STENCIL 11
+#define CELL_CMD_STATE_FRAGMENT_OPS  11
 #define CELL_CMD_STATE_SAMPLER       12
 #define CELL_CMD_STATE_TEXTURE       13
 #define CELL_CMD_STATE_VERTEX_INFO   14
@@ -92,12 +92,9 @@
 #define CELL_CMD_STATE_UNIFORMS      16
 #define CELL_CMD_STATE_VS_ARRAY_INFO 17
 #define CELL_CMD_STATE_BIND_VS       18
-#define CELL_CMD_STATE_BLEND         19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
-#define CELL_CMD_STATE_LOGICOP       21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
-#define CELL_CMD_STATE_FRAGMENT_OPS  24
 
 
 #define CELL_NUM_BUFFERS 4
@@ -112,8 +109,13 @@
 
 
+/** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
 
+
+/**
+ * Command to specify per-fragment operations state and generated code.
+ */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
@@ -159,13 +161,15 @@ struct cell_array_info
 };
 
 
-struct cell_attribute_fetch_code {
+struct cell_attribute_fetch_code
+{
    uint64_t base;
    uint size;
 };
 
 
-struct cell_buffer_range {
+struct cell_buffer_range
+{
    uint64_t base;
    unsigned size;
 };
-- 
cgit v1.2.3


From 1b5331d7ebcf7b1a1693972cf13407184cab1e48 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:38:55 -0600
Subject: cell: fix typos in blend code-gen

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 7966c0916c..79a82ef72b 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -286,16 +286,16 @@ gen_blend(const struct pipe_blend_state *blend,
          spe_roti(f, mask_reg, mask_reg, 8);
 
          /* fbG = fbRGBA & mask */
-         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
          /* fbG = fbG >> 8 */
-         spe_roti(f, fbB_reg, fbB_reg, -8);
+         spe_roti(f, fbG_reg, fbG_reg, -8);
          /* mask = mask << 8 */
          spe_roti(f, mask_reg, mask_reg, 8);
 
          /* fbR = fbRGBA & mask */
          spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
          /* fbR = fbR >> 16 */
-         spe_roti(f, fbB_reg, fbB_reg, -16);
+         spe_roti(f, fbR_reg, fbR_reg, -16);
          /* mask = mask << 8 */
          spe_roti(f, mask_reg, mask_reg, 8);
 
-- 
cgit v1.2.3


From 7ce1d0fb6700fd4998a095de2c9edf5ed920464c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:52:03 -0600
Subject: cell: more comments, stub code for colormask/logicop/etc

---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 107 ++++++++++++++-------
 1 file changed, 70 insertions(+), 37 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 3f0eabaa05..03dd547845 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -37,6 +37,8 @@
 #include "spu_per_fragment_op.h"
 
 
+#define LINEAR_QUAD_LAYOUT 1
+
 
 /**
  * Called by rasterizer for each quad after the shader has run.  Do
@@ -177,27 +179,28 @@ spu_fallback_fragment_ops(uint x, uint y,
    }
 
    if (spu.blend.blend_enable) {
+      /* blending terms, misc regs */
       vector float term1r, term1g, term1b, term1a;
       vector float term2r, term2g, term2b, term2a;
-
-      vector float fbRGBA[4];
-
       vector float one, tmp;
 
-      /* get colors from framebuffer */
+      vector float fbRGBA[4];  /* current framebuffer colors */
+
+      /* get colors from framebuffer/tile */
       {
          vector float fc[4];
          uint c0, c1, c2, c3;
-#if 0
-         c0 = colorTile->ui[y+0][x+0];
-         c1 = colorTile->ui[y+0][x+1];
-         c2 = colorTile->ui[y+1][x+0];
-         c3 = colorTile->ui[y+1][x+1];
-#else
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
          c0 = colorTile->ui[y][x*2+0];
          c1 = colorTile->ui[y][x*2+1];
          c2 = colorTile->ui[y][x*2+2];
          c3 = colorTile->ui[y][x*2+3];
+#else
+         c0 = colorTile->ui[y+0][x+0];
+         c1 = colorTile->ui[y+0][x+1];
+         c2 = colorTile->ui[y+1][x+0];
+         c3 = colorTile->ui[y+1][x+1];
 #endif
          switch (spu.fb.color_format) {
          case PIPE_FORMAT_B8G8R8A8_UNORM:
@@ -360,18 +363,11 @@ spu_fallback_fragment_ops(uint x, uint y,
    }
 
 
-   /* XXX do colormask test here */
-
-
-   if (spu_extract(spu_orx(mask), 0)) {
-      spu.cur_ctile_status = TILE_STATUS_DIRTY;
-   }
-   else {
-      return;
-   }
-
-   /* convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA */
+   /*
+    * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA.
+    */
 #if 0
+   /* original code */
    {
       vector float frag_soa[4];
       frag_soa[0] = fragR;
@@ -387,6 +383,9 @@ spu_fallback_fragment_ops(uint x, uint y,
    (void) fragB;
 #endif
 
+   /*
+    * Pack float colors into 32-bit RGBA words.
+    */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
       c0 = spu_pack_A8R8G8B8(frag_aos[0]);
@@ -406,24 +405,41 @@ spu_fallback_fragment_ops(uint x, uint y,
       ASSERT(0);
    }
 
-#if 0
+
    /*
-    * Quad layout:
-    *  +--+--+
-    *  |p0|p1|
-    *  +--+--+
-    *  |p2|p3|
-    *  +--+--+
+    * Color masking
     */
-   if (spu_extract(mask, 0))
-      colorTile->ui[y+0][x+0] = c0;
-   if (spu_extract(mask, 1))
-      colorTile->ui[y+0][x+1] = c1;
-   if (spu_extract(mask, 2))
-      colorTile->ui[y+1][x+0] = c2;
-   if (spu_extract(mask, 3))
-      colorTile->ui[y+1][x+1] = c3;
-#else
+   if (spu.blend.colormask != 0xf) {
+      /* XXX to do */
+      /* apply color mask to 32-bit packed colors */
+   }
+
+
+   /*
+    * Logic Ops
+    */
+   if (spu.blend.logicop_enable) {
+      /* XXX to do */
+      /* apply logicop to 32-bit packed colors */
+   }
+
+
+   /*
+    * If mask is non-zero, mark tile as dirty.
+    */
+   if (spu_extract(spu_orx(mask), 0)) {
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   else {
+      return;
+   }
+
+
+   /*
+    * Write new quad colors to the framebuffer/tile.
+    * Only write pixels where the corresponding mask word is set.
+    */
+#if LINEAR_QUAD_LAYOUT
    /*
     * Quad layout:
     *  +--+--+--+--+
@@ -438,5 +454,22 @@ spu_fallback_fragment_ops(uint x, uint y,
       colorTile->ui[y][x*2+2] = c2;
    if (spu_extract(mask, 3))
       colorTile->ui[y][x*2+3] = c3;
+#else
+   /*
+    * Quad layout:
+    *  +--+--+
+    *  |p0|p1|
+    *  +--+--+
+    *  |p2|p3|
+    *  +--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y+0][x+0] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y+0][x+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y+1][x+0] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y+1][x+1] = c3;
 #endif
 }
-- 
cgit v1.2.3


From fbf1586b36f8fb181ecee6a285c94f11e30005ba Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 12:01:19 -0600
Subject: gallium: typo: s/PIPE_FORMAT_Z16UNORM/PIPE_FORMAT_Z16_UNORM/

---
 src/gallium/winsys/xlib/xm_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c
index 28bd6ceab4..d28a6423b9 100644
--- a/src/gallium/winsys/xlib/xm_api.c
+++ b/src/gallium/winsys/xlib/xm_api.c
@@ -354,7 +354,7 @@ create_xmesa_buffer(XMesaDrawable d, BufferType type,
       depthFormat = PIPE_FORMAT_S8Z24_UNORM;
 #else
    else if (vis->mesa_visual.depthBits <= 16)
-      depthFormat = PIPE_FORMAT_Z16UNORM;
+      depthFormat = PIPE_FORMAT_Z16_UNORM;
    else if (vis->mesa_visual.depthBits <= 24)
       depthFormat = PIPE_FORMAT_S8Z24_UNORM;
    else
-- 
cgit v1.2.3


From be925ab6e8ecf6758adb2c6f2c423af31c5f86ca Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 15:48:13 -0600
Subject: cell: put cell_ prefix on gen_fragment_function()

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 4 ++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.h | 2 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 79a82ef72b..5622701dda 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -616,7 +616,7 @@ gen_pack_colors(struct spe_function *f,
  * \param f     the generated function (out)
  */
 void
-gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 {
    const struct pipe_depth_stencil_alpha_state *dsa =
       &cell->depth_stencil->base;
@@ -850,7 +850,7 @@ gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, rgba_reg);
    }
 
-   printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
+   //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
 
    spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
index 0ea0fc690c..b59de198dc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -31,7 +31,7 @@
 
 
 extern void
-gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f);
 
 
 #endif /* CELL_GEN_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 180b89c1f6..3ebf0749ad 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -85,7 +85,7 @@ cell_emit_state(struct cell_context *cell)
       struct spe_function spe_code;
 
       /* generate new code */
-      gen_fragment_function(cell, &spe_code);
+      cell_gen_fragment_function(cell, &spe_code);
       /* put the new code into the batch buffer */
       fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
       memcpy(&fops->code, spe_code.store,
-- 
cgit v1.2.3


From 178bbaff80d079606a1135bd65f1a85bac9774c4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 17:07:30 -0600
Subject: gallium: add special cases in spe_load_float(), spe_load_int(), added
 spe_splat()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 45 +++++++++++++++++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h |  4 +++
 2 files changed, 40 insertions(+), 9 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 61010e4333..a04cc6c4ff 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -473,21 +473,48 @@ EMIT_R   (spe_mtspr, 0x10c);
 void
 spe_load_float(struct spe_function *p, unsigned rT, float x)
 {
-   union {
-      float f;
-      unsigned u;
-   } bits;
-   bits.f = x;
-   spe_ilhu(p, rT, bits.u >> 16);
-   spe_iohl(p, rT, bits.u & 0xffff);
+   if (x == 0.0f) {
+      spe_il(p, rT, 0x0);
+   }
+   else if (x == 0.5f) {
+      spe_ilhu(p, rT, 0x3f00);
+   }
+   else if (x == 1.0f) {
+      spe_ilhu(p, rT, 0x3f80);
+   }
+   else if (x == -1.0f) {
+      spe_ilhu(p, rT, 0xbf80);
+   }
+   else {
+      union {
+         float f;
+         unsigned u;
+      } bits;
+      bits.f = x;
+      spe_ilhu(p, rT, bits.u >> 16);
+      spe_iohl(p, rT, bits.u & 0xffff);
+   }
 }
 
 
 void
 spe_load_int(struct spe_function *p, unsigned rT, int i)
 {
-   spe_ilhu(p, rT, i >> 16);
-   spe_iohl(p, rT, i & 0xffff);
+   if (-32768 <= i && i <= 32767) {
+      spe_il(p, rT, i);
+   }
+   else {
+      spe_ilhu(p, rT, i >> 16);
+      spe_iohl(p, rT, i & 0xffff);
+   }
+}
+
+
+void
+spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
+{
+   spe_ila(p, rT, 66051);
+   spe_shufb(p, rT, rA, rA, rT);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index dee8c55c4a..d95e5aace3 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -292,6 +292,10 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Replicate word 0 of rA across rT.  rT must not be the same as rA. */
+extern void
+spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
+
 /** Complement/invert all bits in rT. */
 extern void
 spe_complement(struct spe_function *p, unsigned rT);
-- 
cgit v1.2.3


From bc304bbd49d15ce1130f3ba07adaa85ef03ed931 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 17:08:52 -0600
Subject: cell: minor improvements to fragment code-gen

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 5622701dda..06219d4e98 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -265,6 +265,8 @@ gen_blend(const struct pipe_blend_state *blend,
    int one_reg = spe_allocate_available_register(f);
    int tmp_reg = spe_allocate_available_register(f);
 
+   boolean one_reg_set = false; /* avoid setting one_reg more than once */
+
    ASSERT(blend->blend_enable);
 
    /* Unpack/convert framebuffer colors from four 32-bit packed colors
@@ -275,7 +277,7 @@ gen_blend(const struct pipe_blend_state *blend,
       int mask_reg = spe_allocate_available_register(f);
 
       /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
-      spe_fsmbi(f, mask_reg, 0x1111);
+      spe_load_int(f, mask_reg, 0xff);
 
       /* XXX there may be more clever ways to implement the following code */
       switch (color_format) {
@@ -418,7 +420,10 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
       /* one = {1.0, 1.0, 1.0, 1.0} */
-      spe_load_float(f, one_reg, 1.0f);
+      if (!one_reg_set) {
+         spe_load_float(f, one_reg, 1.0f);
+         one_reg_set = true;
+      }
       /* tmp = one - fragA */
       spe_fs(f, tmp_reg, one_reg, fragA_reg);
       /* term = fb * tmp */
@@ -446,7 +451,10 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
       /* one = {1.0, 1.0, 1.0, 1.0} */
-      spe_load_float(f, one_reg, 1.0f);
+      if (!one_reg_set) {
+         spe_load_float(f, one_reg, 1.0f);
+         one_reg_set = true;
+      }
       /* tmp = one - fragA */
       spe_fs(f, tmp_reg, one_reg, fragA_reg);
       /* termA = fbA * tmp */
-- 
cgit v1.2.3


From 084ab37b7f34d509af995efaef4615289669f72b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 17:10:10 -0600
Subject: cell: fix tile twidding bug seen in the event of multiple expose
 events

---
 src/gallium/winsys/xlib/xm_winsys.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index c4a30d3702..2acbc94fc8 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -289,21 +289,19 @@ xm_buffer_destroy(struct pipe_winsys *pws,
  *  +--+--+
  */
 static void
-twiddle_tile(uint *tile)
+twiddle_tile(const uint *tileIn, uint *tileOut)
 {
-   uint tile2[TILE_SIZE * TILE_SIZE];
    int y, x;
 
    for (y = 0; y < TILE_SIZE; y+=2) {
       for (x = 0; x < TILE_SIZE; x+=2) {
          int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
-         tile2[y * TILE_SIZE + (x + 0)] = tile[k];
-         tile2[y * TILE_SIZE + (x + 1)] = tile[k+1];
-         tile2[(y + 1) * TILE_SIZE + (x + 0)] = tile[k+2];
-         tile2[(y + 1) * TILE_SIZE + (x + 1)] = tile[k+3];
+         tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k];
+         tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1];
+         tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2];
+         tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3];
       }
    }
-   memcpy(tile, tile2, sizeof(tile2));
 }
 
 
@@ -339,6 +337,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
 
    for (y = 0; y < surf->height; y += TILE_SIZE) {
       for (x = 0; x < surf->width; x += TILE_SIZE) {
+         uint tmpTile[TILE_SIZE * TILE_SIZE];
          int tx = x / TILE_SIZE;
          int ty = y / TILE_SIZE;
          int offset = ty * tilesPerRow + tx;
@@ -352,9 +351,9 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
 
          offset *= 4 * TILE_SIZE * TILE_SIZE;
 
-         ximage->data = (char *) xm_buf->data + offset;
-
-         twiddle_tile((uint *) ximage->data);
+         twiddle_tile((uint *) ((char *) xm_buf->data + offset),
+                      tmpTile);
+         ximage->data = (char*) tmpTile;
 
          if (XSHM_ENABLED(xm_buf)) {
 #if defined(USE_XSHM) && !defined(XFree86Server)
-- 
cgit v1.2.3


From aa66f08a21b791f338b519f0c2162cd8f7b3aeb0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 17:59:52 -0600
Subject: cell: initial support for fragment shader code generation.

TGSI shaders are translated into SPE instructions which are then sent to
the SPEs for execution.  Only a few opcodes work, no swizzling yet, no
support for constants/immediates, etc.
---
 src/gallium/drivers/cell/common.h                |  15 +
 src/gallium/drivers/cell/ppu/Makefile            |   1 +
 src/gallium/drivers/cell/ppu/cell_context.h      |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 523 +++++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_gen_fp.h       |  42 ++
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |  16 +
 src/gallium/drivers/cell/ppu/cell_state_shader.c |   8 +-
 src/gallium/drivers/cell/spu/spu_main.c          |  25 +-
 src/gallium/drivers/cell/spu/spu_main.h          |  15 +
 src/gallium/drivers/cell/spu/spu_tri.c           |  35 ++
 10 files changed, 678 insertions(+), 3 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.h

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index e989d8c2e5..cb0631baf5 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -92,6 +92,7 @@
 #define CELL_CMD_STATE_UNIFORMS      16
 #define CELL_CMD_STATE_VS_ARRAY_INFO 17
 #define CELL_CMD_STATE_BIND_VS       18
+#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
@@ -125,6 +126,20 @@ struct cell_command_fragment_ops
 };
 
 
+/** Max instructions for fragment programs */
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
+
+/**
+ * Command to send a fragment program to SPUs.
+ */
+struct cell_command_fragment_program
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
+   uint num_inst;        /**< Number of instructions */
+   unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 8699f3f8ec..b28f4c5c31 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -26,6 +26,7 @@ SOURCES = \
 	cell_draw_arrays.c \
 	cell_flush.c \
 	cell_gen_fragment.c \
+	cell_gen_fp.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
 	cell_state_shader.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 8cec9f45b2..14914b9c6f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -61,6 +61,7 @@ struct cell_fragment_shader_state
 {
    struct pipe_shader_state shader;
    struct tgsi_shader_info info;
+   struct spe_function code;
    void *data;
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
new file mode 100644
index 0000000000..6ffe94eb14
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -0,0 +1,523 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU fragment program/shader code.
+ *
+ * Note that we generate SOA-style code here.  So each TGSI instruction
+ * operates on four pixels (and is translated into four SPU instructions,
+ * generally speaking).
+ *
+ * \author Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "util/u_memory.h"
+#include "cell_context.h"
+#include "cell_gen_fp.h"
+
+
+/** Set to 1 to enable debug/disassembly printfs */
+#define DISASSEM 01
+
+
+/**
+ * Context needed during code generation.
+ */
+struct codegen
+{
+   int inputs_reg;      /**< 1st function parameter */
+   int outputs_reg;     /**< 2nd function parameter */
+   int constants_reg;   /**< 3rd function parameter */
+   int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */
+
+   int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */
+
+   /** Per-instruction temps / intermediate temps */
+   int num_itemps;
+   int itemps[3];
+
+   struct spe_function *f;
+   boolean error;
+};
+
+
+/**
+ * Allocate an intermediate temporary register.
+ */
+static int
+get_itemp(struct codegen *gen)
+{
+   int t = spe_allocate_available_register(gen->f);
+   assert(gen->num_itemps < Elements(gen->itemps));
+   gen->itemps[gen->num_itemps++] = t;
+   return t;
+}
+
+/**
+ * Free all intermediate temporary registers.  To be called after each
+ * instruction has been emitted.
+ */
+static void
+free_itemps(struct codegen *gen)
+{
+   int i;
+   for (i = 0; i < gen->num_itemps; i++) {
+      spe_release_register(gen->f, gen->itemps[i]);
+   }
+   gen->num_itemps = 0;
+}
+
+
+/**
+ * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}.
+ * The register is allocated and the load emitted on the first call only.
+ */
+static int
+get_const_one_reg(struct codegen *gen)
+{
+   if (gen->one_reg <= 0) {
+      gen->one_reg = spe_allocate_available_register(gen->f);
+
+      /* one = {1.0, 1.0, 1.0, 1.0}; emitted once, reused by later calls */
+      spe_load_float(gen->f, gen->one_reg, 1.0f);
+#if DISASSEM
+      printf("il\tr%d, 1.0f\n", gen->one_reg);
+#endif
+   }
+
+   return gen->one_reg;
+}
+
+
+/**
+ * Return the index of the SPU temporary containing the named TGSI
+ * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we
+ * just return the corresponding SPE register.  If the TGSI register
+ * is TGSI_FILE_INPUT we allocate a new SPE register and emit an SPE load.
+ * Other files are not handled yet: gen->error is set and -1 is returned.
+ */
+static int
+get_src_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_src_register *src)
+{
+   int reg = -1;
+
+   /* XXX need to examine src swizzle info here.
+    * That will involve changing the channel var...
+    */
+
+   switch (src->SrcRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      reg = gen->temp_regs[src->SrcRegister.Index][channel];
+      break;
+   case TGSI_FILE_INPUT:
+      {
+         /* offset is measured in quadwords, not bytes */
+         int offset = src->SrcRegister.Index * 4 + channel;
+         reg = get_itemp(gen);
+         /* Load:  reg = memory[(machine_reg) + offset] */
+         spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+#if DISASSEM
+         printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
+#endif
+      }
+      break;
+   case TGSI_FILE_IMMEDIATE:
+      /* xxx fall-through for now / fix */
+   case TGSI_FILE_CONSTANT:
+      /* xxx fall-through for now / fix */
+   default:
+      assert(0);
+      gen->error = true;  /* asserts may be compiled out; flag the failure */
+   }
+
+   return reg;
+}
+
+
+/**
+ * Return the index of an SPE register to use for the given TGSI register.
+ * If the TGSI register is TGSI_FILE_TEMPORARY, the index of the
+ * corresponding SPE register is returned.  If the TGSI register is
+ * TGSI_FILE_OUTPUT we allocate an intermediate temporary register
+ * (see store_dest_reg()).  Otherwise gen->error is set, -1 is returned.
+ */
+static int
+get_dst_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_dst_register *dest)
+{
+   int reg = -1;
+
+   switch (dest->DstRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      reg = gen->temp_regs[dest->DstRegister.Index][channel];
+      break;
+   case TGSI_FILE_OUTPUT:
+      reg = get_itemp(gen);
+      break;
+   default:
+      assert(0);
+      gen->error = true;  /* asserts may be compiled out; flag the failure */
+   }
+   return reg;
+}
+
+
+/**
+ * When a TGSI instruction is writing to an output register, this
+ * function emits the SPE store instruction to store the value_reg.
+ * \param value_reg  the SPE register containing the value to store.
+ *                   This would have been returned by get_dst_reg().
+ */
+static void
+store_dest_reg(struct codegen *gen,
+               int value_reg, int channel,
+               const struct tgsi_full_dst_register *dest)
+{
+   switch (dest->DstRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      /* no-op */
+      break;
+   case TGSI_FILE_OUTPUT:
+      {
+         /* offset is measured in quadwords, not bytes */
+         int offset = dest->DstRegister.Index * 4 + channel;
+         /* Store: memory[(machine_reg) + offset] = reg */
+         spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+#if DISASSEM
+         printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset);
+#endif
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+static boolean
+emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* XXX we don't always need to actually emit a mov instruction here */
+         spe_move(gen->f, dst_reg, src_reg);
+#if DISASSEM
+         printf("mov\tr%d, r%d\n", dst_reg, src_reg);
+#endif
+         store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit addition instructions.  Recall that a single TGSI_OPCODE_ADD
+ * becomes (up to) four SPU "fa" instructions because we're doing SOA
+ * processing.
+ */
+static boolean
+emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   /* Loop over Red/Green/Blue/Alpha channels */
+   for (ch = 0; ch < 4; ch++) {
+      /* If the dest R, G, B or A writemask is enabled... */
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* get indexes of the two src, one dest SPE registers */
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* Emit actual SPE instruction: d = s1 + s2 */
+         spe_fa(gen->f, d_reg, s1_reg, s2_reg);
+#if DISASSEM
+         printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+#endif
+
+         /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         /* Free any intermediate temps we allocated */
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit multiply.  See emit_ADD for comments.
+ */
+static boolean
+emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = s1 * s2 */
+         spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+#if DISASSEM
+         printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+#endif
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit set-if-greater-than.
+ * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
+ * the result but OpenGL/TGSI needs 0.0 and 1.0 results.
+ * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND.
+ */
+static boolean
+emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, one_reg;
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 > s2) */
+         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+#if DISASSEM
+         printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+#endif
+
+         /* d = d & one (0x0/0xffffffff -> 0.0/1.0); the lookup may emit a load */
+         one_reg = get_const_one_reg(gen);
+         spe_and(gen->f, d_reg, d_reg, one_reg);
+#if DISASSEM
+         printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, one_reg);
+#endif
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+/**
+ * Emit END instruction.
+ * We just return from the shader function at this point.
+ *
+ * Note that there may be more code after this that would be
+ * called by TGSI_OPCODE_CALL.
+ */
+static boolean
+emit_END(struct codegen *gen)
+{
+   /* return from function call */
+   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+#if DISASSEM
+   printf("bi\trRA\n");
+#endif
+   return true;
+}
+
+
+/**
+ * Emit code for the given instruction.  Just a big switch stmt.
+ */
+static boolean
+emit_instruction(struct codegen *gen,
+                 const struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+      return emit_MOV(gen, inst);
+   case TGSI_OPCODE_MUL:
+      return emit_MUL(gen, inst);
+   case TGSI_OPCODE_ADD:
+      return emit_ADD(gen, inst);
+   case TGSI_OPCODE_SGT:
+      return emit_SGT(gen, inst);
+   case TGSI_OPCODE_END:
+      return emit_END(gen);
+
+   /* XXX lots more cases to do... */
+
+   default:
+      return false;
+   }
+
+   return true;
+}
+
+
+
+/**
+ * Emit "code" for a TGSI declaration.
+ * We only care about TGSI TEMPORARY register declarations at this time.
+ * For each TGSI TEMPORARY we allocate four SPE registers.
+ */
+static void
+emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
+{
+   int i, ch;
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_TEMPORARY:
+#if DISASSEM
+      printf("Declare temp reg %d .. %d\n",
+             decl->DeclarationRange.First,
+             decl->DeclarationRange.Last);
+#endif
+      for (i = decl->DeclarationRange.First;
+           i <= decl->DeclarationRange.Last;
+           i++) {
+         for (ch = 0; ch < 4; ch++) {
+            gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+         }
+
+         /* XXX if we run out of SPE registers, we need to spill
+          * to SPU memory.  someday...
+          */
+
+#if DISASSEM
+         printf("  SPE regs: %d %d %d %d\n",
+                gen->temp_regs[i][0],
+                gen->temp_regs[i][1],
+                gen->temp_regs[i][2],
+                gen->temp_regs[i][3]);
+#endif
+      }
+      break;
+   default:
+      ; /* ignore */
+   }
+}
+
+
+/**
+ * Translate TGSI shader code to SPE instructions.  This is done when
+ * the state tracker gives us a new shader (via pipe->create_fs_state()).
+ *
+ * \param cell    the rendering context (in)
+ * \param tokens  the TGSI shader (in)
+ * \param f       the generated function (out)
+ * \return true on success, false if the shader could not be fully translated
+ */
+boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f)
+{
+   struct tgsi_parse_context parse;
+   struct codegen gen;
+
+   memset(&gen, 0, sizeof(gen));
+   gen.f = f;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   gen.inputs_reg = 3;     /* pointer to inputs array */
+   gen.outputs_reg = 4;    /* pointer to outputs array */
+   gen.constants_reg = 5;  /* pointer to constants array */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, gen.inputs_reg);
+   spe_allocate_register(f, gen.outputs_reg);
+   spe_allocate_register(f, gen.constants_reg);
+
+#if DISASSEM
+   printf("Begin %s\n", __FUNCTION__);
+   tgsi_dump(tokens, 0);
+#endif
+
+   tgsi_parse_init(&parse, tokens);
+
+   while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+#if 0
+         if (!note_immediate(&gen, &parse.FullToken.FullImmediate ))
+            goto fail;
+#endif
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         emit_declaration(&gen, &parse.FullToken.FullDeclaration);
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) {
+            gen.error = true;
+         }
+         break;
+
+      default:
+         assert(0);
+
+      }
+   }
+
+   if (gen.error) {
+      /* terminate the SPE code; fall through so the parser is still freed */
+      emit_END(&gen);
+   }
+
+#if DISASSEM
+   printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+   printf("End %s\n", __FUNCTION__);
+#endif
+
+   tgsi_parse_free( &parse );
+
+   return !gen.error;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
new file mode 100644
index 0000000000..99faea7046
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef CELL_GEN_FP_H
+#define CELL_GEN_FP_H
+
+
+
+extern boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f);
+
+
+#endif /* CELL_GEN_FP_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 3ebf0749ad..2da3097983 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -73,6 +73,22 @@ cell_emit_state(struct cell_context *cell)
 #endif
    }
 
+   if (cell->dirty & (CELL_NEW_FS)) {
+      /* Send new fragment program to SPUs */
+      struct cell_command_fragment_program *fp
+            = cell_batch_alloc(cell, sizeof(*fp));
+      fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM;
+      fp->num_inst = cell->fs->code.num_inst;
+      memcpy(&fp->code, cell->fs->code.store,
+             SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+      if (0) {
+         int i;
+         printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n");
+         for (i = 0; i < fp->num_inst; i++) {
+            printf(" %3d: 0x%08x\n", i, fp->code[i]);
+         }
+      }
+   }
 
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 97e44eeb1a..3a0d066da2 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -34,7 +34,7 @@
 
 #include "cell_context.h"
 #include "cell_state.h"
-
+#include "cell_gen_fp.h"
 
 
 /** cast wrapper */
@@ -61,7 +61,7 @@ static void *
 cell_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
 {
-   /*struct cell_context *cell = cell_context(pipe);*/
+   struct cell_context *cell = cell_context(pipe);
    struct cell_fragment_shader_state *cfs;
 
    cfs = CALLOC_STRUCT(cell_fragment_shader_state);
@@ -76,6 +76,8 @@ cell_create_fs_state(struct pipe_context *pipe,
 
    tgsi_scan_shader(templ->tokens, &cfs->info);
 
+   cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code);
+
    return cfs;
 }
 
@@ -102,6 +104,8 @@ cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
    struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs);
 
+   spe_release_func(&cfs->code);
+
    FREE((void *) cfs->shader.tokens);
    FREE(cfs);
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 2a7cb75f59..78260c4259 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -232,7 +232,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info */
+   /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
 
@@ -244,6 +244,21 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 }
 
 
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+   if (Debug)
+      printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_program_code, fp->code,
+          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+   /* Point function pointer at new code */
+   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -473,6 +488,14 @@ cmd_batch(uint opcode)
             pos += sizeof(*fops) / 8;
          }
          break;
+      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+         {
+            struct cell_command_fragment_program *fp
+               = (struct cell_command_fragment_program *) &buffer[pos];
+            cmd_state_fragment_program(fp);
+            pos += sizeof(*fp) / 8;
+         }
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index d40539da83..2c7b625840 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -75,6 +75,12 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragAlpha,
                                       vector unsigned int mask);
 
+/** Function for running fragment program */
+typedef void (*spu_fragment_program_func)(vector float *inputs,
+                                          vector float *outputs,
+                                          vector float *constants);
+
+
 struct spu_framebuffer
 {
    void *color_start;              /**< addr of color surface in main memory */
@@ -142,9 +148,18 @@ struct spu_global
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
+   /** Current fragment program machine code */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+   /** Current fragment program function */
+   spu_fragment_program_func fragment_program;
+
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
+   /** Fragment program constants (XXX preliminary/used) */
+#define MAX_CONSTANTS 32
+   vector float constants[MAX_CONSTANTS];
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index f02cdd1f76..8b93878192 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -314,7 +314,42 @@ emit_quad( int x, int y, mask_t mask )
       }
       else {
          /* simple shading */
+#if 0
          eval_coeff(1, (float) x, (float) y, colors);
+
+#else
+         /* XXX new fragment program code */
+
+         if (spu.fragment_program) {
+            vector float inputs[4*4], outputs[2*4];
+
+            /* setup inputs */
+            eval_coeff(1, (float) x, (float) y, inputs);
+
+            /* Execute the current fragment program */
+            spu.fragment_program(inputs, outputs, spu.constants);
+
+            /* Copy outputs */
+            colors[0] = outputs[0*4+0];
+            colors[1] = outputs[0*4+1];
+            colors[2] = outputs[0*4+2];
+            colors[3] = outputs[0*4+3];
+
+            if (0 && spu.init.id==0 && y == 48) {
+               printf("colors[0] = %f %f %f %f\n",
+                      spu_extract(colors[0], 0),
+                      spu_extract(colors[0], 1),
+                      spu_extract(colors[0], 2),
+                      spu_extract(colors[0], 3));
+               printf("colors[1] = %f %f %f %f\n",
+                      spu_extract(colors[1], 0),
+                      spu_extract(colors[1], 1),
+                      spu_extract(colors[1], 2),
+                      spu_extract(colors[1], 3));
+            }
+
+         }
+#endif
       }
 
 
-- 
cgit v1.2.3