32 files changed, 2966 insertions, 568 deletions
diff --git a/src/gallium/Makefile b/src/gallium/Makefile
index aa77021daf..36bd3623e7 100644
--- a/src/gallium/Makefile
+++ b/src/gallium/Makefile
@@ -3,6 +3,7 @@ include $(TOP)/configs/current
 
 
 SUBDIRS = auxiliary drivers
+# Note winsys/ needs to be built after src/mesa
 
 
 default: subdirs
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 44563803f9..79a19d6be2 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -46,6 +46,7 @@
 struct exec_vertex_shader {
    struct draw_vertex_shader base;
    struct tgsi_exec_machine *machine;
+   const struct tgsi_token *machine_tokens;
 };
 
 static struct exec_vertex_shader *exec_vertex_shader( struct draw_vertex_shader *vs )
@@ -62,12 +63,16 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
 {
    struct exec_vertex_shader *evs = exec_vertex_shader(shader);
 
-   /* specify the vertex program to interpret/execute */
-   tgsi_exec_machine_bind_shader(evs->machine,
-				 shader->state.tokens,
-				 PIPE_MAX_SAMPLERS,
-				 NULL /*samplers*/ );
-
+   /* Specify the vertex program to interpret/execute.
+    * Avoid rebinding when possible.
+    */
+   if (evs->machine_tokens != shader->state.tokens) {
+      tgsi_exec_machine_bind_shader(evs->machine,
+                                    shader->state.tokens,
+                                    PIPE_MAX_SAMPLERS,
+                                    NULL /*samplers*/ );
+      evs->machine_tokens = shader->state.tokens;
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
new file mode 100644
index 0000000000..534a23568d
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -0,0 +1,365 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#include "util/u_memory.h"
+#include "pipe/p_debug.h"
+#include "rtasm_ppc.h"
+
+
+void
+ppc_init_func(struct ppc_function *p, unsigned max_inst)
+{
+    p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
+    p->num_inst = 0;
+    p->max_inst = max_inst;
+    p->vec_used = ~0;
+}
+
+
+void
+ppc_release_func(struct ppc_function *p)
+{
+    assert(p->num_inst <= p->max_inst);
+    if (p->store != NULL) {
+        align_free(p->store);
+    }
+    p->store = NULL;
+}
+
+
+/**
+ * Alloate a vector register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p, int reg)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
+      const uint64_t mask = 1 << i;
+      if ((p->vec_used & mask) != 0) {
+         p->vec_used &= ~mask;
+         return i;
+      }
+   }
+
+   return -1;
+}
+
+
+/**
+ * Mark the given vector register as "unallocated".
+ */
+void
+ppc_release_vec_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_VEC_REGS);
+   assert((p->vec_used & (1 << reg)) == 0);
+
+   p->vec_used |= (1 << reg);
+}
+
+
+
+union vx_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned op2:11;
+   } inst;
+};
+
+union vxr_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned rC:1;
+      unsigned op2:10;
+   } inst;
+};
+
+union va_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned vC:5;
+      unsigned op2:6;
+   } inst;
+};
+
+
+static inline void
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vx_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+static inline void
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vxr_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.rC = 0;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+static inline void
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+{
+   union va_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.vC = vC;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 10, vD, vA, vB);
+}
+
+/** vector float substract */
+void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 74, vD, vA, vB);
+}
+
+/** vector float min */
+void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1098, vD, vA, vB);
+}
+
+/** vector float max */
+void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1034, vD, vA, vB);
+}
+
+/** vector float mult add */
+void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 46, vD, vA, vB, vC);
+}
+
+/** vector float compare greater than */
+void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 710, vD, vA, vB);
+}
+
+/** vector float compare greater than or equal to */
+void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 454, vD, vA, vB);
+}
+
+/** vector float compare equal */
+void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 198, vD, vA, vB);
+}
+
+/** vector float 2^x */
+void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 394, vD, 0, vB);
+}
+
+/** vector float log2(x) */
+void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 458, vD, 0, vB);
+}
+
+/** vector float reciprocol */
+void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 266, vD, 0, vB);
+}
+
+/** vector float reciprocol sqrt estimate */
+void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 330, vD, 0, vB);
+}
+
+/** vector float round to negative infinity */
+void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 714, vD, 0, vB);
+}
+
+/** vector float round to positive infinity */
+void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 650, vD, 0, vB);
+}
+
+/** vector float round to nearest int */
+void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 522, vD, 0, vB);
+}
+
+/** vector float round to int toward zero */
+void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 586, vD, 0, vB);
+}
+
+
+
+/**
+ ** bitwise operations
+ **/
+
+
+/** vector and */
+void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1028, vD, vA, vB);
+}
+
+/** vector and complement */
+void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1092, vD, vA, vB);
+}
+
+/** vector or */
+void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1156, vD, vA, vB);
+}
+
+/** vector nor */
+void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1284, vD, vA, vB);
+}
+
+/** vector xor */
+void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1220, vD, vA, vB);
+}
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 43, vD, vA, vB, vC);
+}
+
+/** vector select */
+void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 42, vD, vA, vB, vC);
+}
+
+/** vector splat byte */
+void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 42, vD, imm, vB);
+}
+
+/** vector splat half word */
+void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 588, vD, imm, vB);
+}
+
+/** vector splat word */
+void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 652, vD, imm, vB);
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
new file mode 100644
index 0000000000..ed14e943df
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -0,0 +1,181 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#ifndef RTASM_PPC_H
+#define RTASM_PPC_H
+
+
+#include "pipe/p_compiler.h"
+
+
+#define PPC_INST_SIZE 4  /**< 4 bytes / instruction */
+
+#define PPC_NUM_VEC_REGS 32
+
+
+struct ppc_function
+{
+   uint32_t *store;  /**< instruction buffer */
+   uint num_inst;
+   uint max_inst;
+   uint32_t vec_used;   /** used/free vector registers bitmask */
+   uint32_t reg_used;   /** used/free general-purpose registers bitmask */
+};
+
+
+
+extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
+extern void ppc_release_func(struct ppc_function *p);
+
+extern int ppc_allocate_vec_register(struct ppc_function *p, int reg);
+extern void ppc_release_vec_register(struct ppc_function *p, int reg);
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+extern void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB);
+
+/** vector float substract */
+extern void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float min */
+extern void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float max */
+extern void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float mult add */
+extern void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector float compare greater than */
+extern void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare greater than or equal to */
+extern void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare equal */
+extern void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float 2^x */
+extern void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float log2(x) */
+extern void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocol */
+extern void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocol sqrt estimate */
+extern void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to negative infinity */
+extern void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to positive infinity */
+extern void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to nearest int */
+extern void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to int toward zero */
+extern void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
+
+
+
+/**
+ ** bitwise operations
+ **/
+
+
+/** vector and */
+extern void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector and complement */
+extern void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector or */
+extern void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector nor */
+extern void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector xor */
+extern void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+extern void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector select */
+extern void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector splat byte */
+extern void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat half word */
+extern void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat word */
+extern void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+
+#endif /* RTASM_PPC_H */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a04cc6c4ff..491141f190 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -27,12 +27,16 @@
  * Real-time assembly generation interface for Cell B.E. SPEs.
  *
  * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
+
+#include <stdio.h>
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
 #include "rtasm_ppc_spe.h"
 
+
 #ifdef GALLIUM_CELL
 /**
  * SPE instruction types
@@ -143,8 +147,25 @@ union spe_inst_RI18 {
 /*@}*/
 
 
+static void
+indent(const struct spe_function *p)
+{
+   int i;
+   for (i = 0; i < p->indent; i++) {
+      putchar(' ');
+   }
+}
+
+
+static const char *
+rem_prefix(const char *longname)
+{
+   return longname + 4;
+}
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB)
+		    unsigned rA, unsigned rB, const char *name)
 {
     union spe_inst_RR inst;
     inst.inst.op = op;
@@ -153,11 +174,15 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB);
+    }
 }
 
 
 static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB, unsigned rC)
+                     unsigned rA, unsigned rB, unsigned rC, const char *name)
 {
     union spe_inst_RRR inst;
     inst.inst.op = op;
@@ -167,11 +192,15 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rC = rC;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, $%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB, rC);
+    }
 }
 
 
 static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI7 inst;
     inst.inst.op = op;
@@ -180,12 +209,16 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 
 static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI8 inst;
     inst.inst.op = op;
@@ -194,12 +227,16 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 
 static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
-		      unsigned rA, int imm)
+		      unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI10 inst;
     inst.inst.op = op;
@@ -208,11 +245,19 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       if (strcmp(name, "spe_lqd") == 0 ||
+           strcmp(name, "spe_stqd") == 0)
+          printf("%s\t$%d, 0x%x($%d)\n", rem_prefix(name), rT, imm, rA);
+       else
+          printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI16 inst;
     inst.inst.op = op;
@@ -220,11 +265,15 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+    }
 }
 
 
 static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI18 inst;
     inst.inst.op = op;
@@ -232,6 +281,10 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+    }
 }
 
 
@@ -240,61 +293,61 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
 { \
-    emit_RR(p, _op, rT, 0, 0); \
+   emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
 }
 
 #define EMIT_R(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA) \
 { \
-    emit_RR(p, _op, rT, rA, 0); \
+   emit_RR(p, _op, rT, rA, 0, __FUNCTION__);                 \
 }
 
 #define EMIT_RR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
 { \
-    emit_RR(p, _op, rT, rA, rB); \
+   emit_RR(p, _op, rT, rA, rB, __FUNCTION__);                \
 }
 
 #define EMIT_RRR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
 { \
-    emit_RRR(p, _op, rT, rA, rB, rC); \
+   emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__);           \
 }
 
 #define EMIT_RI7(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI7(p, _op, rT, rA, imm); \
+   emit_RI7(p, _op, rT, rA, imm, __FUNCTION__);              \
 }
 
 #define EMIT_RI8(_name, _op, bias) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI8(p, _op, rT, rA, bias - imm); \
+   emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__);       \
 }
 
 #define EMIT_RI10(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI10(p, _op, rT, rA, imm); \
+   emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI16(p, _op, rT, imm); \
+   emit_RI16(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_RI18(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI18(p, _op, rT, imm); \
+   emit_RI18(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_I16(_name, _op) \
 void _name (struct spe_function *p, int imm) \
 { \
-    emit_RI16(p, _op, 0, imm); \
+   emit_RI16(p, _op, 0, imm, __FUNCTION__);                  \
 }
 
 #include "rtasm_ppc_spe.h"
@@ -314,6 +367,9 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
      */
     p->regs[0] = ~7;
     p->regs[1] = (1U << (80 - 64)) - 1;
+
+    p->print = false;
+    p->indent = 0;
 }
 
 
@@ -327,8 +383,15 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+/** Return current code size in bytes. */
+unsigned spe_code_size(const struct spe_function *p)
+{
+   return p->num_inst * SPE_INST_SIZE;
+}
+
+
 /**
- * Alloate a SPE register.
+ * Allocate a SPE register.
  * \return register index or -1 if none left.
  */
 int spe_allocate_available_register(struct spe_function *p)
@@ -382,6 +445,32 @@ void spe_release_register(struct spe_function *p, int reg)
 }
 
 
+void
+spe_print_code(struct spe_function *p, boolean enable)
+{
+   p->print = enable;
+}
+
+
+void
+spe_indent(struct spe_function *p, int spaces)
+{
+   p->indent += spaces;
+}
+
+
+extern void
+spe_comment(struct spe_function *p, int rel_indent, const char *s)
+{
+   if (p->print) {
+      p->indent += rel_indent;
+      indent(p);
+      p->indent -= rel_indent;
+      printf("# %s\n", s);
+   }
+}
+
+
 /**
  * For branch instructions:
  * \param d  if 1, disable interupts if branch is taken
@@ -392,51 +481,51 @@ void spe_release_register(struct spe_function *p, int reg)
 /** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Interupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link on external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
 void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 
@@ -505,7 +594,28 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
    else {
       spe_ilhu(p, rT, i >> 16);
-      spe_iohl(p, rT, i & 0xffff);
+      if (i & 0xffff)
+         spe_iohl(p, rT, i & 0xffff);
+   }
+}
+
+void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+{
+   /* If the whole value is in the lower 18 bits, use ila, which
+    * doesn't sign-extend.  Otherwise, if the two halfwords of
+    * the constant are identical, use ilh.  Otherwise, we have
+    * to use ilhu followed by iohl.
+    */
+   if ((ui & 0xfffc0000) == ui) {
+      spe_ila(p, rT, ui);
+   }
+   else if ((ui >> 16) == (ui & 0xffff)) {
+      spe_ilh(p, rT, ui & 0xffff);
+   }
+   else {
+      spe_ilhu(p, rT, ui >> 16);
+      if (ui & 0xffff)
+         spe_iohl(p, rT, ui & 0xffff);
    }
 }
 
@@ -513,22 +623,29 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ila(p, rT, 66051);
+   /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
+   spe_ila(p, rT, 0x00010203);
    spe_shufb(p, rT, rA, rA, rT);
 }
 
 
 void
-spe_complement(struct spe_function *p, unsigned rT)
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_nor(p, rT, rT, rT);
+   spe_nor(p, rT, rA, rA);
 }
 
 
 void
 spe_move(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ori(p, rT, rA, 0);
+   /* Use different instructions depending on the instruction address
+    * to take advantage of the dual pipelines.
+    */
+   if (p->num_inst & 1)
+      spe_shlqbyi(p, rT, rA, 0);  /* odd pipe */
+   else
+      spe_ori(p, rT, rA, 0);  /* even pipe */
 }
 
 
@@ -539,4 +656,70 @@ spe_zero(struct spe_function *p, unsigned rT)
 }
 
 
+void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+{
+   assert(word >= 0);
+   assert(word <= 3);
+
+   if (word == 0) {
+      int tmp1 = rT;
+      spe_ila(p, tmp1, 66051);
+      spe_shufb(p, rT, rA, rA, tmp1);
+   }
+   else {
+      /* XXX review this, we may not need the rotqbyi instruction */
+      int tmp1 = rT;
+      int tmp2 = spe_allocate_available_register(p);
+
+      spe_ila(p, tmp1, 66051);
+      spe_rotqbyi(p, tmp2, rA, 4 * word);
+      spe_shufb(p, rT, tmp2, tmp2, tmp1);
+
+      spe_release_register(p, tmp2);
+   }
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the smaller of the
+ * two, compositing them into the rT register.
+ * 
+ * The Float Compare Greater Than (fcgt) instruction will put 1s into
+ * compare_reg where rA > rB, and 0s where rA <= rB.
+ *
+ * Then the Select Bits (selb) instruction will take bits from rA where
+ * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
+ * where rA <= rB and from rB where rB > rA, which is exactly the
+ * "min" operation.
+ *
+ * The compare_reg could in many cases be the same as rT, unless
+ * rT == rA || rt == rB.  But since this is common in constructions
+ * like "x = min(x, a)", we always allocate a new register to be safe.
+ */
+void 
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rA, rB, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the greater of the
+ * two, compositing them into the rT register.
+ * 
+ * The logic is similar to that of spe_float_min() above; the only
+ * difference is that the registers on spe_selb() have been reversed,
+ * so that the larger of the two is selected instead of the smaller.
+ */
+void 
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rB, rA, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d95e5aace3..4165a971a2 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -28,6 +28,7 @@
  * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
 #ifndef RTASM_PPC_SPE_H
@@ -63,15 +64,25 @@ struct spe_function
      * spe_release_register
      */
     uint64_t regs[SPE_NUM_REGS / 64];
+
+    boolean print; /**< print/dump instructions as they're emitted? */
+    int indent;    /**< number of spaces to indent */
 };
 
+
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
+extern unsigned spe_code_size(const struct spe_function *p);
 
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
 
+extern void spe_print_code(struct spe_function *p, boolean enable);
+extern void spe_indent(struct spe_function *p, int spaces);
+extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
+
+
 #endif /* RTASM_PPC_SPE_H */
 
 #ifndef EMIT_
@@ -292,13 +303,17 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Load/splat immediate unsigned int into rT. */
+extern void
+spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
 
-/** Complement/invert all bits in rT. */
+/** rT = complement_all_bits(rA). */
 extern void
-spe_complement(struct spe_function *p, unsigned rT);
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
 
 /** rT = rA. */
 extern void
@@ -308,6 +323,18 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA);
 extern void
 spe_zero(struct spe_function *p, unsigned rT);
 
+/** rT = splat(rA, word) */
+extern void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+
+/** rT = float min(rA, rB) */
+extern void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
+/** rT = float max(rA, rB) */
+extern void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
 
 /* Floating-point instructions
  */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 74614d3688..38fcaf8829 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -793,10 +793,14 @@ tgsi_default_instruction_ext_nv( void )
    return instruction_ext_nv;
 }
 
-union token_u32
+
+/** test for inequality of 32-bit values pointed to by a and b */
+static INLINE boolean
+compare32(const void *a, const void *b)
 {
-   unsigned u32;
-};
+   return *((uint32_t *) a) != *((uint32_t *) b);
+}
+
 
 unsigned
 tgsi_compare_instruction_ext_nv(
@@ -805,7 +809,7 @@ tgsi_compare_instruction_ext_nv(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_nv
@@ -864,7 +868,7 @@ tgsi_compare_instruction_ext_label(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_label
@@ -905,7 +909,7 @@ tgsi_compare_instruction_ext_texture(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_texture
@@ -1027,7 +1031,7 @@ tgsi_compare_src_register_ext_swz(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_swz
@@ -1095,7 +1099,7 @@ tgsi_compare_src_register_ext_mod(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_mod
@@ -1241,7 +1245,7 @@ tgsi_compare_dst_register_ext_concode(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_concode
@@ -1299,7 +1303,7 @@ tgsi_compare_dst_register_ext_modulate(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_modulate
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index df002939c6..1a5294eabc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1674,6 +1674,7 @@ exec_declaration(
             break;
 
          default:
+            eval = NULL;
             assert( 0 );
          }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 3757486ba9..2cd56e413a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -88,16 +88,33 @@ tgsi_parse_end_of_tokens(
       1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
 }
 
+
+/**
+ * This function is used to avoid and work-around type punning/aliasing
+ * warnings.  The warnings seem harmless on x86 but on PPC they cause
+ * real failures.
+ */
+static INLINE void
+copy_token(void *dst, const void *src)
+{
+   memcpy(dst, src, 4);
+}
+
+
+/**
+ * Get next 4-byte token, return it at address specified by 'token'
+ */
 static void
 next_token(
    struct tgsi_parse_context *ctx,
    void *token )
 {
    assert( !tgsi_parse_end_of_tokens( ctx ) );
-
-   *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
+   copy_token(token, &ctx->Tokens[ctx->Position]);
+   ctx->Position++;
 }
 
+
 void
 tgsi_parse_token(
    struct tgsi_parse_context *ctx )
@@ -116,7 +133,7 @@ tgsi_parse_token(
       struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
 
       *decl = tgsi_default_full_declaration();
-      decl->Declaration = *(struct tgsi_declaration *) &token;
+      copy_token(&decl->Declaration, &token);
 
       next_token( ctx, &decl->DeclarationRange );
 
@@ -132,8 +149,7 @@ tgsi_parse_token(
       struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
 
       *imm = tgsi_default_full_immediate();
-      imm->Immediate = *(struct tgsi_immediate *) &token;
-
+      copy_token(&imm->Immediate, &token);
       assert( !imm->Immediate.Extended );
 
       switch (imm->Immediate.DataType) {
@@ -158,8 +174,7 @@ tgsi_parse_token(
       unsigned extended;
 
       *inst = tgsi_default_full_instruction();
-      inst->Instruction = *(struct tgsi_instruction *) &token;
-
+      copy_token(&inst->Instruction, &token);
       extended = inst->Instruction.Extended;
 
       while( extended ) {
@@ -169,18 +184,15 @@ tgsi_parse_token(
 
          switch( token.Type ) {
          case TGSI_INSTRUCTION_EXT_TYPE_NV:
-            inst->InstructionExtNv =
-               *(struct tgsi_instruction_ext_nv *) &token;
+            copy_token(&inst->InstructionExtNv, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
-            inst->InstructionExtLabel =
-               *(struct tgsi_instruction_ext_label *) &token;
+            copy_token(&inst->InstructionExtLabel, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
-            inst->InstructionExtTexture =
-               *(struct tgsi_instruction_ext_texture *) &token;
+            copy_token(&inst->InstructionExtTexture, &token);
             break;
 
          default:
@@ -212,13 +224,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
-               inst->FullDstRegisters[i].DstRegisterExtConcode =
-                  *(struct tgsi_dst_register_ext_concode *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode,
+                          &token);
                break;
 
             case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
-               inst->FullDstRegisters[i].DstRegisterExtModulate =
-                  *(struct tgsi_dst_register_ext_modulate *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate,
+                          &token);
                break;
 
             default:
@@ -245,13 +257,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
-               inst->FullSrcRegisters[i].SrcRegisterExtSwz =
-                  *(struct tgsi_src_register_ext_swz *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz,
+                          &token);
                break;
 
             case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
-               inst->FullSrcRegisters[i].SrcRegisterExtMod =
-                  *(struct tgsi_src_register_ext_mod *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod,
+                          &token);
                break;
 
             default:
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 9adf72944e..d28201ac8d 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -104,6 +104,7 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
    ctx->rasterizer.bypass_clipping = 1;
    /*ctx->rasterizer.bypass_vs = 1;*/
+   ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* samplers */
    memset(&ctx->sampler, 0, sizeof(ctx->sampler));
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index b19a649bbc..9d305ad763 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -725,6 +725,7 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
    ctx->rasterizer.bypass_clipping = 1;
    /*ctx->rasterizer.bypass_vs = 1;*/
+   ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* sampler state */
    memset(&ctx->sampler, 0, sizeof(ctx->sampler));
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index cb0631baf5..f0ff96eb47 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -104,11 +104,11 @@
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
 
-
-#define CELL_DEBUG_CHECKER  (1 << 0)
-#define CELL_DEBUG_SYNC     (1 << 1)
-
-
+#define CELL_DEBUG_CHECKER              (1 << 0)
+#define CELL_DEBUG_ASM                  (1 << 1)
+#define CELL_DEBUG_SYNC                 (1 << 2)
+#define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
+#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 71f1a3049d..62e213ea35 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -85,13 +85,14 @@ cell_draw_create(struct cell_context *cell)
 }
 
 
-#ifdef DEBUG
 static const struct debug_named_value cell_debug_flags[] = {
    {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+   {"asm", CELL_DEBUG_ASM},        /**< dump SPU asm code */
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
+   {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
+   {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
    {NULL, 0}
 };
-#endif
 
 
 struct pipe_context *
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 6ffe94eb14..1bc803d590 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -51,9 +51,13 @@
 #include "cell_gen_fp.h"
 
 
-/** Set to 1 to enable debug/disassembly printfs */
-#define DISASSEM 01
+#define MAX_TEMPS 16
+#define MAX_IMMED  8
 
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
 
 /**
  * Context needed during code generation.
@@ -63,13 +67,21 @@ struct codegen
    int inputs_reg;      /**< 1st function parameter */
    int outputs_reg;     /**< 2nd function parameter */
    int constants_reg;   /**< 3rd function parameter */
-   int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */
+   int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
+   int imm_regs[MAX_IMMED][4];  /**< maps TGSI immediates to SPE registers */
+
+   int num_imm;  /**< number of immediates */
 
    int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */
 
    /** Per-instruction temps / intermediate temps */
    int num_itemps;
-   int itemps[3];
+   int itemps[10];
+
+   /** Current IF/ELSE/ENDIF nesting level */
+   int if_nesting;
+   /** Index of execution mask register */
+   int exec_mask_reg;
 
    struct spe_function *f;
    boolean error;
@@ -112,19 +124,47 @@ get_const_one_reg(struct codegen *gen)
 {
    if (gen->one_reg <= 0) {
       gen->one_reg = spe_allocate_available_register(gen->f);
-   }
 
-   /* one = {1.0, 1.0, 1.0, 1.0} */
-   spe_load_float(gen->f, gen->one_reg, 1.0f);
-#if DISASSEM
-   printf("il\tr%d, 1.0f\n", gen->one_reg);
-#endif
+      spe_indent(gen->f, 4);
+      spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
+
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(gen->f, gen->one_reg, 1.0f);
+
+      spe_indent(gen->f, -4);
+   }
 
    return gen->one_reg;
 }
 
 
 /**
+ * Return index of the pixel execution mask.
+ * The register is allocated an initialized upon the first call.
+ *
+ * The pixel execution mask controls which pixels in a quad are
+ * modified, according to surrounding conditionals, loops, etc.
+ */
+static int
+get_exec_mask_reg(struct codegen *gen)
+{
+   if (gen->exec_mask_reg <= 0) {
+      gen->exec_mask_reg = spe_allocate_available_register(gen->f);
+
+      spe_indent(gen->f, 4);
+      spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
+
+      /* exec_mask = {~0, ~0, ~0, ~0} */
+      spe_load_int(gen->f, gen->exec_mask_reg, ~0);
+
+      spe_indent(gen->f, -4);
+   }
+
+   return gen->exec_mask_reg;
+}
+
+
+/**
  * Return the index of the SPU temporary containing the named TGSI
  * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we
  * just return the corresponding SPE register.  If the TGIS register
@@ -136,12 +176,15 @@ get_src_reg(struct codegen *gen,
             int channel,
             const struct tgsi_full_src_register *src)
 {
-   int reg;
+   int reg = -1;
+   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   boolean reg_is_itemp = FALSE;
+   uint sign_op;
 
-   /* XXX need to examine src swizzle info here.
-    * That will involve changing the channel var...
-    */
+   assert(swizzle >= 0);
+   assert(swizzle <= 3);
 
+   channel = swizzle;
 
    switch (src->SrcRegister.File) {
    case TGSI_FILE_TEMPORARY:
@@ -152,21 +195,57 @@ get_src_reg(struct codegen *gen,
          /* offset is measured in quadwords, not bytes */
          int offset = src->SrcRegister.Index * 4 + channel;
          reg = get_itemp(gen);
+         reg_is_itemp = TRUE;
          /* Load:  reg = memory[(machine_reg) + offset] */
          spe_lqd(gen->f, reg, gen->inputs_reg, offset);
-#if DISASSEM
-         printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
-#endif
       }
       break;
    case TGSI_FILE_IMMEDIATE:
-      /* xxx fall-through for now / fix */
+      reg = gen->imm_regs[src->SrcRegister.Index][channel];
+      break;
    case TGSI_FILE_CONSTANT:
       /* xxx fall-through for now / fix */
    default:
       assert(0);
    }
 
+   /*
+    * Handle absolute value, negate or set-negative of src register.
+    */
+   sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+   if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+      /*
+       * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
+       */
+      const int bit31mask_reg = get_itemp(gen);
+      int result_reg;
+
+      if (reg_is_itemp) {
+         /* re-use 'reg' for the result */
+         result_reg = reg;
+      }
+      else {
+         /* alloc a new reg for the result */
+         result_reg = get_itemp(gen);
+      }
+
+      /* mask with bit 31 set, the rest cleared */
+      spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+
+      if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
+         spe_andc(gen->f, result_reg, reg, bit31mask_reg);
+      }
+      else if (sign_op == TGSI_UTIL_SIGN_SET) {
+         spe_and(gen->f, result_reg, reg, bit31mask_reg);
+      }
+      else {
+         assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
+         spe_xor(gen->f, result_reg, reg, bit31mask_reg);
+      }
+
+      reg = result_reg;
+   }
+
    return reg;
 }
 
@@ -183,11 +262,14 @@ get_dst_reg(struct codegen *gen,
             int channel,
             const struct tgsi_full_dst_register *dest)
 {
-   int reg;
+   int reg = -1;
 
    switch (dest->DstRegister.File) {
    case TGSI_FILE_TEMPORARY:
-      reg = gen->temp_regs[dest->DstRegister.Index][channel];
+      if (gen->if_nesting > 0)
+         reg = get_itemp(gen);
+      else
+         reg = gen->temp_regs[dest->DstRegister.Index][channel];
       break;
    case TGSI_FILE_OUTPUT:
       reg = get_itemp(gen);
@@ -213,17 +295,40 @@ store_dest_reg(struct codegen *gen,
 {
    switch (dest->DstRegister.File) {
    case TGSI_FILE_TEMPORARY:
-      /* no-op */
+      if (gen->if_nesting > 0) {
+         int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
+         int exec_reg = get_exec_mask_reg(gen);
+         /* Mix d with new value according to exec mask:
+          * d[i] = mask_reg[i] ? value_reg : d_reg
+          */
+         spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
+      }
+      else {
+         /* we're not inside a condition or loop: do nothing special */
+      }
       break;
    case TGSI_FILE_OUTPUT:
       {
          /* offset is measured in quadwords, not bytes */
          int offset = dest->DstRegister.Index * 4 + channel;
-         /* Store: memory[(machine_reg) + offset] = reg */
-         spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
-#if DISASSEM
-         printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset);
-#endif
+         if (gen->if_nesting > 0) {
+            int exec_reg = get_exec_mask_reg(gen);
+            int curval_reg = get_itemp(gen);
+            /* First read the current value from memory:
+             * Load:  curval = memory[(machine_reg) + offset]
+             */
+            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            /* Mix curval with newvalue according to exec mask:
+             * d[i] = mask_reg[i] ? value_reg : d_reg
+             */
+            spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
+            /* Store: memory[(machine_reg) + offset] = curval */
+            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
+         }
+         else {
+            /* Store: memory[(machine_reg) + offset] = reg */
+            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+         }
       }
       break;
    default:
@@ -236,15 +341,13 @@ static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+   spe_comment(gen->f, -4, "MOV:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
          int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
          /* XXX we don't always need to actually emit a mov instruction here */
          spe_move(gen->f, dst_reg, src_reg);
-#if DISASSEM
-         printf("mov\tr%d, r%d\n", dst_reg, src_reg);
-#endif
          store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -253,6 +356,7 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+
 /**
  * Emit addition instructions.  Recall that a single TGSI_OPCODE_ADD
  * becomes (up to) four SPU "fa" instructions because we're doing SOA
@@ -262,6 +366,7 @@ static boolean
 emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+   spe_comment(gen->f, -4, "ADD:");
    /* Loop over Red/Green/Blue/Alpha channels */
    for (ch = 0; ch < 4; ch++) {
       /* If the dest R, G, B or A writemask is enabled... */
@@ -273,9 +378,6 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
          /* Emit actual SPE instruction: d = s1 + s2 */
          spe_fa(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
-         printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
 
          /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
@@ -286,6 +388,82 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit subtract.  See emit_ADD for comments.
+ */
+static boolean
+emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "SUB:");
+   /* Loop over Red/Green/Blue/Alpha channels */
+   for (ch = 0; ch < 4; ch++) {
+      /* If the dest R, G, B or A writemask is enabled... */
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* get indexes of the two src, one dest SPE registers */
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* Emit actual SPE instruction: d = s1 - s2 */
+         spe_fs(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         /* Free any intermediate temps we allocated */
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit multiply add.  See emit_ADD for comments.
+ */
+static boolean
+emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "MAD:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = s1 * s2 + s3 */
+         spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit linear interpolate.  See emit_ADD for comments.
+ */
+static boolean
+emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "LERP:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = s3 + s1(s2 - s3) */
+         spe_fs(gen->f, d_reg, s2_reg, s3_reg);
+         spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
 
 /**
  * Emit multiply.  See emit_ADD for comments.
@@ -294,6 +472,7 @@ static boolean
 emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+   spe_comment(gen->f, -4, "MUL:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -301,9 +480,6 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
          /* d = s1 * s2 */
          spe_fm(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
-         printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -311,6 +487,151 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit reciprocal.  See emit_ADD for comments.
+ */
+static boolean
+emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "RCP:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = 1/s1 */
+         spe_frest(gen->f, d_reg, s1_reg);
+         spe_fi(gen->f, d_reg, s1_reg, d_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit reciprocal sqrt.  See emit_ADD for comments.
+ */
+static boolean
+emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "RSQ:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = 1/s1 */
+         spe_frsqest(gen->f, d_reg, s1_reg);
+         spe_fi(gen->f, d_reg, s1_reg, d_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit absolute value.  See emit_ADD for comments.
+ */
+static boolean
+emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "ABS:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         const int bit31mask_reg = get_itemp(gen);
+
+         /* mask with bit 31 set, the rest cleared */  
+         spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+
+         /* d = sign bit cleared in s1 */
+         spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit 3 component dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "DP3:");
+
+   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
+   /* d = x * x */
+   spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* d = y * y + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* d = z * z + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit 4 component dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "DP3:");
+
+   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
+   /* d = x * x */
+   spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* d = y * y + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* d = z * z + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+   /* d = w * w + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
 
 /**
  * Emit set-if-greater-than.
@@ -323,6 +644,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
 
+   spe_comment(gen->f, -4, "SGT:");
+
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -331,16 +654,10 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
          /* d = (s1 > s2) */
          spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
-         printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
 
          /* convert d from 0x0/0xffffffff to 0.0/1.0 */
          /* d = d & one_reg */
          spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-#if DISASSEM
-         printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
-#endif
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -350,6 +667,421 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit set-if_less-then.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SLT:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 < s2) */
+         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_greater-then-or-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SGE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 >= s2) */
+         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = ~d & one_reg */
+         spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_less-then-or-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SLE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 <= s2) */
+         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = ~d & one_reg */
+         spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SEQ:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 == s2) */
+         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_not_equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SNE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 != s2) */
+         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+         spe_nor(gen->f, d_reg, d_reg, d_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit compare.  See emit_SGT for comments.
+ */
+static boolean
+emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "CMP:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int zero_reg = get_itemp(gen);
+   
+         spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+         /* d = (s1 < 0) ? s2 : s3 */
+         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit floor.  
+ * If negative int subtract one
+ * Convert float to signed int
+ * Convert signed int to float
+ */
+static boolean
+emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FLR:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
+
+         /* If negative, subtract 1.0 */
+         spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg);
+         spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg);
+         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, d_reg, d_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, d_reg, 0);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit frac.  
+ * Input - FLR(Input)
+ */
+static boolean
+emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FLR:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
+
+         /* If negative, subtract 1.0 */
+         spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg);
+         spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg);
+         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, d_reg, d_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, d_reg, 0);
+
+         /* d = s1 - FLR(s1) */
+         spe_fs(gen->f, d_reg, s1_reg, d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+/**
+ * Emit max.  See emit_SGT for comments.
+ */
+static boolean
+emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "MAX:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 > s2) ? s1 : s2 */
+         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit max.  See emit_SGT for comments.
+ */
+static boolean
+emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "MIN:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s2 > s1) ? s1 : s2 */
+         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+static boolean
+emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int channel = 0;
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "IF:");
+
+   /* update execution mask with the predicate register */
+   int tmp_reg = get_itemp(gen);
+   int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
+
+   /* tmp = (s1_reg == 0) */
+   spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
+   /* tmp = !tmp */
+   spe_complement(gen->f, tmp_reg, tmp_reg);
+   /* exec_mask = exec_mask & tmp */
+   spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
+
+   gen->if_nesting++;
+
+   free_itemps(gen);
+
+   return true;
+}
+
+
+static boolean
+emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "ELSE:");
+
+   /* exec_mask = !exec_mask */
+   spe_complement(gen->f, exec_reg, exec_reg);
+
+   return true;
+}
+
+
+static boolean
+emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "ENDIF:");
+
+   /* XXX todo: pop execution mask */
+
+   spe_load_int(gen->f, exec_reg, ~0x0);
+
+   gen->if_nesting--;
+   return true;
+}
+
+
+static boolean
+emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
+             boolean ddx)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         int t1_reg = get_itemp(gen);
+         int t2_reg = get_itemp(gen);
+
+         spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
+         if (ddx) {
+            spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
+         }
+         else {
+            spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
+         }
+         spe_fs(gen->f, d_reg, t2_reg, t1_reg);
+
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+
 
 /**
  * Emit END instruction.
@@ -361,11 +1093,9 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_END(struct codegen *gen)
 {
+   spe_comment(gen->f, -4, "END:");
    /* return from function call */
    spe_bi(gen->f, SPE_REG_RA, 0, 0);
-#if DISASSEM
-   printf("bi\trRA\n");
-#endif
    return true;
 }
 
@@ -384,14 +1114,64 @@ emit_instruction(struct codegen *gen,
       return emit_MUL(gen, inst);
    case TGSI_OPCODE_ADD:
       return emit_ADD(gen, inst);
+   case TGSI_OPCODE_SUB:
+      return emit_SUB(gen, inst);
+   case TGSI_OPCODE_MAD:
+      return emit_MAD(gen, inst);
+   case TGSI_OPCODE_LERP:
+      return emit_LERP(gen, inst);
+   case TGSI_OPCODE_DP3:
+      return emit_DP3(gen, inst);
+   case TGSI_OPCODE_DP4:
+      return emit_DP4(gen, inst);
+   case TGSI_OPCODE_RCP:
+      return emit_RCP(gen, inst);
+   case TGSI_OPCODE_RSQ:
+      return emit_RSQ(gen, inst);
+   case TGSI_OPCODE_ABS:
+      return emit_ABS(gen, inst);
    case TGSI_OPCODE_SGT:
       return emit_SGT(gen, inst);
+   case TGSI_OPCODE_SLT:
+      return emit_SLT(gen, inst);
+   case TGSI_OPCODE_SGE:
+      return emit_SGE(gen, inst);
+   case TGSI_OPCODE_SLE:
+      return emit_SLE(gen, inst);
+   case TGSI_OPCODE_SEQ:
+      return emit_SEQ(gen, inst);
+   case TGSI_OPCODE_SNE:
+      return emit_SNE(gen, inst);
+   case TGSI_OPCODE_CMP:
+      return emit_CMP(gen, inst);
+   case TGSI_OPCODE_MAX:
+      return emit_MAX(gen, inst);
+   case TGSI_OPCODE_MIN:
+      return emit_MIN(gen, inst);
+   case TGSI_OPCODE_FLR:
+      return emit_FLR(gen, inst);
+   case TGSI_OPCODE_FRC:
+      return emit_FRC(gen, inst);
    case TGSI_OPCODE_END:
       return emit_END(gen);
 
+   case TGSI_OPCODE_IF:
+      return emit_IF(gen, inst);
+   case TGSI_OPCODE_ELSE:
+      return emit_ELSE(gen, inst);
+   case TGSI_OPCODE_ENDIF:
+      return emit_ENDIF(gen, inst);
+
+   case TGSI_OPCODE_DDX:
+      return emit_DDX_DDY(gen, inst, true);
+   case TGSI_OPCODE_DDY:
+      return emit_DDX_DDY(gen, inst, false);
+
    /* XXX lots more cases to do... */
 
    default:
+      fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
+              inst->Instruction.Opcode);
       return false;
    }
 
@@ -401,45 +1181,88 @@ emit_instruction(struct codegen *gen,
 
 
 /**
+ * Emit code for a TGSI immediate value (vector of four floats).
+ * This involves register allocation and initialization.
+ * XXX the initialization should be done by a "prepare" stage, not
+ * per quad execution!
+ */
+static boolean
+emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
+{
+   int ch;
+
+   assert(gen->num_imm < MAX_TEMPS);
+
+   spe_comment(gen->f, -4, "IMMEDIATE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      float val = immed->u.ImmediateFloat32[ch].Float;
+      int reg = spe_allocate_available_register(gen->f);
+
+      if (reg < 0)
+         return false;
+
+      /* update immediate map */
+      gen->imm_regs[gen->num_imm][ch] = reg;
+
+      /* emit initializer instruction */
+      spe_load_float(gen->f, reg, val);
+   }
+
+   gen->num_imm++;
+
+   return true;
+}
+
+
+
+/**
  * Emit "code" for a TGSI declaration.
  * We only care about TGSI TEMPORARY register declarations at this time.
  * For each TGSI TEMPORARY we allocate four SPE registers.
  */
-static void
-emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
+static boolean
+emit_declaration(struct cell_context *cell,
+                 struct codegen *gen, const struct tgsi_full_declaration *decl)
 {
    int i, ch;
 
    switch (decl->Declaration.File) {
    case TGSI_FILE_TEMPORARY:
-#if DISASSEM
-      printf("Declare temp reg %d .. %d\n",
-             decl->DeclarationRange.First,
-             decl->DeclarationRange.Last);
-#endif
+      if (cell->debug_flags & CELL_DEBUG_ASM) {
+         printf("Declare temp reg %d .. %d\n",
+                decl->DeclarationRange.First,
+                decl->DeclarationRange.Last);
+      }
+
       for (i = decl->DeclarationRange.First;
            i <= decl->DeclarationRange.Last;
            i++) {
+         assert(i < MAX_TEMPS);
          for (ch = 0; ch < 4; ch++) {
             gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+            if (gen->temp_regs[i][ch] < 0)
+               return false; /* out of regs */
          }
 
          /* XXX if we run out of SPE registers, we need to spill
           * to SPU memory.  someday...
           */
 
-#if DISASSEM
-         printf("  SPE regs: %d %d %d %d\n",
-                gen->temp_regs[i][0],
-                gen->temp_regs[i][1],
-                gen->temp_regs[i][2],
-                gen->temp_regs[i][3]);
-#endif
+         if (cell->debug_flags & CELL_DEBUG_ASM) {
+            printf("  SPE regs: %d %d %d %d\n",
+                   gen->temp_regs[i][0],
+                   gen->temp_regs[i][1],
+                   gen->temp_regs[i][2],
+                   gen->temp_regs[i][3]);
+         }
       }
       break;
    default:
       ; /* ignore */
    }
+
+   return true;
 }
 
 
@@ -472,10 +1295,12 @@ cell_gen_fragment_program(struct cell_context *cell,
    spe_allocate_register(f, gen.outputs_reg);
    spe_allocate_register(f, gen.constants_reg);
 
-#if DISASSEM
-   printf("Begin %s\n", __FUNCTION__);
-   tgsi_dump(tokens, 0);
-#endif
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, true);
+      spe_indent(f, 8);
+      printf("Begin %s\n", __FUNCTION__);
+      tgsi_dump(tokens, 0);
+   }
 
    tgsi_parse_init(&parse, tokens);
 
@@ -484,25 +1309,22 @@ cell_gen_fragment_program(struct cell_context *cell,
 
       switch (parse.FullToken.Token.Type) {
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-#if 0
-         if (!note_immediate(&gen, &parse.FullToken.FullImmediate ))
-            goto fail;
-#endif
+         if (!emit_immediate(&gen,  &parse.FullToken.FullImmediate))
+            gen.error = true;
          break;
 
       case TGSI_TOKEN_TYPE_DECLARATION:
-         emit_declaration(&gen, &parse.FullToken.FullDeclaration);
+         if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
+            gen.error = true;
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) {
+         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
             gen.error = true;
-         }
          break;
 
       default:
          assert(0);
-
       }
    }
 
@@ -512,10 +1334,10 @@ cell_gen_fragment_program(struct cell_context *cell,
       return emit_END(&gen);
    }
 
-#if DISASSEM
-   printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
-   printf("End %s\n", __FUNCTION__);
-#endif
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+      printf("End %s\n", __FUNCTION__);
+   }
 
    tgsi_parse_free( &parse );
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 06219d4e98..1837b4c79b 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -60,6 +60,9 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
                struct spe_function *f,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
+   /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
+    * quantities.  This only makes a difference for 32-bit Z values though.
+    */
    ASSERT(dsa->depth.enabled);
 
    switch (dsa->depth.func) {
@@ -79,28 +82,28 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
 
    case PIPE_FUNC_GREATER:
       /* zmask = (ifragZ > ref) */
-      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
       /* mask = (mask & zmask) */
       spe_and(f, mask_reg, mask_reg, zmask_reg);
       break;
 
    case PIPE_FUNC_LESS:
       /* zmask = (ref > ifragZ) */
-      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
       /* mask = (mask & zmask) */
       spe_and(f, mask_reg, mask_reg, zmask_reg);
       break;
 
    case PIPE_FUNC_LEQUAL:
       /* zmask = (ifragZ > ref) */
-      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
       /* mask = (mask & ~zmask) */
       spe_andc(f, mask_reg, mask_reg, zmask_reg);
       break;
 
    case PIPE_FUNC_GEQUAL:
       /* zmask = (ref > ifragZ) */
-      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
       /* mask = (mask & ~zmask) */
       spe_andc(f, mask_reg, mask_reg, zmask_reg);
       break;
@@ -229,7 +232,27 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    spe_release_register(f, amask_reg);
 }
 
+/* This pair of functions is used inline to allocate and deallocate
+ * optional constant registers.  Once a constant is discovered to be 
+ * needed, we will likely need it again, so we don't want to deallocate
+ * it and have to allocate and load it again unnecessarily.
+ */
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   *r = spe_allocate_available_register(f);
+   spe_load_float(f, *r, value);
+   *is_already_set = true;
+}
 
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    if (!*is_already_set) return;
+    spe_release_register(f, r);
+    *is_already_set = false;
+}
 
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
@@ -242,6 +265,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  */
 static void
 gen_blend(const struct pipe_blend_state *blend,
+          const struct pipe_blend_color *blend_color,
           struct spe_function *f,
           enum pipe_format color_format,
           int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
@@ -262,10 +286,17 @@ gen_blend(const struct pipe_blend_state *blend,
    int fbB_reg = spe_allocate_available_register(f);
    int fbA_reg = spe_allocate_available_register(f);
 
-   int one_reg = spe_allocate_available_register(f);
    int tmp_reg = spe_allocate_available_register(f);
 
-   boolean one_reg_set = false; /* avoid setting one_reg more than once */
+   /* Optional constant registers we might or might not end up using;
+    * if we do use them, make sure we only allocate them once by
+    * keeping a flag on each one.
+    */
+   boolean one_reg_set = false;
+   unsigned int one_reg;
+   boolean constR_reg_set = false, constG_reg_set = false, 
+      constB_reg_set = false, constA_reg_set = false;
+   unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
 
    ASSERT(blend->blend_enable);
 
@@ -346,127 +377,449 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_release_register(f, mask_reg);
    }
 
-
    /*
-    * Compute Src RGB terms
+    * Compute Src RGB terms.  We're actually looking for the value
+    * of (the appropriate RGB factors) * (the incoming source RGB color),
+    * because in some cases (like PIPE_BLENDFACTOR_ONE and 
+    * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
     */
    switch (blend->rgb_src_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (R,G,B) */
       spe_move(f, term1R_reg, fragR_reg);
       spe_move(f, term1G_reg, fragG_reg);
       spe_move(f, term1B_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term1R_reg);
-      spe_zero(f, term1G_reg);
-      spe_zero(f, term1B_reg);
+      /* factors = (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term1R_reg, 0.0f);
+      spe_load_float(f, term1G_reg, 0.0f);
+      spe_load_float(f, term1B_reg, 0.0f);
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
       spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
       spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
       spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
       spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
       spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
       spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) 
+       * or in other words term = (R-R*R, G-G*G, B-B*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+       * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+       * or term = (R-R*A,G-G*A,B-B*A)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) 
+       * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+      spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+      spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) 
+       * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* We'll need the optional {1,1,1,1} register */
+      setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
+      /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
+       * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+       * as long as a is positive), but then we'd have to do three
+       * spe_float_min() functions instead of one, so this is simpler.
+       */
+      /* tmp = 1 - Afb */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* tmp = min(A,tmp) */
+      spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
+      /* term = R*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Src Alpha term
+    * Compute Src Alpha term.  Like the above, we're looking for
+    * the full term A*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term1A_reg, 0.0f);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = A */
       spe_move(f, term1A_reg, fragA_reg);
       break;
+
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = A*A */
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
-      /* XXX more cases */
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = A*(1-A) = A-A*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = A*Afb */
+      spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = A*Ac */
+      spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest RGB terms
+    * Compute Dest RGB term.  Like the above, we're looking for
+    * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->rgb_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
       spe_move(f, term2R_reg, fbR_reg);
       spe_move(f, term2G_reg, fbG_reg);
       spe_move(f, term2B_reg, fbB_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2R_reg);
-      spe_zero(f, term2G_reg);
-      spe_zero(f, term2B_reg);
+      /* factor s= (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term2R_reg, 0.0f);
+      spe_load_float(f, term2G_reg, 0.0f);
+      spe_load_float(f, term2B_reg, 0.0f);
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
       spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
       break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) 
+       * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+      break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
       spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* term = fb * tmp */
-      spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
-      spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
-      spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
-      break;
-      /* XXX more cases */
+      /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+       * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) 
+       * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+      spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+      spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) 
+       * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest Alpha term
+    * Compute Dest Alpha term.  Like the above, we're looking for
+    * the full term Afb*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = Afb */
       spe_move(f, term2A_reg, fbA_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2A_reg);
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term2A_reg, 0.0f);
       break;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = Afb*A */
       spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
       break;
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* termA = fbA * tmp */
-      spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = Afb*Afb */
+      spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = Afb*Ac */
+      spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
       break;
-      /* XXX more cases */
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Combine Src/Dest RGB terms
+    * Combine Src/Dest RGB terms as per the blend equation.
     */
    switch (blend->rgb_func) {
    case PIPE_BLEND_ADD:
@@ -479,7 +832,21 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
       spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
+      spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
+      spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_MAX:
+      spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
    default:
       ASSERT(0);
    }
@@ -494,7 +861,15 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLEND_SUBTRACT:
       spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_MAX:
+      spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
    default:
       ASSERT(0);
    }
@@ -514,8 +889,14 @@ gen_blend(const struct pipe_blend_state *blend,
    spe_release_register(f, fbB_reg);
    spe_release_register(f, fbA_reg);
 
-   spe_release_register(f, one_reg);
    spe_release_register(f, tmp_reg);
+
+   /* Free any optional registers that actually got used */
+   release_const_register(f, &one_reg_set, one_reg);
+   release_const_register(f, &constR_reg_set, constR_reg);
+   release_const_register(f, &constG_reg_set, constG_reg);
+   release_const_register(f, &constB_reg_set, constB_reg);
+   release_const_register(f, &constA_reg_set, constA_reg);
 }
 
 
@@ -524,8 +905,69 @@ gen_logicop(const struct pipe_blend_state *blend,
             struct spe_function *f,
             int fragRGBA_reg, int fbRGBA_reg)
 {
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+   ASSERT(blend->logicop_enable);
+
+   switch(blend->logicop_func) {
+      case PIPE_LOGICOP_CLEAR: /* 0 */
+         spe_zero(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOR: /* ~(s | d) */
+         spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_INVERT: /* ~d */
+         /* Note that (A nor A) == ~(A|A) == ~A */
+         spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_XOR: /* s ^ d */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NAND: /* ~(s & d) */
+         spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND: /* s & d */
+         spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOOP: /* d */
+         spe_move(f, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY: /* s */
+         break;
+      case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR: /* s | d */
+         spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_SET: /* 1 */
+         spe_load_int(f, fragRGBA_reg, 0xffffffff);
+         break;
+      default:
+         ASSERT(0);
+   }
 }
 
 
@@ -534,14 +976,84 @@ gen_colormask(uint colormask,
               struct spe_function *f,
               int fragRGBA_reg, int fbRGBA_reg)
 {
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
-}
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+
+   /* The color mask operation can prevent any set of color
+    * components in the incoming fragment from being written to the frame 
+    * buffer; we do this by replacing the masked components of the 
+    * fragment with the frame buffer values.
+    *
+    * There are only 16 possibilities, with a unique mask for
+    * each of the possibilities.  (Technically, there are only 15
+    * possibilities, since we shouldn't be called for the one mask
+    * that does nothing, but the complete implementation is here
+    * anyway to avoid confusion.)
+    *
+    * We implement this via a constant static array which we'll index 
+    * into to get the correct mask.
+    * 
+    * We're dependent on the mask values being low-order bits,
+    * with particular values for each bit; so we start with a
+    * few assertions, which will fail if any of the values were
+    * to change.
+    */
+   ASSERT(PIPE_MASK_R == 0x1);
+   ASSERT(PIPE_MASK_G == 0x2);
+   ASSERT(PIPE_MASK_B == 0x4);
+   ASSERT(PIPE_MASK_A == 0x8);
 
+   /* Here's the list of all possible colormasks, indexed by the
+    * value of the combined mask specifier.
+    */
+   static const unsigned int colormasks[16] = {
+      0x00000000, /* 0: all colors masked */
+      0xff000000, /* 1: PIPE_MASK_R */
+      0x00ff0000, /* 2: PIPE_MASK_G */
+      0xffff0000, /* 3: PIPE_MASK_R | PIPE_MASK_G */
+      0x0000ff00, /* 4: PIPE_MASK_B */
+      0xff00ff00, /* 5: PIPE_MASK_R | PIPE_MASK_B */
+      0x00ffff00, /* 6: PIPE_MASK_G | PIPE_MASK_B */
+      0xffffff00, /* 7: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B */
+      0x000000ff, /* 8: PIPE_MASK_A */
+      0xff0000ff, /* 9: PIPE_MASK_R | PIPE_MASK_A */
+      0x00ff00ff, /* 10: PIPE_MASK_G | PIPE_MASK_A */
+      0xffff00ff, /* 11: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_A */
+      0x0000ffff, /* 12: PIPE_MASK_B | PIPE_MASK_A */
+      0xff00ffff, /* 13: PIPE_MASK_R | PIPE_MASK_B | PIPE_MASK_A */
+      0x00ffffff, /* 14: PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+      0xffffffff  /* 15: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+   };
+
+   /* Get a temporary register to hold the mask */
+   int colormask_reg = spe_allocate_available_register(f);
+
+   /* Look up the desired mask directly and load it into the mask register.
+    * This will load the same mask into each of the four words in the
+    * mask register.
+    */
+   spe_load_uint(f, colormask_reg, colormasks[colormask]);
+
+   /* Use the mask register to select between the fragment color
+    * values and the frame buffer color values.  Wherever the
+    * mask has a 0 bit, the current frame buffer color should override
+    * the fragment color.  Wherever the mask has a 1 bit, the 
+    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM)
+    * instruction will select bits from its first operand rA wherever the
+    * the mask bits rM are 0, and from its second operand rB wherever the
+    * mask bits rM are 1.  That means that the frame buffer color is the
+    * first operand, and the fragment color the second.
+    */
+    spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
 
+    /* Release the temporary register and we're done */
+    spe_release_register(f, colormask_reg);
+}
 
 /**
- * Generate code to pack a quad of float colors into a four 32-bit integers.
+ * Generate code to pack a quad of float colors into four 32-bit integers.
  *
  * \param f             SPE function to append instruction onto.
  * \param color_format  the dest color packing format
@@ -557,13 +1069,16 @@ gen_pack_colors(struct spe_function *f,
                 int r_reg, int g_reg, int b_reg, int a_reg,
                 int rgba_reg)
 {
+   int rg_reg = spe_allocate_available_register(f);
+   int ba_reg = spe_allocate_available_register(f);
+
    /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
    spe_cfltu(f, r_reg, r_reg, 32);
    spe_cfltu(f, g_reg, g_reg, 32);
    spe_cfltu(f, b_reg, b_reg, 32);
    spe_cfltu(f, a_reg, a_reg, 32);
 
-   /* Shift the most significant bytes to least the significant positions.
+   /* Shift the most significant bytes to the least significant positions.
     * I.e.: reg = reg >> 24
     */
    spe_rotmi(f, r_reg, r_reg, -24);
@@ -595,9 +1110,12 @@ gen_pack_colors(struct spe_function *f,
     * OR-ing all those together gives us four packed colors:
     *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
     */
-   spe_or(f, rgba_reg, r_reg, g_reg);
-   spe_or(f, rgba_reg, rgba_reg, b_reg);
-   spe_or(f, rgba_reg, rgba_reg, a_reg);
+   spe_or(f, rg_reg, r_reg, g_reg);
+   spe_or(f, ba_reg, a_reg, b_reg);
+   spe_or(f, rgba_reg, rg_reg, ba_reg);
+
+   spe_release_register(f, rg_reg);
+   spe_release_register(f, ba_reg);
 }
 
 
@@ -629,6 +1147,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const struct pipe_depth_stencil_alpha_state *dsa =
       &cell->depth_stencil->base;
    const struct pipe_blend_state *blend = &cell->blend->base;
+   const struct pipe_blend_color *blend_color = &cell->blend_color;
    const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
 
    /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -652,6 +1171,13 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
    spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, true);
+      spe_indent(f, 8);
+      spe_comment(f, -4, "Begin per-fragment ops");
+   }
+
    spe_allocate_register(f, x_reg);
    spe_allocate_register(f, y_reg);
    spe_allocate_register(f, color_tile_reg);
@@ -710,34 +1236,50 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
             spe_release_register(f, mask_reg);
             /* OK, fbZ_reg has four 24-bit Z values now */
          }
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            spe_rotmi(f, fbZ_reg, fbZS_reg, -8);  /* fbZ = fbZS >> 8 */
+            /* OK, fbZ_reg has four 24-bit Z values now */
+         }
+         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+            spe_move(f, fbZ_reg, fbZS_reg);
+            /* OK, fbZ_reg has four 32-bit Z values now */
+         }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            spe_move(f, fbZ_reg, fbZS_reg);
+            /* OK, fbZ_reg has four 16-bit Z values now */
+         }
          else {
-            /* XXX handle other z/stencil formats */
-            ASSERT(0);
+            ASSERT(0);  /* invalid format */
          }
 
-         /* Convert fragZ values from float[4] to uint[4] */
+         /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM ||
              zs_format == PIPE_FORMAT_Z24S8_UNORM ||
              zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* 24-bit Z values */
-            int scale_reg = spe_allocate_available_register(f);
-
-            /* scale_reg[0,1,2,3] = float(2^24-1) */
-            spe_load_float(f, scale_reg, (float) 0xffffff);
-
-            /* XXX these two instructions might be combined */
-            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);  /* fragZ = (int) fragZ */
-
-            spe_release_register(f, scale_reg);
+            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            /* fragZ = fragZ >> 8 */
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
          }
-         else {
-            /* XXX handle 16-bit Z format */
-            ASSERT(0);
+         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
          }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            /* fragZ = fragZ >> 16 */
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+         }
+      }
+      else {
+         /* no Z test, but set Z to zero so we don't OR-in garbage below */
+         spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
       }
 
+
       if (dsa->stencil[0].enabled) {
          /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
@@ -751,7 +1293,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
             ASSERT(0);
          }
       }
-
+      else {
+         /* no stencil test, but set to zero so we don't OR-in garbage below */
+         spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
+      }
 
       if (dsa->stencil[0].enabled) {
          /* XXX this may involve depth testing too */
@@ -779,22 +1324,22 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
             spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
             spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
          }
-         else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-                  zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX to do */
-            ASSERT(0);
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
          }
          else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* XXX to do */
-            ASSERT(0);
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
          }
          else if (zs_format == PIPE_FORMAT_S8_UNORM) {
-            /* XXX to do */
-            ASSERT(0);
+            ASSERT(0);   /* XXX to do */
          }
          else {
-            /* bad zs_format */
-            ASSERT(0);
+            ASSERT(0); /* bad zs_format */
          }
 
          /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
@@ -816,7 +1361,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
 
    if (blend->blend_enable) {
-      gen_blend(blend, f, color_format,
+      gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
 
@@ -837,7 +1382,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
-      if (blend->colormask != 0xf) {
+      if (blend->colormask != PIPE_MASK_RGBA) {
          gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
       }
 
@@ -866,5 +1411,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_release_register(f, fbRGBA_reg);
    spe_release_register(f, fbZS_reg);
    spe_release_register(f, quad_offset_reg);
-}
 
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_comment(f, -4, "End per-fragment ops");
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 475c6ef0ce..ea820aca74 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -72,7 +72,9 @@ cell_delete_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_blend_state *cb = (struct cell_blend_state *) blend;
 
+#if 0
    spe_release_func(& cb->code);
+#endif
    FREE(cb);
 }
 
@@ -128,7 +130,9 @@ cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
    struct cell_depth_stencil_alpha_state *cdsa =
        (struct cell_depth_stencil_alpha_state *) depth;
 
+#if 0
    spe_release_func(& cdsa->code);
+#endif
    FREE(cdsa);
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 139b3719b6..47ba6fa290 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -58,9 +58,9 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
       return CELL_MAX_SAMPLERS;
    case PIPE_CAP_NPOT_TEXTURES:
-      return 0;
+      return 1;
    case PIPE_CAP_TWO_SIDED_STENCIL:
-      return 0;
+      return 1;
    case PIPE_CAP_GLSL:
       return 1;
    case PIPE_CAP_S3TC:
@@ -68,13 +68,13 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
-      return 0;
+      return 1;
    case PIPE_CAP_MAX_RENDER_TARGETS:
       return 1;
    case PIPE_CAP_OCCLUSION_QUERY:
-      return 0;
+      return 1;
    case PIPE_CAP_TEXTURE_SHADOW_MAP:
-      return 0;
+      return 10;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
       return 12; /* max 2Kx2K */
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
@@ -82,7 +82,7 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 12; /* max 2Kx2K */
    default:
-      return 0;
+      return 10;
    }
 }
 
@@ -108,7 +108,7 @@ cell_get_paramf(struct pipe_screen *screen, int param)
       return 16.0; /* arbitrary */
 
    default:
-      return 0;
+      return 10;
    }
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 2da3097983..8a389cd6aa 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -100,14 +100,19 @@ cell_emit_state(struct cell_context *cell)
             = cell_batch_alloc(cell, sizeof(*fops));
       struct spe_function spe_code;
 
+      /* Prepare the buffer that will hold the generated code. */
+      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
       /* generate new code */
       cell_gen_fragment_function(cell, &spe_code);
+
       /* put the new code into the batch buffer */
       fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
       memcpy(&fops->code, spe_code.store,
              SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
       fops->dsa = cell->depth_stencil->base;
       fops->blend = cell->blend->base;
+
       /* free codegen buffer */
       spe_release_func(&spe_code);
    }
diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore
new file mode 100644
index 0000000000..2be9a2d324
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/.gitignore
@@ -0,0 +1 @@
+g3d_spu
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 78260c4259..b4d30228f7 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -50,7 +50,31 @@ helpful headers:
 /opt/cell/sdk/usr/include/libmisc.h
 */
 
+/* Set to 0 to disable all extraneous debugging code */
+#define DEBUG 1
+
+#if DEBUG
 boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
+
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define DEBUG_PRINTF(format,...) \
+   if (Debug) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+
+#else
+
+#define DEBUG_PRINTF(...)
+#define D_PRINTF(...)
+
+#endif
 
 struct spu_global spu;
 
@@ -133,9 +157,7 @@ really_clear_tiles(uint surfaceIndex)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   if (Debug)
-      printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
-             clear->surface, clear->value);
+   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
@@ -203,17 +225,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
 #endif /* CLEAR_OPT */
 
-   if (Debug)
-      printf("SPU %u: CLEAR SURF done\n", spu.init.id);
+   DEBUG_PRINTF("CLEAR SURF done\n");
 }
 
 
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
-   if (Debug)
-      printf("SPU %u: RELEASE VERTS %u\n",
-             spu.init.id, release->vertex_buf);
+   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
    ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
@@ -228,16 +247,38 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
-   if (Debug)
-      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
+   static int warned = 0;
+
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
 
-   /* Point function pointer at new code */
-   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   /* Parity twist!  For now, always use the fallback code by default,
+    * only switching to codegen when specifically requested.  This
+    * allows us to develop freely without risking taking down the
+    * branch.
+    *
+    * Later, the parity of this check will be reversed, so that
+    * codegen is *always* used, unless we specifically indicate that
+    * we don't want it.
+    *
+    * Eventually, the option will be removed completely, because in
+    * final code we'll always use codegen and won't even provide the
+    * raw state records that the fallback code requires.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
+      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   }
+   else {
+      /* otherwise, the default fallback code remains in place */
+      if (!warned) {
+         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+         warned = 1;
+      }
+   }
 
    spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
    spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
@@ -247,8 +288,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 {
-   if (Debug)
-      printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_program_code, fp->code,
           SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
@@ -262,9 +302,7 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
-   if (Debug)
-      printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
-             spu.init.id,
+   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
              cmd->width,
              cmd->height,
              cmd->color_start,
@@ -309,9 +347,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
-   if (Debug)
-      printf("SPU %u: SAMPLER [%u]\n",
-             spu.init.id, sampler->unit);
+   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
@@ -328,11 +364,9 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint width = texture->width;
    const uint height = texture->height;
 
-   if (Debug) {
-      printf("SPU %u: TEXTURE [%u] at %p  size %u x %u\n", spu.init.id,
+   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
              texture->unit, texture->start,
              texture->width, texture->height);
-   }
 
    spu.texture[unit].start = texture->start;
    spu.texture[unit].width = width;
@@ -351,10 +385,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   if (Debug) {
-      printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
-             vinfo->num_attribs);
-   }
+   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
    ASSERT(vinfo->num_attribs >= 1);
    ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
@@ -393,8 +424,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
 static void
 cmd_finish(void)
 {
-   if (Debug)
-      printf("SPU %u: FINISH\n", spu.init.id);
+   DEBUG_PRINTF("FINISH\n");
    really_clear_tiles(0);
    /* wait for all outstanding DMAs to finish */
    mfc_write_tag_mask(~0);
@@ -419,9 +449,8 @@ cmd_batch(uint opcode)
    const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
-   if (Debug)
-      printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
-             spu.init.id, buf, size, spu.init.buffers[buf]);
+   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+             buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
 
@@ -440,8 +469,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   if (Debug)
-      printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
+   DEBUG_PRINTF("release batch buf %u\n", buf);
    release_buffer(buf);
 
    /*
@@ -571,8 +599,7 @@ cmd_batch(uint opcode)
       }
    }
 
-   if (Debug)
-      printf("SPU %u: BATCH complete\n", spu.init.id);
+   DEBUG_PRINTF("BATCH complete\n");
 }
 
 
@@ -585,8 +612,7 @@ main_loop(void)
    struct cell_command cmd;
    int exitFlag = 0;
 
-   if (Debug)
-      printf("SPU %u: Enter main loop\n", spu.init.id);
+   DEBUG_PRINTF("Enter main loop\n");
 
    ASSERT((sizeof(struct cell_command) & 0xf) == 0);
    ASSERT_ALIGN16(&cmd);
@@ -595,14 +621,12 @@ main_loop(void)
       unsigned opcode;
       int tag = 0;
 
-      if (Debug)
-         printf("SPU %u: Wait for cmd...\n", spu.init.id);
+      DEBUG_PRINTF("Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
 
-      if (Debug)
-         printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode);
+      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
 
       /* command payload */
       mfc_get(&cmd,  /* dest */
@@ -619,8 +643,7 @@ main_loop(void)
 
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
-         if (Debug)
-            printf("SPU %u: EXIT\n", spu.init.id);
+         DEBUG_PRINTF("EXIT\n");
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
@@ -632,13 +655,12 @@ main_loop(void)
          cmd_batch(opcode);
          break;
       default:
-         printf("Bad opcode!\n");
+         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
       }
 
    }
 
-   if (Debug)
-      printf("SPU %u: Exit main loop\n", spu.init.id);
+   DEBUG_PRINTF("Exit main loop\n");
 
    spu_dcache_report();
 }
@@ -653,7 +675,8 @@ one_time_init(void)
    invalidate_tex_cache();
 
    /* Install default/fallback fragment processing function.
-    * This will normally be overriden by a code-gen'd function.
+    * This will normally be overriden by a code-gen'd function
+    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
     */
    spu.fragment_ops = spu_fallback_fragment_ops;
 }
@@ -682,11 +705,13 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code) % 32 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 32 == 0);
 
    one_time_init();
 
-   if (Debug)
-      printf("SPU: main() speid=%lu\n", (unsigned long) speid);
+   DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
    mfc_get(&spu.init,  /* dest */
            (unsigned int) argp, /* src */
@@ -698,7 +723,7 @@ main(main_param_t speid, main_param_t argp)
 
 #if 0
    if (spu.init.id==0)
-      spu_test_misc();
+      spu_test_misc(spu.init.id);
 #endif
 
    main_loop();
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2c7b625840..72e540fcff 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -143,13 +143,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
-   /** Current fragment ops machine code */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   /** Current fragment ops machine code, at 32-byte boundary */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN32_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
-   /** Current fragment program machine code */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+   /** Current fragment program machine code, at 32-byte boundary */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN32_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 03dd547845..f107764fb2 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -60,9 +60,12 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector unsigned int mask)
 {
    vector float frag_aos[4];
-   unsigned int c0, c1, c2, c3;
+   unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
+   unsigned int fragc0, fragc1, fragc2, fragc3;  /* fragment colors */
 
-   /* do alpha test */
+   /*
+    * Do alpha test
+    */
    if (spu.depth_stencil_alpha.alpha.enabled) {
       vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
       vector unsigned int amask;
@@ -102,7 +105,10 @@ spu_fallback_fragment_ops(uint x, uint y,
       mask = spu_and(mask, amask);
    }
 
-   /* Z and/or stencil testing... */
+
+   /*
+    * Z and/or stencil testing...
+    */
    if (spu.depth_stencil_alpha.depth.enabled ||
        spu.depth_stencil_alpha.stencil[0].enabled) {
 
@@ -178,6 +184,32 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
    }
 
+
+   /*
+    * If we'll need the current framebuffer/tile colors for blending
+    * or logicop or colormask, fetch them now.
+    */
+   if (spu.blend.blend_enable ||
+       spu.blend.logicop_enable ||
+       spu.blend.colormask != 0xf) {
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+      fbc0 = colorTile->ui[y][x*2+0];
+      fbc1 = colorTile->ui[y][x*2+1];
+      fbc2 = colorTile->ui[y][x*2+2];
+      fbc3 = colorTile->ui[y][x*2+3];
+#else
+      fbc0 = colorTile->ui[y+0][x+0];
+      fbc1 = colorTile->ui[y+0][x+1];
+      fbc2 = colorTile->ui[y+1][x+0];
+      fbc3 = colorTile->ui[y+1][x+1];
+#endif
+   }
+
+
+   /*
+    * Do blending
+    */
    if (spu.blend.blend_enable) {
       /* blending terms, misc regs */
       vector float term1r, term1g, term1b, term1a;
@@ -186,39 +218,26 @@ spu_fallback_fragment_ops(uint x, uint y,
 
       vector float fbRGBA[4];  /* current framebuffer colors */
 
-      /* get colors from framebuffer/tile */
+      /* convert framebuffer colors from packed int to vector float */
       {
-         vector float fc[4];
-         uint c0, c1, c2, c3;
-
-#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
-         c0 = colorTile->ui[y][x*2+0];
-         c1 = colorTile->ui[y][x*2+1];
-         c2 = colorTile->ui[y][x*2+2];
-         c3 = colorTile->ui[y][x*2+3];
-#else
-         c0 = colorTile->ui[y+0][x+0];
-         c1 = colorTile->ui[y+0][x+1];
-         c2 = colorTile->ui[y+1][x+0];
-         c3 = colorTile->ui[y+1][x+1];
-#endif
+         vector float temp[4]; /* float colors in AOS form */
          switch (spu.fb.color_format) {
          case PIPE_FORMAT_B8G8R8A8_UNORM:
-            fc[0] = spu_unpack_B8G8R8A8(c0);
-            fc[1] = spu_unpack_B8G8R8A8(c1);
-            fc[2] = spu_unpack_B8G8R8A8(c2);
-            fc[3] = spu_unpack_B8G8R8A8(c3);
+            temp[0] = spu_unpack_B8G8R8A8(fbc0);
+            temp[1] = spu_unpack_B8G8R8A8(fbc1);
+            temp[2] = spu_unpack_B8G8R8A8(fbc2);
+            temp[3] = spu_unpack_B8G8R8A8(fbc3);
             break;
          case PIPE_FORMAT_A8R8G8B8_UNORM:
-            fc[0] = spu_unpack_A8R8G8B8(c0);
-            fc[1] = spu_unpack_A8R8G8B8(c1);
-            fc[2] = spu_unpack_A8R8G8B8(c2);
-            fc[3] = spu_unpack_A8R8G8B8(c3);
+            temp[0] = spu_unpack_A8R8G8B8(fbc0);
+            temp[1] = spu_unpack_A8R8G8B8(fbc1);
+            temp[2] = spu_unpack_A8R8G8B8(fbc2);
+            temp[3] = spu_unpack_A8R8G8B8(fbc3);
             break;
          default:
             ASSERT(0);
          }
-         _transpose_matrix4x4(fbRGBA, fc);
+         _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
       }
 
       /*
@@ -384,21 +403,20 @@ spu_fallback_fragment_ops(uint x, uint y,
 #endif
 
    /*
-    * Pack float colors into 32-bit RGBA words.
+    * Pack fragment float colors into 32-bit RGBA words.
     */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      c0 = spu_pack_A8R8G8B8(frag_aos[0]);
-      c1 = spu_pack_A8R8G8B8(frag_aos[1]);
-      c2 = spu_pack_A8R8G8B8(frag_aos[2]);
-      c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+      fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
       break;
-
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      c0 = spu_pack_B8G8R8A8(frag_aos[0]);
-      c1 = spu_pack_B8G8R8A8(frag_aos[1]);
-      c2 = spu_pack_B8G8R8A8(frag_aos[2]);
-      c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+      fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
       break;
    default:
       fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
@@ -407,20 +425,57 @@ spu_fallback_fragment_ops(uint x, uint y,
 
 
    /*
-    * Color masking
+    * Do color masking
     */
    if (spu.blend.colormask != 0xf) {
-      /* XXX to do */
-      /* apply color mask to 32-bit packed colors */
+      uint cmask = 0x0; /* each byte corresponds to a color channel */
+
+      /* Form bitmask depending on color buffer format and colormask bits */
+      switch (spu.fb.color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         if (spu.blend.colormask & (1<<0))
+            cmask |= 0x00ff0000; /* red */
+         if (spu.blend.colormask & (1<<1))
+            cmask |= 0x0000ff00; /* green */
+         if (spu.blend.colormask & (1<<2))
+            cmask |= 0x000000ff; /* blue */
+         if (spu.blend.colormask & (1<<3))
+            cmask |= 0xff000000; /* alpha */
+         break;
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         if (spu.blend.colormask & (1<<0))
+            cmask |= 0x0000ff00; /* red */
+         if (spu.blend.colormask & (1<<1))
+            cmask |= 0x00ff0000; /* green */
+         if (spu.blend.colormask & (1<<2))
+            cmask |= 0xff000000; /* blue */
+         if (spu.blend.colormask & (1<<3))
+            cmask |= 0x000000ff; /* alpha */
+         break;
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Apply color mask to the 32-bit packed colors.
+       * if (cmask[i])
+       *    frag color[i] = frag color[i];
+       * else
+       *    frag color[i] = framebuffer color[i];
+       */
+      fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
+      fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
+      fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
+      fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
    }
 
 
    /*
-    * Logic Ops
+    * Do logic ops
     */
    if (spu.blend.logicop_enable) {
       /* XXX to do */
-      /* apply logicop to 32-bit packed colors */
+      /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
    }
 
 
@@ -431,45 +486,46 @@ spu_fallback_fragment_ops(uint x, uint y,
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
    }
    else {
+      /* write no fragments */
       return;
    }
 
 
    /*
-    * Write new quad colors to the framebuffer/tile.
+    * Write new fragment/quad colors to the framebuffer/tile.
     * Only write pixels where the corresponding mask word is set.
     */
 #if LINEAR_QUAD_LAYOUT
    /*
     * Quad layout:
     *  +--+--+--+--+
-    *  |p0|p1|p2|p3|
+    *  |p0|p1|p2|p3|...
     *  +--+--+--+--+
     */
    if (spu_extract(mask, 0))
-      colorTile->ui[y][x*2] = c0;
+      colorTile->ui[y][x*2] = fragc0;
    if (spu_extract(mask, 1))
-      colorTile->ui[y][x*2+1] = c1;
+      colorTile->ui[y][x*2+1] = fragc1;
    if (spu_extract(mask, 2))
-      colorTile->ui[y][x*2+2] = c2;
+      colorTile->ui[y][x*2+2] = fragc2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y][x*2+3] = c3;
+      colorTile->ui[y][x*2+3] = fragc3;
 #else
    /*
     * Quad layout:
     *  +--+--+
-    *  |p0|p1|
+    *  |p0|p1|...
     *  +--+--+
-    *  |p2|p3|
+    *  |p2|p3|...
     *  +--+--+
     */
    if (spu_extract(mask, 0))
-      colorTile->ui[y+0][x+0] = c0;
+      colorTile->ui[y+0][x+0] = fragc0;
    if (spu_extract(mask, 1))
-      colorTile->ui[y+0][x+1] = c1;
+      colorTile->ui[y+0][x+1] = fragc1;
    if (spu_extract(mask, 2))
-      colorTile->ui[y+1][x+0] = c2;
+      colorTile->ui[y+1][x+0] = fragc2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y+1][x+1] = c3;
+      colorTile->ui[y+1][x+1] = fragc3;
 #endif
 }
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 8b93878192..0a8fb56a62 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -241,6 +241,19 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
 }
 
 
+/**
+ * As above, but return 4 vectors in SOA format.
+ * XXX this will all be re-written someday.
+ */
+static INLINE void
+eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+{
+   eval_coeff(slot, x, y, result);
+   _transpose_matrix4x4(result, result);
+}
+
+
+
 static INLINE vector float
 eval_z(float x, float y)
 {
@@ -267,14 +280,17 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture[0].start) {
-         /* texture mapping */
+         /*
+          * Temporary texture mapping path
+          * This will go away when fragment programs support TEX inst.
+          */
          const uint unit = 0;
+         vector float colors[4];
          vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
 
@@ -311,70 +327,60 @@ emit_quad( int x, int y, mask_t mask )
             colors[3] = spu_mul(colors[3], colors1[3]);
          }
 
+         {
+            /* Convert fragment data from AoS to SoA format.
+             * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+             * This is temporary!
+             */
+            vector float soa_frag[4];
+            _transpose_matrix4x4(soa_frag, colors);
+
+            vector float fragZ = eval_z((float) x, (float) y);
+
+            /* Do all per-fragment/quad operations here, including:
+             * alpha test, z test, stencil test, blend and framebuffer writing.
+             */
+            spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                             fragZ,
+                             soa_frag[0], soa_frag[1],
+                             soa_frag[2], soa_frag[3],
+                             mask);
+         }
+
       }
       else {
-         /* simple shading */
-#if 0
-         eval_coeff(1, (float) x, (float) y, colors);
+         /*
+          * Run fragment shader, execute per-fragment ops, update fb/tile.
+          */
+         vector float inputs[4*4], outputs[2*4];
+         vector float fragZ = eval_z((float) x, (float) y);
 
+         /* setup inputs */
+#if 0
+         eval_coeff_soa(1, (float) x, (float) y, inputs);
 #else
-         /* XXX new fragment program code */
-
-         if (spu.fragment_program) {
-            vector float inputs[4*4], outputs[2*4];
-
-            /* setup inputs */
-            eval_coeff(1, (float) x, (float) y, inputs);
-
-            /* Execute the current fragment program */
-            spu.fragment_program(inputs, outputs, spu.constants);
-
-            /* Copy outputs */
-            colors[0] = outputs[0*4+0];
-            colors[1] = outputs[0*4+1];
-            colors[2] = outputs[0*4+2];
-            colors[3] = outputs[0*4+3];
-
-            if (0 && spu.init.id==0 && y == 48) {
-               printf("colors[0] = %f %f %f %f\n",
-                      spu_extract(colors[0], 0),
-                      spu_extract(colors[0], 1),
-                      spu_extract(colors[0], 2),
-                      spu_extract(colors[0], 3));
-               printf("colors[1] = %f %f %f %f\n",
-                      spu_extract(colors[1], 0),
-                      spu_extract(colors[1], 1),
-                      spu_extract(colors[1], 2),
-                      spu_extract(colors[1], 3));
-            }
-
+         uint i;
+         for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+            eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
          }
 #endif
-      }
+         ASSERT(spu.fragment_program);
+         ASSERT(spu.fragment_ops);
 
+         /* Execute the current fragment program */
+         spu.fragment_program(inputs, outputs, spu.constants);
 
-      {
-         /* Convert fragment data from AoS to SoA format.
-          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-          * This is temporary!
-          */
-         vector float soa_frag[4];
-         _transpose_matrix4x4(soa_frag, colors);
-
-         float4 fragZ;
-
-         fragZ.v = eval_z((float) x, (float) y);
-
-         /* Do all per-fragment/quad operations here, including:
-          *  alpha test, z test, stencil test, blend and framebuffer writing.
+         /* Execute per-fragment/quad operations, including:
+          * alpha test, z test, stencil test, blend and framebuffer writing.
           */
          spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
-                          fragZ.v,
-                          soa_frag[0], soa_frag[1],
-                          soa_frag[2], soa_frag[3],
+                          fragZ,
+                          outputs[0*4+0],
+                          outputs[0*4+1],
+                          outputs[0*4+2],
+                          outputs[0*4+3],
                           mask);
       }
-
    }
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 701ee4c72f..4fea71c314 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -39,11 +39,20 @@
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_parse.h"
 
-struct sp_exec_fragment_shader {
+struct sp_exec_fragment_shader
+{
    struct sp_fragment_shader base;
+   const struct tgsi_token *machine_tokens;
 };
 
 
+/** cast wrapper */
+static INLINE struct sp_exec_fragment_shader *
+sp_exec_fragment_shader(const struct sp_fragment_shader *base)
+{
+   return (struct sp_exec_fragment_shader *) base;
+}
+
 
 /**
  * Compute quad X,Y,Z,W for the four fragments in a quad.
@@ -86,10 +95,20 @@ exec_prepare( const struct sp_fragment_shader *base,
 	      struct tgsi_exec_machine *machine,
 	      struct tgsi_sampler *samplers )
 {
-   tgsi_exec_machine_bind_shader( machine,
-				  base->shader.tokens,
-				  PIPE_MAX_SAMPLERS,
-				  samplers );
+   struct sp_exec_fragment_shader *spefs =
+      sp_exec_fragment_shader(base);
+
+   /*
+    * Bind tokens/shader to the interpreter's machine state.
+    * Avoid redundant binding.
+    */
+   if (spefs->machine_tokens != base->shader.tokens) {
+      tgsi_exec_machine_bind_shader( machine,
+                                     base->shader.tokens,
+                                     PIPE_MAX_SAMPLERS,
+                                     samplers );
+      spefs->machine_tokens = base->shader.tokens;
+   }
 }
 
 
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 4d64c74a4a..1e702c7fa7 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -144,10 +144,12 @@ typedef unsigned char boolean;
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) ))
 #define ALIGN16_ASSIGN(NAME) NAME##___aligned
 #define ALIGN16_ATTRIB  __attribute__(( aligned( 16 ) ))
+#define ALIGN32_ATTRIB  __attribute__(( aligned( 32 ) ))
 #else
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___unaligned[SIZE + 1]
 #define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned)
 #define ALIGN16_ATTRIB
+#define ALIGN32_ATTRIB
 #endif
 
 
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index b15affef7a..3bedc75294 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 /**
+ * @file
+ * 
  * Screen, Adapter or GPU
  *
  * These are driver functions/facilities that are context independent.
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index a562e3a6b1..78c20de3e2 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -1,4 +1,31 @@
-#if !defined TGSI_TOKEN_H
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_TOKEN_H
 #define TGSI_TOKEN_H
 
 #ifdef __cplusplus
@@ -396,7 +423,7 @@ struct tgsi_immediate_float32
 #define TGSI_SAT_ZERO_ONE        1  /* clamp to [0,1] */
 #define TGSI_SAT_MINUS_PLUS_ONE  2  /* clamp to [-1,1] */
 
-/*
+/**
  * Opcode is the operation code to execute. A given operation defines the
  * semantics how the source registers (if any) are interpreted and what is
  * written to the destination registers (if any) as a result of execution.
@@ -481,7 +508,7 @@ struct tgsi_instruction_ext
 #define TGSI_SWIZZLE_Z      2
 #define TGSI_SWIZZLE_W      3
 
-/*
+/**
  * Precision controls the precision at which the operation should be executed.
  *
  * CondDstUpdate enables condition code register writes. When this field is
@@ -548,7 +575,7 @@ struct tgsi_instruction_ext_predicate
    unsigned Extended         : 1;    /* BOOL */
 };
 
-/*
+/**
  * File specifies the register array to access.
  *
  * Index specifies the element number of a register in the register file.
@@ -580,7 +607,7 @@ struct tgsi_src_register
    unsigned Extended    : 1;  /* BOOL */
 };
 
-/*
+/**
  * If tgsi_src_register::Extended is TRUE, tgsi_src_register_ext follows.
  * 
  * Then, if tgsi_src_register::Indirect is TRUE, another tgsi_src_register
@@ -599,7 +626,7 @@ struct tgsi_src_register_ext
    unsigned Extended : 1;    /* BOOL */
 };
 
-/*
+/**
  * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_SWZ,
  * it should be cast to tgsi_src_register_ext_swz.
  * 
@@ -617,7 +644,7 @@ struct tgsi_src_register_ext
 #define TGSI_EXTSWIZZLE_ZERO    4
 #define TGSI_EXTSWIZZLE_ONE     5
 
-/*
+/**
  * ExtSwizzleX, ExtSwizzleY, ExtSwizzleZ and ExtSwizzleW swizzle the source
  * register in an extended manner.
  *
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index da783389da..342f17260a 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -27,6 +27,8 @@
 
 
 /**
+ * @file
+ * 
  * Abstract graphics pipe state objects.
  *
  * Basic notes:
diff --git a/src/gallium/include/pipe/p_thread.h b/src/gallium/include/pipe/p_thread.h
index e01d5a602b..8af3cd958b 100644
--- a/src/gallium/include/pipe/p_thread.h
+++ b/src/gallium/include/pipe/p_thread.h
@@ -25,6 +25,8 @@
 
 
 /**
+ * @file
+ * 
  * Thread, mutex, condition var and thread-specific data functions.
  */
 
diff --git a/src/gallium/winsys/drm/intel/dri/intel_screen.c b/src/gallium/winsys/drm/intel/dri/intel_screen.c
index 3a486481f5..78b9a6db05 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_screen.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_screen.c
@@ -113,7 +113,118 @@ static PFNGLXCREATECONTEXTMODES create_context_modes = NULL;
 
 extern const struct dri_extension card_extensions[];
 
+static GLboolean
+intel_get_param(__DRIscreenPrivate *psp, int param, int *value)
+{
+   int ret;
+   struct drm_i915_getparam gp;
+
+   gp.param = param;
+   gp.value = value;
+
+   ret = drmCommandWriteRead(psp->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
+   if (ret) {
+      fprintf(stderr, "drm_i915_getparam: %d\n", ret);
+      return GL_FALSE;
+   }
+
+   return GL_TRUE;
+}
+
+static void
+intelSetTexOffset(__DRIcontext *pDRICtx, int texname,
+		  unsigned long long offset, int depth, uint pitch)
+{
+   abort();
+#if 0
+   struct intel_context *intel = (struct intel_context*)
+      ((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate;
+   struct gl_texture_object *tObj = _mesa_lookup_texture(&intel->ctx, texname);
+   struct st_texture_object *stObj = st_texture_object(tObj);
+
+   if (!stObj)
+      return;
+
+   if (stObj->pt)
+      st->pipe->texture_release(intel->st->pipe, &stObj->pt);
+
+   stObj->imageOverride = GL_TRUE;
+   stObj->depthOverride = depth;
+   stObj->pitchOverride = pitch;
+
+   if (offset)
+      stObj->textureOffset = offset;
+#endif
+}
+
+
+#if 0
+static void
+intelHandleDrawableConfig(__DRIdrawablePrivate *dPriv,
+			  __DRIcontextPrivate *pcp,
+			  __DRIDrawableConfigEvent *event)
+{
+   (void) dPriv;
+   (void) pcp;
+   (void) event;
+}
+#endif
+
+#if 0
+static void
+intelHandleBufferAttach(__DRIdrawablePrivate *dPriv,
+			__DRIcontextPrivate *pcp,
+			__DRIBufferAttachEvent *ba)
+{
+   struct intel_screen *intelScreen = intel_screen(dPriv->driScreenPriv);
+
+   switch (ba->buffer.attachment) {
+   case DRI_DRAWABLE_BUFFER_FRONT_LEFT:
+      intelScreen->front.width = dPriv->w;
+      intelScreen->front.height = dPriv->h;
+      intelScreen->front.cpp = ba->buffer.cpp;
+      intelScreen->front.pitch = ba->buffer.pitch;
+      driGenBuffers(intelScreen->base.staticPool, "front", 1, &intelScreen->front.buffer, 0, 0, 0);
+      driBOSetReferenced(intelScreen->front.buffer, ba->buffer.handle);
+      break;
+
+   case DRI_DRAWABLE_BUFFER_BACK_LEFT:
+   case DRI_DRAWABLE_BUFFER_DEPTH:
+   case DRI_DRAWABLE_BUFFER_STENCIL:
+   case DRI_DRAWABLE_BUFFER_ACCUM:
+      /* anything ?? */
+      break;
+
+   default:
+      fprintf(stderr, "unhandled buffer attach event, attachment type %d\n",
+	      ba->buffer.attachment);
+      return;
+   }
+}
+#endif
+
+static const __DRItexOffsetExtension intelTexOffsetExtension = {
+   { __DRI_TEX_OFFSET },
+   intelSetTexOffset,
+};
+
+#if 0
+static const __DRItexBufferExtension intelTexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   intelSetTexBuffer,
+};
+#endif
 
+static const __DRIextension *intelScreenExtensions[] = {
+    &driReadDrawableExtension,
+    &driCopySubBufferExtension.base,
+    &driSwapControlExtension.base,
+    &driFrameTrackingExtension.base,
+    &driMediaStreamCounterExtension.base,
+    &intelTexOffsetExtension.base,
+//    &intelTexBufferExtension.base,
+    NULL
+};
 
 
 static void
@@ -232,7 +343,8 @@ intelCreatePools(__DRIscreenPrivate * sPriv)
 
    intelScreen->havePools = GL_TRUE;
 
-   intelUpdateScreenRotation(sPriv, intelScreen->sarea);
+   if (intelScreen->sarea)
+	intelUpdateScreenRotation(sPriv, intelScreen->sarea);
 
    return GL_TRUE;
 }
@@ -265,11 +377,6 @@ intelInitDriver(__DRIscreenPrivate * sPriv)
    struct intel_screen *intelScreen;
    I830DRIPtr gDRIPriv = (I830DRIPtr) sPriv->pDevPriv;
 
-   PFNGLXSCRENABLEEXTENSIONPROC glx_enable_extension =
-      (PFNGLXSCRENABLEEXTENSIONPROC) (*dri_interface->
-                                      getProcAddress("glxEnableExtension"));
-   void *const psc = sPriv->psc->screenConfigs;
-
    if (sPriv->devPrivSize != sizeof(I830DRIRec)) {
       fprintf(stderr,
               "\nERROR!  sizeof(I830DRIRec) does not match passed size from device driver\n");
@@ -286,28 +393,19 @@ intelInitDriver(__DRIscreenPrivate * sPriv)
                       __driConfigOptions, __driNConfigOptions);
 
    sPriv->private = (void *) intelScreen;
-
    intelScreen->sarea = (drmI830Sarea *) (((GLubyte *) sPriv->pSAREA) +
-					  gDRIPriv->sarea_priv_offset);
-   intelScreen->deviceID = gDRIPriv->deviceID;
-   intelScreen->front.cpp = gDRIPriv->cpp;
-   intelScreen->drmMinor = sPriv->drmMinor;
+                                            gDRIPriv->sarea_priv_offset);
 
-   assert(gDRIPriv->bitsPerPixel == 16 ||
-	  gDRIPriv->bitsPerPixel == 32);
+   intelScreen->deviceID = gDRIPriv->deviceID;
 
+   intelScreen->front.cpp = gDRIPriv->cpp;
+   intelScreen->drmMinor = sPriv->drm_version.minor;
    intelUpdateScreenRotation(sPriv, intelScreen->sarea);
 
    if (0)
       intelPrintDRIInfo(intelScreen, sPriv, gDRIPriv);
 
-   if (glx_enable_extension != NULL) {
-      (*glx_enable_extension) (psc, "GLX_SGI_swap_control");
-      (*glx_enable_extension) (psc, "GLX_SGI_video_sync");
-      (*glx_enable_extension) (psc, "GLX_MESA_swap_control");
-      (*glx_enable_extension) (psc, "GLX_MESA_swap_frame_usage");
-      (*glx_enable_extension) (psc, "GLX_SGI_make_current_read");
-   }
+   sPriv->extensions = intelScreenExtensions;
 
    intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer;
    intelScreen->base.base.get_name = intel_get_name;
@@ -406,65 +504,19 @@ intelGetSwapInfo(__DRIdrawablePrivate * dPriv, __DRIswapInfo * sInfo)
    return 0;
 }
 
-
-static void
-intelSetTexOffset(__DRIcontext *pDRICtx, int texname,
-		  unsigned long long offset, int depth, uint pitch)
-{
-   abort();
-#if 0
-   struct intel_context *intel = (struct intel_context*)
-      ((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate;
-   struct gl_texture_object *tObj = _mesa_lookup_texture(&intel->ctx, texname);
-   struct st_texture_object *stObj = st_texture_object(tObj);
-
-   if (!stObj)
-      return;
-
-   if (stObj->pt)
-      st->pipe->texture_release(intel->st->pipe, &stObj->pt);
-
-   stObj->imageOverride = GL_TRUE;
-   stObj->depthOverride = depth;
-   stObj->pitchOverride = pitch;
-
-   if (offset)
-      stObj->textureOffset = offset;
-#endif
-}
-
-
-static const struct __DriverAPIRec intelAPI = {
-   .InitDriver = intelInitDriver,
-   .DestroyScreen = intelDestroyScreen,
-   .CreateContext = intelCreateContext,
-   .DestroyContext = intelDestroyContext,
-   .CreateBuffer = intelCreateBuffer,
-   .DestroyBuffer = intelDestroyBuffer,
-   .SwapBuffers = intelSwapBuffers,
-   .MakeCurrent = intelMakeCurrent,
-   .UnbindContext = intelUnbindContext,
-   .GetSwapInfo = intelGetSwapInfo,
-   .GetMSC = driGetMSC32,
-   .WaitForMSC = driWaitForMSC32,
-   .WaitForSBC = NULL,
-   .SwapBuffersMSC = NULL,
-   .CopySubBuffer = intelCopySubBuffer,
-   .setTexOffset = intelSetTexOffset,
-};
-
-
-static __GLcontextModes *
-intelFillInModes(unsigned pixel_bits, unsigned depth_bits,
-                 unsigned stencil_bits, boolean have_back_buffer)
+static __DRIconfig **
+intelFillInModes(__DRIscreenPrivate *psp,
+		 unsigned pixel_bits, unsigned depth_bits,
+                 unsigned stencil_bits, GLboolean have_back_buffer)
 {
-   __GLcontextModes *modes;
+   __DRIconfig **configs;
    __GLcontextModes *m;
    unsigned num_modes;
    unsigned depth_buffer_factor;
    unsigned back_buffer_factor;
    GLenum fb_format;
    GLenum fb_type;
+   int i;
 
    /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
     * support pageflipping at all.
@@ -508,100 +560,144 @@ intelFillInModes(unsigned pixel_bits, unsigned depth_bits,
       fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
    }
 
-   modes =
-      (*dri_interface->createContextModes) (num_modes,
-                                            sizeof(__GLcontextModes));
-   m = modes;
-   if (!driFillInModes(&m, fb_format, fb_type,
-                       depth_bits_array, stencil_bits_array,
-                       depth_buffer_factor, back_buffer_modes,
-                       back_buffer_factor, msaa_samples_array, 1, GLX_TRUE_COLOR)) {
-      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
-              __LINE__);
-      return NULL;
-   }
-   if (!driFillInModes(&m, fb_format, fb_type,
-                       depth_bits_array, stencil_bits_array,
-                       depth_buffer_factor, back_buffer_modes,
-                       back_buffer_factor, msaa_samples_array, 1, GLX_DIRECT_COLOR)) {
-      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
+   configs = driCreateConfigs(fb_format, fb_type,
+			      depth_bits_array, stencil_bits_array,
+			      depth_buffer_factor, back_buffer_modes,
+			      back_buffer_factor, msaa_samples_array, 1);
+   if (configs == NULL) {
+    fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
               __LINE__);
       return NULL;
    }
 
    /* Mark the visual as slow if there are "fake" stencil bits.
     */
-   for (m = modes; m != NULL; m = m->next) {
+   for (i = 0; configs[i]; i++) {
+      m = &configs[i]->modes;
       if ((m->stencilBits != 0) && (m->stencilBits != stencil_bits)) {
          m->visualRating = GLX_SLOW_CONFIG;
       }
    }
 
-   return modes;
+   return configs;
 }
 
-
 /**
- * This is the bootstrap function for the driver.  libGL supplies all of the
- * requisite information about the system, and the driver initializes itself.
- * This routine also fills in the linked list pointed to by \c driver_modes
- * with the \c __GLcontextModes that the driver can support for windows or
- * pbuffers.
+ * This is the driver specific part of the createNewScreen entry point.
+ * 
+ * \todo maybe fold this into intelInitDriver
  *
- * \return A pointer to a \c __DRIscreenPrivate on success, or \c NULL on
- *         failure.
+ * \return the __GLcontextModes supported by this driver
  */
-PUBLIC void *
-__driCreateNewScreen_20050727(__DRInativeDisplay * dpy, int scrn,
-                              __DRIscreen * psc,
-                              const __GLcontextModes * modes,
-                              const __DRIversion * ddx_version,
-                              const __DRIversion * dri_version,
-                              const __DRIversion * drm_version,
-                              const __DRIframebuffer * frame_buffer,
-                              drmAddress pSAREA, int fd,
-                              int internal_api_version,
-                              const __DRIinterfaceMethods * interface,
-                              __GLcontextModes ** driver_modes)
+static const __DRIconfig **intelInitScreen(__DRIscreenPrivate *psp)
 {
-   __DRIscreenPrivate *psp;
-   static const __DRIversion ddx_expected = { 1, 7, 0 };
+#ifdef I915
+   static const __DRIversion ddx_expected = { 1, 5, 0 };
+#else
+   static const __DRIversion ddx_expected = { 1, 6, 0 };
+#endif
    static const __DRIversion dri_expected = { 4, 0, 0 };
-   static const __DRIversion drm_expected = { 1, 7, 0 };
-
-   dri_interface = interface;
+   static const __DRIversion drm_expected = { 1, 5, 0 };
+   I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv;
 
    if (!driCheckDriDdxDrmVersions2("i915",
-                                   dri_version, &dri_expected,
-                                   ddx_version, &ddx_expected,
-                                   drm_version, &drm_expected)) {
+                                   &psp->dri_version, &dri_expected,
+                                   &psp->ddx_version, &ddx_expected,
+                                   &psp->drm_version, &drm_expected)) {
       return NULL;
    }
 
-   psp = __driUtilCreateNewScreen(dpy, scrn, psc, NULL,
-                                  ddx_version, dri_version, drm_version,
-                                  frame_buffer, pSAREA, fd,
-                                  internal_api_version, &intelAPI);
-
-   if (psp != NULL) {
-      I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv;
-      *driver_modes = intelFillInModes(dri_priv->cpp * 8,
-                                       (dri_priv->cpp == 2) ? 16 : 24,
-                                       (dri_priv->cpp == 2) ? 0 : 8, 1);
-
-      /* Calling driInitExtensions here, with a NULL context pointer,
-       * does not actually enable the extensions.  It just makes sure
-       * that all the dispatch offsets for all the extensions that
-       * *might* be enables are known.  This is needed because the
-       * dispatch offsets need to be known when _mesa_context_create
-       * is called, but we can't enable the extensions until we have a
-       * context pointer.
-       *
-       * Hello chicken.  Hello egg.  How are you two today?
-       */
-      driInitExtensions(NULL, card_extensions, GL_FALSE);
+   /* Calling driInitExtensions here, with a NULL context pointer,
+    * does not actually enable the extensions.  It just makes sure
+    * that all the dispatch offsets for all the extensions that
+    * *might* be enables are known.  This is needed because the
+    * dispatch offsets need to be known when _mesa_context_create is
+    * called, but we can't enable the extensions until we have a
+    * context pointer.
+    *
+    * Hello chicken.  Hello egg.  How are you two today?
+    */
+   driInitExtensions( NULL, card_extensions, GL_FALSE );
+   //intelInitExtensions(NULL, GL_TRUE);
+	   
+   if (!intelInitDriver(psp))
+       return NULL;
+
+   psp->extensions = intelScreenExtensions;
+
+   return (const __DRIconfig **)
+       intelFillInModes(psp, dri_priv->cpp * 8,
+			(dri_priv->cpp == 2) ? 16 : 24,
+			(dri_priv->cpp == 2) ? 0  : 8, 1);
+}
+
+/**
+ * This is the driver specific part of the createNewScreen entry point.
+ * 
+ * \return the __GLcontextModes supported by this driver
+ */
+static const
+__DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp)
+{
+   struct intel_screen *intelScreen;
+
+   /* Calling driInitExtensions here, with a NULL context pointer,
+    * does not actually enable the extensions.  It just makes sure
+    * that all the dispatch offsets for all the extensions that
+    * *might* be enables are known.  This is needed because the
+    * dispatch offsets need to be known when _mesa_context_create is
+    * called, but we can't enable the extensions until we have a
+    * context pointer.
+    *
+    * Hello chicken.  Hello egg.  How are you two today?
+    */
+   //intelInitExtensions(NULL, GL_TRUE);
+
+   /* Allocate the private area */
+   intelScreen = CALLOC_STRUCT(intel_screen);
+   if (!intelScreen) {
+      fprintf(stderr, "\nERROR!  Allocating private area failed\n");
+      return GL_FALSE;
    }
+   /* parse information in __driConfigOptions */
+   driParseOptionInfo(&intelScreen->optionCache,
+                      __driConfigOptions, __driNConfigOptions);
+
+   psp->private = (void *) intelScreen;
+
+   intelScreen->drmMinor = psp->drm_version.minor;
 
-   return (void *) psp;
+   /* Determine chipset ID? */
+   if (!intel_get_param(psp, I915_PARAM_CHIPSET_ID,
+			&intelScreen->deviceID))
+      return GL_FALSE;
+
+   psp->extensions = intelScreenExtensions;
+
+   intel_be_init_device(&intelScreen->base, psp->fd, intelScreen->deviceID);
+   intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer;
+   intelScreen->base.base.get_name = intel_get_name;
+
+   return driConcatConfigs(intelFillInModes(psp, 16, 16, 0, 1),
+			   intelFillInModes(psp, 32, 24, 8, 1));
 }
 
+const struct __DriverAPIRec driDriverAPI = {
+   .InitScreen		 = intelInitScreen,
+   .DestroyScreen	 = intelDestroyScreen,
+   .CreateContext	 = intelCreateContext,
+   .DestroyContext	 = intelDestroyContext,
+   .CreateBuffer	 = intelCreateBuffer,
+   .DestroyBuffer	 = intelDestroyBuffer,
+   .SwapBuffers		 = intelSwapBuffers,
+   .MakeCurrent		 = intelMakeCurrent,
+   .UnbindContext	 = intelUnbindContext,
+   .GetSwapInfo		 = intelGetSwapInfo,
+   .GetDrawableMSC	 = driDrawableGetMSC32,
+   .WaitForMSC		 = driWaitForMSC32,
+   .CopySubBuffer	 = intelCopySubBuffer,
+
+   //.InitScreen2		 = intelInitScreen2,
+   //.HandleDrawableConfig = intelHandleDrawableConfig,
+   //.HandleBufferAttach	 = intelHandleBufferAttach,
+};
diff --git a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c
index 8a18bfd9a4..34ad7eebe1 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c
@@ -94,7 +94,6 @@ intelDisplaySurface(__DRIdrawablePrivate *dPriv,
       int i;
 
       ASSERT(surf->buffer);
-      ASSERT(surf->cpp == cpp);
 
       DBG(SWAP, "screen pitch %d  src surface pitch %d\n",
 	  pitch, surf->stride);
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 3334af175b..acb5ad8f71 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -352,7 +352,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
          /* offset in pixels */
          offset *= TILE_SIZE * TILE_SIZE;
 
-         if (XSHM_ENABLED(xm_buf)) {
+         if (0 && XSHM_ENABLED(xm_buf)) {
             ximage->data = (char *) xm_buf->data + 4 * offset;
             /* make copy of tile data */
             memcpy(tmpTile, (uint *) ximage->data, sizeof(tmpTile));
@@ -365,7 +365,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
 #endif
          }
          else {
-            /* twiddel from ximage buffer to temp tile */
+            /* twiddle from ximage buffer to temp tile */
             twiddle_tile((uint *) xm_buf->data + offset, tmpTile);
             /* display temp tile data */
             ximage->data = (char *) tmpTile;