From 5d78212d752e021555356bbb9cc5993ad6d9e847 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 19 Feb 2008 14:00:16 +0900
Subject: Bring in ppc spe rtasm into gallium's rtasm module.

Moving files since these are not being used outside gallium.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 314 ++++++++++++++++++++++++++++
 1 file changed, 314 insertions(+)
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
new file mode 100644
index 0000000000..10ce44b3a0
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -0,0 +1,314 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ * Real-time assembly generation interface for Cell B.E. SPEs.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#ifndef RTASM_PPC_SPE_H
+#define RTASM_PPC_SPE_H
+
+struct spe_function {
+    /**
+     *
+     */
+    uint32_t *store;
+    uint32_t *csr;
+    const char *fn;
+};
+
+extern void spe_init_func(struct spe_function *p, unsigned code_size);
+extern void spe_release_func(struct spe_function *p);
+
+#endif /* RTASM_PPC_SPE_H */
+
+#ifndef EMIT_
+#define EMIT_(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT)
+#define EMIT_R(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA)
+#define EMIT_RR(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   unsigned rB)
+#define EMIT_RRR(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   unsigned rB, unsigned rC)
+#define EMIT_RI7(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
+#define EMIT_RI8(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
+#define EMIT_RI10(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
+#define EMIT_RI16(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, int imm)
+#define EMIT_RI18(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, int imm)
+#define EMIT_I16(_name, _op) \
+    extern void _name (struct spe_function *p, int imm)
+#define UNDEF_EMIT_MACROS
+#endif /* EMIT_ */
+
+
+/* Memory load / store instructions
+ */
+EMIT_RI10(spe_lqd,  0x034);
+EMIT_RR  (spe_lqx,  0x1c4);
+EMIT_RI16(spe_lqa,  0x061);
+EMIT_RI16(spe_lqr,  0x067);
+EMIT_RI10(spe_stqd, 0x024);
+EMIT_RR  (spe_stqx, 0x144);
+EMIT_RI16(spe_stqa, 0x041);
+EMIT_RI16(spe_stqr, 0x047);
+EMIT_RI7 (spe_cbd,  0x1f4);
+EMIT_RR  (spe_cbx,  0x1d4);
+EMIT_RI7 (spe_chd,  0x1f5);
+EMIT_RI7 (spe_chx,  0x1d5);
+EMIT_RI7 (spe_cwd,  0x1f6);
+EMIT_RI7 (spe_cwx,  0x1d6);
+EMIT_RI7 (spe_cdd,  0x1f7);
+EMIT_RI7 (spe_cdx,  0x1d7);
+
+
+/* Constant formation instructions
+ */
+EMIT_RI16(spe_ilh,   0x083);
+EMIT_RI16(spe_ilhu,  0x082);
+EMIT_RI16(spe_il,    0x081);
+EMIT_RI18(spe_ila,   0x021);
+EMIT_RI16(spe_iohl,  0x0c1);
+EMIT_RI16(spe_fsmbi, 0x0c5);
+
+
+/* Integer and logical instructions
+ */
+EMIT_RR  (spe_ah,      0x0c8);
+EMIT_RI10(spe_ahi,     0x01d);
+EMIT_RR  (spe_a,       0x0c0);
+EMIT_RI10(spe_ai,      0x01c);
+EMIT_RR  (spe_sfh,     0x048);
+EMIT_RI10(spe_sfhi,    0x00d);
+EMIT_RR  (spe_sf,      0x040);
+EMIT_RI10(spe_sfi,     0x00c);
+EMIT_RR  (spe_addx,    0x340);
+EMIT_RR  (spe_cg,      0x0c2);
+EMIT_RR  (spe_cgx,     0x342);
+EMIT_RR  (spe_sfx,     0x341);
+EMIT_RR  (spe_bg,      0x042);
+EMIT_RR  (spe_bgx,     0x343);
+EMIT_RR  (spe_mpy,     0x3c4);
+EMIT_RR  (spe_mpyu,    0x3cc);
+EMIT_RI10(spe_mpyi,    0x074);
+EMIT_RI10(spe_mpyui,   0x075);
+EMIT_RRR (spe_mpya,    0x00c);
+EMIT_RR  (spe_mpyh,    0x3c5);
+EMIT_RR  (spe_mpys,    0x3c7);
+EMIT_RR  (spe_mpyhh,   0x3c6);
+EMIT_RR  (spe_mpyhha,  0x346);
+EMIT_RR  (spe_mpyhhu,  0x3ce);
+EMIT_RR  (spe_mpyhhau, 0x34e);
+EMIT_R   (spe_clz,     0x2a5);
+EMIT_R   (spe_cntb,    0x2b4);
+EMIT_R   (spe_fsmb,    0x1b6);
+EMIT_R   (spe_fsmh,    0x1b5);
+EMIT_R   (spe_fsm,     0x1b4);
+EMIT_R   (spe_gbb,     0x1b2);
+EMIT_R   (spe_gbh,     0x1b1);
+EMIT_R   (spe_gb,      0x1b0);
+EMIT_RR  (spe_avgb,    0x0d3);
+EMIT_RR  (spe_absdb,   0x053);
+EMIT_RR  (spe_sumb,    0x253);
+EMIT_R   (spe_xsbh,    0x2b6);
+EMIT_R   (spe_xshw,    0x2ae);
+EMIT_R   (spe_xswd,    0x2a6);
+EMIT_RR  (spe_and,     0x0c1);
+EMIT_RR  (spe_andc,    0x2c1);
+EMIT_RI10(spe_andbi,   0x016);
+EMIT_RI10(spe_andhi,   0x015);
+EMIT_RI10(spe_andi,    0x014);
+EMIT_RR  (spe_or,      0x041);
+EMIT_RR  (spe_orc,     0x2c9);
+EMIT_RI10(spe_orbi,    0x006);
+EMIT_RI10(spe_orhi,    0x005);
+EMIT_RI10(spe_ori,     0x004);
+EMIT_R   (spe_orx,     0x1f0);
+EMIT_RR  (spe_xor,     0x241);
+EMIT_RI10(spe_xorbi,   0x026);
+EMIT_RI10(spe_xorhi,   0x025);
+EMIT_RI10(spe_xori,    0x024);
+EMIT_RR  (spe_nand,    0x0c9);
+EMIT_RR  (spe_nor,     0x049);
+EMIT_RR  (spe_eqv,     0x249);
+EMIT_RRR (spe_selb,    0x008);
+EMIT_RRR (spe_shufb,   0x00b);
+
+
+/* Shift and rotate instructions
+ */
+EMIT_RR  (spe_shlh,      0x05f);
+EMIT_RI7 (spe_shlhi,     0x07f);
+EMIT_RR  (spe_shl,       0x05b);
+EMIT_RI7 (spe_shli,      0x07b);
+EMIT_RR  (spe_shlqbi,    0x1db);
+EMIT_RI7 (spe_shlqbii,   0x1fb);
+EMIT_RR  (spe_shlqby,    0x1df);
+EMIT_RI7 (spe_shlqbyi,   0x1ff);
+EMIT_RR  (spe_shlqbybi,  0x1cf);
+EMIT_RR  (spe_roth,      0x05c);
+EMIT_RI7 (spe_rothi,     0x07c);
+EMIT_RR  (spe_rot,       0x058);
+EMIT_RI7 (spe_roti,      0x078);
+EMIT_RR  (spe_rotqby,    0x1dc);
+EMIT_RI7 (spe_rotqbyi,   0x1fc);
+EMIT_RR  (spe_rotqbybi,  0x1cc);
+EMIT_RR  (spe_rotqbi,    0x1d8);
+EMIT_RI7 (spe_rotqbii,   0x1f8);
+EMIT_RR  (spe_rothm,     0x05d);
+EMIT_RI7 (spe_rothmi,    0x07d);
+EMIT_RR  (spe_rotm,      0x059);
+EMIT_RI7 (spe_rotmi,     0x079);
+EMIT_RR  (spe_rotqmby,   0x1dd);
+EMIT_RI7 (spe_rotqmbyi,  0x1fd);
+EMIT_RR  (spe_rotqmbybi, 0x1cd);
+EMIT_RR  (spe_rotqmbi,   0x1d9);
+EMIT_RI7 (spe_rotqmbii,  0x1f9);
+EMIT_RR  (spe_rotmah,    0x05e);
+EMIT_RI7 (spe_rotmahi,   0x07e);
+EMIT_RR  (spe_rotma,     0x05a);
+EMIT_RI7 (spe_rotmai,    0x07a);
+
+
+/* Compare, branch, and halt instructions
+ */
+EMIT_RR  (spe_heq,       0x3d8);
+EMIT_RI10(spe_heqi,      0x07f);
+EMIT_RR  (spe_hgt,       0x258);
+EMIT_RI10(spe_hgti,      0x04f);
+EMIT_RR  (spe_hlgt,      0x2d8);
+EMIT_RI10(spe_hlgti,     0x05f);
+EMIT_RR  (spe_ceqb,      0x3d0);
+EMIT_RI10(spe_ceqbi,     0x07e);
+EMIT_RR  (spe_ceqh,      0x3c8);
+EMIT_RI10(spe_ceqhi,     0x07d);
+EMIT_RR  (spe_ceq,       0x3c0);
+EMIT_RI10(spe_ceqi,      0x07c);
+EMIT_RR  (spe_cgtb,      0x250);
+EMIT_RI10(spe_cgtbi,     0x04e);
+EMIT_RR  (spe_cgth,      0x248);
+EMIT_RI10(spe_cgthi,     0x04d);
+EMIT_RR  (spe_cgt,       0x240);
+EMIT_RI10(spe_cgti,      0x04c);
+EMIT_RR  (spe_clgtb,     0x2d0);
+EMIT_RI10(spe_clgtbi,    0x05e);
+EMIT_RR  (spe_clgth,     0x2c8);
+EMIT_RI10(spe_clgthi,    0x05d);
+EMIT_RR  (spe_clgt,      0x2c0);
+EMIT_RI10(spe_clgti,     0x05c);
+EMIT_I16 (spe_br,        0x064);
+EMIT_I16 (spe_bra,       0x060);
+EMIT_RI16(spe_brsl,      0x066);
+EMIT_RI16(spe_brasl,     0x062);
+EMIT_RI16(spe_brnz,      0x042);
+EMIT_RI16(spe_brz,       0x040);
+EMIT_RI16(spe_brhnz,     0x046);
+EMIT_RI16(spe_brhz,      0x044);
+
+extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
+extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
+extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_biz(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_binz(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+
+
+/* Floating-point instructions
+ */
+EMIT_RR  (spe_fa,         0x2c4);
+EMIT_RR  (spe_dfa,        0x2cc);
+EMIT_RR  (spe_fs,         0x2c5);
+EMIT_RR  (spe_dfs,        0x2cd);
+EMIT_RR  (spe_fm,         0x2c6);
+EMIT_RR  (spe_dfm,        0x2ce);
+EMIT_RRR (spe_fma,        0x00e);
+EMIT_RR  (spe_dfma,       0x35c);
+EMIT_RRR (spe_fnms,       0x00d);
+EMIT_RR  (spe_dfnms,      0x35e);
+EMIT_RRR (spe_fms,        0x00f);
+EMIT_RR  (spe_dfms,       0x35d);
+EMIT_RR  (spe_dfnma,      0x35f);
+EMIT_R   (spe_frest,      0x1b8);
+EMIT_R   (spe_frsqest,    0x1b9);
+EMIT_RR  (spe_fi,         0x3d4);
+EMIT_RI8 (spe_csflt,      0x1da);
+EMIT_RI8 (spe_cflts,      0x1d8);
+EMIT_RI8 (spe_cuflt,      0x1db);
+EMIT_RI8 (spe_cfltu,      0x1d9);
+EMIT_R   (spe_frds,       0x3b9);
+EMIT_R   (spe_fesd,       0x3b8);
+EMIT_RR  (spe_dfceq,      0x3c3);
+EMIT_RR  (spe_dfcmeq,     0x3cb);
+EMIT_RR  (spe_dfcgt,      0x2c3);
+EMIT_RR  (spe_dfcmgt,     0x2cb);
+EMIT_RI7 (spe_dftsv,      0x3bf);
+EMIT_RR  (spe_fceq,       0x3c2);
+EMIT_RR  (spe_fcmeq,      0x3ca);
+EMIT_RR  (spe_fcgt,       0x2c2);
+EMIT_RR  (spe_fcmgt,      0x2ca);
+EMIT_R   (spe_fscrwr,     0x3ba);
+EMIT_    (spe_fscrrd,     0x398);
+
+
+/* Channel instructions
+ */
+EMIT_R   (spe_rdch,       0x00d);
+EMIT_R   (spe_rdchcnt,    0x00f);
+EMIT_R   (spe_wrch,       0x10d);
+
+
+#ifdef UNDEF_EMIT_MACROS
+#undef EMIT_
+#undef EMIT_R
+#undef EMIT_RR
+#undef EMIT_RRR
+#undef EMIT_RI7
+#undef EMIT_RI8
+#undef EMIT_RI10
+#undef EMIT_RI16
+#undef EMIT_RI18
+#undef EMIT_I16
+#undef UNDEF_EMIT_MACROS
+#endif /* EMIT_ */
-- 
cgit v1.2.3


From b1525662b330ca8b4cdd930775f3642bfec3b58f Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 10 Mar 2008 16:28:54 -0700
Subject: Move SPE register allocator to rtasm code

Move the register allocator to a common location.  There is more code
on the way that will make use of this interface.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  47 +++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  16 ++++
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c | 101 +++++++----------------
 3 files changed, 92 insertions(+), 72 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 95a2d6fcbb..a996218ce7 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -306,6 +306,11 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
 {
     p->store = align_malloc(code_size, 16);
     p->csr = p->store;
+    
+    /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
+     */
+    p->regs[0] = ~7;
+    p->regs[1] = (1U << (80 - 64)) - 1;
 }
 
 
@@ -317,6 +322,48 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+int spe_allocate_available_register(struct spe_function *p)
+{
+   unsigned i;
+   for (i = 0; i < 128; i++) {
+      /* regs[] packs 64 registers per uint64_t word; a set bit is free. */
+      const uint64_t mask = (1ULL << (i % 64));
+      const unsigned idx = i / 64;
+      if ((p->regs[idx] & mask) != 0) {
+         p->regs[idx] &= ~mask;
+         return i;
+      }
+   }
+
+   return -1;
+}
+
+
+int spe_allocate_register(struct spe_function *p, int reg)
+{
+   const unsigned idx = reg / 64;   /* regs[] holds 64 registers per word */
+   const unsigned bit = reg % 64;
+   /* The register must currently be available (bit set) to be claimed. */
+   assert((p->regs[idx] & (1ULL << bit)) != 0);
+
+   p->regs[idx] &= ~(1ULL << bit);
+   return reg;
+}
+
+
+void spe_release_register(struct spe_function *p, int reg)
+{
+   const unsigned idx = reg / 64;   /* regs[] holds 64 registers per word */
+   const unsigned bit = reg % 64;
+   /* Only a currently allocated register (bit clear) may be released. */
+   assert((p->regs[idx] & (1ULL << bit)) == 0);
+
+   p->regs[idx] |= (1ULL << bit);
+}
+
+
+
+
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 10ce44b3a0..5a1eb1ed8d 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -39,11 +39,27 @@ struct spe_function {
     uint32_t *store;
     uint32_t *csr;
     const char *fn;
+
+    /**
+     * Mask of used / unused registers
+     *
+     * Each set bit corresponds to an available register.  Each cleared bit
+     * corresponds to an allocated register.
+     *
+     * \sa
+     * spe_allocate_register, spe_allocate_available_register,
+     * spe_release_register
+     */
+    uint64_t regs[2];
 };
 
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
 
+extern int spe_allocate_available_register(struct spe_function *p);
+extern int spe_allocate_register(struct spe_function *p, int reg);
+extern void spe_release_register(struct spe_function *p, int reg);
+
 #endif /* RTASM_PPC_SPE_H */
 
 #ifndef EMIT_
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 9cf74bab47..4828a8023b 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -33,46 +33,11 @@
 #include "cell_context.h"
 #include "rtasm/rtasm_ppc_spe.h"
 
-typedef uint64_t register_mask;
-
-int allocate_available_register(register_mask *m)
-{
-   unsigned i;
-   for (i = 0; i < 64; i++) {
-      const uint64_t mask = (1ULL << i);
-
-      if ((m[0] & mask) != 0) {
-	 m[0] &= ~mask;
-	 return i;
-      }
-   }
-
-   return -1;
-}
-
-
-int allocate_register(register_mask *m, unsigned reg)
-{
-   assert((m[0] & (1ULL << reg)) != 0);
-
-   m[0] &= ~(1ULL << reg);
-   return reg;
-}
-
-
-void release_register(register_mask *m, unsigned reg)
-{
-   assert((m[0] & (1ULL << reg)) == 0);
-
-   m[0] |= (1ULL << reg);
-}
-
 
 /**
  * Emit a 4x4 matrix transpose operation
  *
  * \param p         Function that the transpose operation is to be appended to
- * \param m         Live register mask
  * \param row0      Register containing row 0 of the source matrix
  * \param row1      Register containing row 1 of the source matrix
  * \param row2      Register containing row 2 of the source matrix
@@ -91,15 +56,15 @@ void release_register(register_mask *m, unsigned reg)
  * This function requires that four temporary are available on entry.
  */
 static void
-emit_matrix_transpose(struct spe_function *p, register_mask *m,
+emit_matrix_transpose(struct spe_function *p,
 		      unsigned row0, unsigned row1, unsigned row2,
 		      unsigned row3, unsigned dest_ptr,
 		      unsigned shuf_ptr, unsigned count)
 {
-   int shuf_hi = allocate_available_register(m);
-   int shuf_lo = allocate_available_register(m);
-   int t1 = allocate_available_register(m);
-   int t2 = allocate_available_register(m);
+   int shuf_hi = spe_allocate_available_register(p);
+   int shuf_lo = spe_allocate_available_register(p);
+   int t1 = spe_allocate_available_register(p);
+   int t2 = spe_allocate_available_register(p);
    int t3;
    int t4;
    int col0;
@@ -169,19 +134,19 @@ emit_matrix_transpose(struct spe_function *p, register_mask *m,
 
    /* Release all of the temporary registers used.
     */
-   release_register(m, col0);
-   release_register(m, col1);
-   release_register(m, col2);
-   release_register(m, col3);
-   release_register(m, shuf_hi);
-   release_register(m, shuf_lo);
-   release_register(m, t2);
-   release_register(m, t4);
+   spe_release_register(p, col0);
+   spe_release_register(p, col1);
+   spe_release_register(p, col2);
+   spe_release_register(p, col3);
+   spe_release_register(p, shuf_hi);
+   spe_release_register(p, shuf_lo);
+   spe_release_register(p, t2);
+   spe_release_register(p, t4);
 }
 
 
 static void
-emit_fetch(struct spe_function *p, register_mask *m,
+emit_fetch(struct spe_function *p,
 	   unsigned in_ptr, unsigned *offset,
 	   unsigned out_ptr, unsigned shuf_ptr,
 	   enum pipe_format format)
@@ -191,11 +156,11 @@ emit_fetch(struct spe_function *p, register_mask *m,
    const unsigned type = pf_type(format);
    const unsigned bytes = pf_size_x(format);
 
-   int v0 = allocate_available_register(m);
-   int v1 = allocate_available_register(m);
-   int v2 = allocate_available_register(m);
-   int v3 = allocate_available_register(m);
-   int tmp = allocate_available_register(m);
+   int v0 = spe_allocate_available_register(p);
+   int v1 = spe_allocate_available_register(p);
+   int v2 = spe_allocate_available_register(p);
+   int v3 = spe_allocate_available_register(p);
+   int tmp = spe_allocate_available_register(p);
    int float_zero = -1;
    int float_one = -1;
    float scale_signed = 0.0;
@@ -260,19 +225,19 @@ emit_fetch(struct spe_function *p, register_mask *m,
 
 
    if (count < 4) {
-      float_one = allocate_available_register(m);
+      float_one = spe_allocate_available_register(p);
       spe_il(p, float_one, 1);
       spe_cuflt(p, float_one, float_one, 0);
       
       if (count < 3) {
-	 float_zero = allocate_available_register(m);
+	 float_zero = spe_allocate_available_register(p);
 	 spe_il(p, float_zero, 0);
       }
    }
 
-   release_register(m, tmp);
+   spe_release_register(p, tmp);
 
-   emit_matrix_transpose(p, m, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
+   emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
 
    switch (count) {
    case 1:
@@ -284,11 +249,11 @@ emit_fetch(struct spe_function *p, register_mask *m,
    }
 
    if (float_zero != -1) {
-      release_register(m, float_zero);
+      spe_release_register(p, float_zero);
    }
 
    if (float_one != -1) {
-      release_register(m, float_one);
+      spe_release_register(p, float_one);
    }
 }
 
@@ -297,7 +262,6 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 {
    struct cell_context *const cell =
        (struct cell_context *) draw->driver_private;
-   register_mask m = ~0;
    struct spe_function *p = &cell->attrib_fetch;
    unsigned function_index[PIPE_ATTRIB_MAX];
    unsigned unique_attr_formats;
@@ -338,18 +302,11 @@ void cell_update_vertex_fetch(struct draw_context *draw)
    spe_init_func(p, 136 * unique_attr_formats);
 
 
-   /* Registers 0, 1, and 2 are reserved by the ABI.
-    */
-   allocate_register(&m, 0);
-   allocate_register(&m, 1);
-   allocate_register(&m, 2);
-
-
    /* Allocate registers for the function's input parameters.
     */
-   out_ptr = allocate_register(&m, 3);
-   in_ptr = allocate_register(&m, 4);
-   shuf_ptr = allocate_register(&m, 5);
+   out_ptr = spe_allocate_register(p, 3);
+   in_ptr = spe_allocate_register(p, 4);
+   shuf_ptr = spe_allocate_register(p, 5);
 
 
    /* Generate code for the individual attribute fetch functions.
@@ -362,7 +319,7 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 						     - (void *) p->store);
 
 	 offset = 0;
-	 emit_fetch(p, & m, in_ptr, &offset, out_ptr, shuf_ptr,
+	 emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr,
 		    draw->vertex_element[i].src_format);
 	 spe_bi(p, 0, 0, 0);
 
-- 
cgit v1.2.3


From 84d8030735844785c3c97679db2bc1892a9c8c70 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 24 Mar 2008 12:15:59 -0700
Subject: cell: Float convert-to and convert-from instructions use different
 shift bias

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c |  4 ++--
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 24be65bff9..7f6bf577b2 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -267,10 +267,10 @@ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
     emit_RI7(p, _op, rT, rA, imm); \
 }
 
-#define EMIT_RI8(_name, _op) \
+#define EMIT_RI8(_name, _op, bias) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI8(p, _op, rT, rA, 155 - imm); \
+    emit_RI8(p, _op, rT, rA, bias - imm); \
 }
 
 #define EMIT_RI10(_name, _op) \
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 5a1eb1ed8d..1cacc717b1 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -76,7 +76,7 @@ extern void spe_release_register(struct spe_function *p, int reg);
 #define EMIT_RI7(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
 			   int imm)
-#define EMIT_RI8(_name, _op) \
+#define EMIT_RI8(_name, _op, bias) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
 			   int imm)
 #define EMIT_RI10(_name, _op) \
@@ -289,10 +289,10 @@ EMIT_RR  (spe_dfnma,      0x35f);
 EMIT_R   (spe_frest,      0x1b8);
 EMIT_R   (spe_frsqest,    0x1b9);
 EMIT_RR  (spe_fi,         0x3d4);
-EMIT_RI8 (spe_csflt,      0x1da);
-EMIT_RI8 (spe_cflts,      0x1d8);
-EMIT_RI8 (spe_cuflt,      0x1db);
-EMIT_RI8 (spe_cfltu,      0x1d9);
+EMIT_RI8 (spe_csflt,      0x1da, 155);
+EMIT_RI8 (spe_cflts,      0x1d8, 173);
+EMIT_RI8 (spe_cuflt,      0x1db, 155);
+EMIT_RI8 (spe_cfltu,      0x1d9, 173);
 EMIT_R   (spe_frds,       0x3b9);
 EMIT_R   (spe_fesd,       0x3b8);
 EMIT_RR  (spe_dfceq,      0x3c3);
-- 
cgit v1.2.3


From bb5becf1e289b2c9240d98299e9447a9673da9fc Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 5 Sep 2008 13:54:14 -0600
Subject: gallium: comments, assertions, etc

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 36 +++++++++++++++++++++++++----
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 20 +++++++++-------
 2 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 285ddc0e3f..fe5beba456 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -300,7 +300,9 @@ void _name (struct spe_function *p, int imm) \
 #include "rtasm_ppc_spe.h"
 
 
-/*
+/**
+ * Initialize an spe_function.
+ * \param code_size  size of instruction buffer to allocate, in bytes.
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
@@ -324,10 +326,14 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+/**
+ * Allocate a SPE register.
+ * \return register index or -1 if none left.
+ */
 int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
-   for (i = 0; i < 128; i++) {
+   for (i = 0; i < SPE_NUM_REGS; i++) {
       const uint64_t mask = (1ULL << (i % 64));
       const unsigned idx = i / 64;
 
@@ -341,11 +347,15 @@ int spe_allocate_available_register(struct spe_function *p)
 }
 
 
+/**
+ * Mark the given SPE register as "allocated".
+ */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) != 0);
 
    p->regs[idx] &= ~(1ULL << bit);
@@ -353,57 +363,73 @@ int spe_allocate_register(struct spe_function *p, int reg)
 }
 
 
+/**
+ * Mark the given SPE register as "unallocated".
+ */
 void spe_release_register(struct spe_function *p, int reg)
 {
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) == 0);
 
    p->regs[idx] |= (1ULL << bit);
 }
 
 
+/**
+ * For branch instructions:
+ * \param d  if 1, disable interrupts if branch is taken
+ * \param e  if 1, enable interrupts if branch is taken
+ * If d and e are both zero, don't change interrupt status (right?)
+ */
 
-
+/** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
 }
 
+/** Interrupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect and set link on external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
     emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
     emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
 }
 
-void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d,
-		int e)
+/** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
+void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 1cacc717b1..7dd754ba77 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -32,13 +32,17 @@
 #ifndef RTASM_PPC_SPE_H
 #define RTASM_PPC_SPE_H
 
-struct spe_function {
-    /**
-     *
-     */
-    uint32_t *store;
-    uint32_t *csr;
-    const char *fn;
+/** 4 bytes per instruction */
+#define SPE_INST_SIZE 4
+
+/** number of general-purpose SIMD registers */
+#define SPE_NUM_REGS  128
+
+struct spe_function
+{
+    uint32_t *store;  /**< instruction buffer */
+    uint32_t *csr;    /**< next free pos in instruction buffer */
+    const char *fn;   /**< unused */
 
     /**
      * Mask of used / unused registers
@@ -50,7 +54,7 @@ struct spe_function {
      * spe_allocate_register, spe_allocate_available_register,
      * spe_release_register
      */
-    uint64_t regs[2];
+    uint64_t regs[SPE_NUM_REGS / 64];
 };
 
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
-- 
cgit v1.2.3


From ee582fd3a7a9ddbcb5595249201cf213a6c6f014 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 10 Sep 2008 17:11:48 -0600
Subject: gallium: assorted additions and fixes to Cell SPE rtasm code

Fix incorrect opcode for fsmbi.
Added "macro" functions for loading floats/ints, register complement, zero, move.
Added #defines for return address and stack pointer registers.
Added assertions to check that the instruction buffer doesn't overflow.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 88 +++++++++++++++++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 38 +++++++++++--
 2 files changed, 105 insertions(+), 21 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index fe5beba456..61010e4333 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -151,8 +151,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -165,8 +165,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rC = rC;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -178,8 +178,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i7 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -192,8 +192,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i8 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -206,8 +206,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i10 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -218,8 +218,8 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i16 = imm;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -230,8 +230,8 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i18 = imm;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -307,8 +307,9 @@ void _name (struct spe_function *p, int imm) \
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
     p->store = align_malloc(code_size, 16);
-    p->csr = p->store;
-    
+    p->num_inst = 0;
+    p->max_inst = code_size / SPE_INST_SIZE;
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
     p->regs[0] = ~7;
@@ -318,11 +319,11 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
 
 void spe_release_func(struct spe_function *p)
 {
+    assert(p->num_inst <= p->max_inst);
     if (p->store != NULL) {
         align_free(p->store);
     }
     p->store = NULL;
-    p->csr = NULL;
 }
 
 
@@ -337,6 +338,7 @@ int spe_allocate_available_register(struct spe_function *p)
       const uint64_t mask = (1ULL << (i % 64));
       const unsigned idx = i / 64;
 
+      assert(idx < 2);
       if ((p->regs[idx] & mask) != 0) {
          p->regs[idx] &= ~mask;
          return i;
@@ -371,6 +373,8 @@ void spe_release_register(struct spe_function *p, int reg)
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(idx < 2);
+
    assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) == 0);
 
@@ -458,4 +462,54 @@ EMIT_R   (spe_mfspr, 0x00c);
 EMIT_R   (spe_mtspr, 0x10c);
 #endif
 
+
+/**
+ ** Helper / "macro" instructions.
+ ** Use somewhat verbose names as a reminder that these aren't native
+ ** SPE instructions.
+ **/
+
+
+void
+spe_load_float(struct spe_function *p, unsigned rT, float x)
+{
+   union {
+      float f;
+      unsigned u;
+   } bits;
+   bits.f = x;
+   spe_ilhu(p, rT, bits.u >> 16);
+   spe_iohl(p, rT, bits.u & 0xffff);
+}
+
+
+void
+spe_load_int(struct spe_function *p, unsigned rT, int i)
+{
+   spe_ilhu(p, rT, i >> 16);
+   spe_iohl(p, rT, i & 0xffff);
+}
+
+
+void
+spe_complement(struct spe_function *p, unsigned rT)
+{
+   spe_nor(p, rT, rT, rT);
+}
+
+
+void
+spe_move(struct spe_function *p, unsigned rT, unsigned rA)
+{
+   spe_ori(p, rT, rA, 0);
+}
+
+
+void
+spe_zero(struct spe_function *p, unsigned rT)
+{
+   spe_xor(p, rT, rT, rT);
+}
+
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7dd754ba77..dee8c55c4a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -25,6 +25,7 @@
 /**
  * \file
  * Real-time assembly generation interface for Cell B.E. SPEs.
+ * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <idr@us.ibm.com>
  */
@@ -38,11 +39,18 @@
 /** number of general-purpose SIMD registers */
 #define SPE_NUM_REGS  128
 
+/** Return Address register */
+#define SPE_REG_RA  0
+
+/** Stack Pointer register */
+#define SPE_REG_SP  1
+
+
 struct spe_function
 {
-    uint32_t *store;  /**< instruction buffer */
-    uint32_t *csr;    /**< next free pos in instruction buffer */
-    const char *fn;   /**< unused */
+   uint32_t *store;  /**< instruction buffer */
+   uint num_inst;
+   uint max_inst;
 
     /**
      * Mask of used / unused registers
@@ -123,7 +131,8 @@ EMIT_RI16(spe_ilhu,  0x082);
 EMIT_RI16(spe_il,    0x081);
 EMIT_RI18(spe_ila,   0x021);
 EMIT_RI16(spe_iohl,  0x0c1);
-EMIT_RI16(spe_fsmbi, 0x0c5);
+EMIT_RI16(spe_fsmbi, 0x065);
+
 
 
 /* Integer and logical instructions
@@ -275,6 +284,27 @@ extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA,
     int d, int e);
 
 
+/** Load/splat immediate float into rT. */
+extern void
+spe_load_float(struct spe_function *p, unsigned rT, float x);
+
+/** Load/splat immediate int into rT. */
+extern void
+spe_load_int(struct spe_function *p, unsigned rT, int i);
+
+/** Complement/invert all bits in rT. */
+extern void
+spe_complement(struct spe_function *p, unsigned rT);
+
+/** rT = rA. */
+extern void
+spe_move(struct spe_function *p, unsigned rT, unsigned rA);
+
+/** rT = {0,0,0,0}. */
+extern void
+spe_zero(struct spe_function *p, unsigned rT);
+
+
 /* Floating-point instructions
  */
 EMIT_RR  (spe_fa,         0x2c4);
-- 
cgit v1.2.3


From 178bbaff80d079606a1135bd65f1a85bac9774c4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 17:07:30 -0600
Subject: gallium: add special cases in spe_load_float(), spe_load_int(), added
 spe_splat()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 45 +++++++++++++++++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h |  4 +++
 2 files changed, 40 insertions(+), 9 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 61010e4333..a04cc6c4ff 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -473,21 +473,48 @@ EMIT_R   (spe_mtspr, 0x10c);
 void
 spe_load_float(struct spe_function *p, unsigned rT, float x)
 {
-   union {
-      float f;
-      unsigned u;
-   } bits;
-   bits.f = x;
-   spe_ilhu(p, rT, bits.u >> 16);
-   spe_iohl(p, rT, bits.u & 0xffff);
+   if (x == 0.0f) {
+      spe_il(p, rT, 0x0);
+   }
+   else if (x == 0.5f) {
+      spe_ilhu(p, rT, 0x3f00);
+   }
+   else if (x == 1.0f) {
+      spe_ilhu(p, rT, 0x3f80);
+   }
+   else if (x == -1.0f) {
+      spe_ilhu(p, rT, 0xbf80);
+   }
+   else {
+      union {
+         float f;
+         unsigned u;
+      } bits;
+      bits.f = x;
+      spe_ilhu(p, rT, bits.u >> 16);
+      spe_iohl(p, rT, bits.u & 0xffff);
+   }
 }
 
 
 void
 spe_load_int(struct spe_function *p, unsigned rT, int i)
 {
-   spe_ilhu(p, rT, i >> 16);
-   spe_iohl(p, rT, i & 0xffff);
+   if (-32768 <= i && i <= 32767) {
+      spe_il(p, rT, i);
+   }
+   else {
+      spe_ilhu(p, rT, i >> 16);
+      spe_iohl(p, rT, i & 0xffff);
+   }
+}
+
+
+void
+spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
+{
+   spe_ila(p, rT, 66051);
+   spe_shufb(p, rT, rA, rA, rT);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index dee8c55c4a..d95e5aace3 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -292,6 +292,10 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Replicate word 0 of rA across rT. */
+extern void
+spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
+
 /** Complement/invert all bits in rT. */
 extern void
 spe_complement(struct spe_function *p, unsigned rT);
-- 
cgit v1.2.3


From 31a112cad4d2e515bc668b58abd4e402b4362c70 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 21:08:01 -0600
Subject: gallium: added spe_splat_word()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 25 +++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h |  4 ++++
 2 files changed, 29 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 62e3adb357..89f8e24ce6 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -540,4 +540,29 @@ spe_zero(struct spe_function *p, unsigned rT)
 }
 
 
+void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+{
+   assert(word >= 0);
+   assert(word <= 3);
+
+   if (word == 0) {
+      int tmp1 = rT;
+      spe_ila(p, tmp1, 66051);
+      spe_shufb(p, rT, rA, rA, tmp1);
+   }
+   else {
+      /* XXX review this, we may not need the rotqbyi instruction */
+      int tmp1 = rT;
+      int tmp2 = spe_allocate_available_register(p);
+
+      spe_ila(p, tmp1, 66051);
+      spe_rotqbyi(p, tmp2, rA, 4 * word);
+      spe_shufb(p, rT, tmp2, tmp2, tmp1);
+
+      spe_release_register(p, tmp2);
+   }
+}
+
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d95e5aace3..7a3ab9ace5 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -308,6 +308,10 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA);
 extern void
 spe_zero(struct spe_function *p, unsigned rT);
 
+/** rT = splat(rA, word) */
+extern void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+
 
 /* Floating-point instructions
  */
-- 
cgit v1.2.3


From 8b5013d232bf6846717fac093465e8a39064e0b6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 21:52:47 -0600
Subject: gallium: added print/dump code to SPE code emitter

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 128 ++++++++++++++++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h |  10 +++
 2 files changed, 113 insertions(+), 25 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 89f8e24ce6..8718be9ded 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -27,12 +27,16 @@
  * Real-time assembly generation interface for Cell B.E. SPEs.
  *
  * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
+
+#include <stdio.h>
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
 #include "rtasm_ppc_spe.h"
 
+
 #ifdef GALLIUM_CELL
 /**
  * SPE instruction types
@@ -143,8 +147,25 @@ union spe_inst_RI18 {
 /*@}*/
 
 
+static void
+indent(const struct spe_function *p)
+{
+   int i;
+   for (i = 0; i < p->indent; i++) {
+      putchar(' ');
+   }
+}
+
+
+static const char *
+rem_prefix(const char *longname)
+{
+   return longname + 4;
+}
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB)
+		    unsigned rA, unsigned rB, const char *name)
 {
     union spe_inst_RR inst;
     inst.inst.op = op;
@@ -153,11 +174,15 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, r%d, r%d\n", rem_prefix(name), rT, rA, rB);
+    }
 }
 
 
 static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB, unsigned rC)
+                     unsigned rA, unsigned rB, unsigned rC, const char *name)
 {
     union spe_inst_RRR inst;
     inst.inst.op = op;
@@ -167,11 +192,15 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rC = rC;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {  /* dump the instruction just stored: name, then rT, rA, rB, rC */
+       indent(p);
+       printf("%s\tr%d, r%d, r%d, r%d\n", rem_prefix(name), rT, rA, rB, rC);
+    }
 }
 
 
 static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI7 inst;
     inst.inst.op = op;
@@ -180,12 +209,16 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, r%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI8 inst;
     inst.inst.op = op;
@@ -194,12 +227,16 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, r%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
-		      unsigned rA, int imm)
+		      unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI10 inst;
     inst.inst.op = op;
@@ -208,11 +245,15 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, r%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI16 inst;
     inst.inst.op = op;
@@ -220,11 +261,15 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, 0x%x\n", rem_prefix(name), rT, imm);
+    }
 }
 
 
 static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI18 inst;
     inst.inst.op = op;
@@ -232,6 +277,10 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, 0x%x\n", rem_prefix(name), rT, imm);
+    }
 }
 
 
@@ -240,61 +289,61 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
 { \
-    emit_RR(p, _op, rT, 0, 0); \
+   emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
 }
 
 #define EMIT_R(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA) \
 { \
-    emit_RR(p, _op, rT, rA, 0); \
+   emit_RR(p, _op, rT, rA, 0, __FUNCTION__);                 \
 }
 
 #define EMIT_RR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
 { \
-    emit_RR(p, _op, rT, rA, rB); \
+   emit_RR(p, _op, rT, rA, rB, __FUNCTION__);                \
 }
 
 #define EMIT_RRR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
 { \
-    emit_RRR(p, _op, rT, rA, rB, rC); \
+   emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__);           \
 }
 
 #define EMIT_RI7(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI7(p, _op, rT, rA, imm); \
+   emit_RI7(p, _op, rT, rA, imm, __FUNCTION__);              \
 }
 
 #define EMIT_RI8(_name, _op, bias) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI8(p, _op, rT, rA, bias - imm); \
+   emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__);       \
 }
 
 #define EMIT_RI10(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI10(p, _op, rT, rA, imm); \
+   emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI16(p, _op, rT, imm); \
+   emit_RI16(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_RI18(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI18(p, _op, rT, imm); \
+   emit_RI18(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_I16(_name, _op) \
 void _name (struct spe_function *p, int imm) \
 { \
-    emit_RI16(p, _op, 0, imm); \
+   emit_RI16(p, _op, 0, imm, __FUNCTION__);                  \
 }
 
 #include "rtasm_ppc_spe.h"
@@ -314,6 +363,9 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
      */
     p->regs[0] = ~7;
     p->regs[1] = (1U << (80 - 64)) - 1;
+
+    p->print = false;
+    p->indent = 0;
 }
 
 
@@ -382,6 +434,32 @@ void spe_release_register(struct spe_function *p, int reg)
 }
 
 
+void
+spe_print_code(struct spe_function *p, boolean enable)
+{
+   p->print = enable;
+}
+
+
+void
+spe_indent(struct spe_function *p, int spaces)
+{
+   p->indent += spaces;
+}
+
+
+extern void
+spe_comment(struct spe_function *p, int rel_indent, const char *s)
+{
+   if (p->print) {
+      p->indent += rel_indent;
+      indent(p);
+      p->indent -= rel_indent;
+      printf("%s\n", s);
+   }
+}
+
+
 /**
  * For branch instructions:
  * \param d  if 1, disable interupts if branch is taken
@@ -392,51 +470,51 @@ void spe_release_register(struct spe_function *p, int reg)
 /** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Interupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link on external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
 void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7a3ab9ace5..2579045232 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -28,6 +28,7 @@
  * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
 #ifndef RTASM_PPC_SPE_H
@@ -63,8 +64,12 @@ struct spe_function
      * spe_release_register
      */
     uint64_t regs[SPE_NUM_REGS / 64];
+
+    boolean print; /**< print/dump instructions as they're emitted? */
+    int indent;    /**< number of spaces to indent */
 };
 
+
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
 
@@ -72,6 +77,11 @@ extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
 
+extern void spe_print_code(struct spe_function *p, boolean enable);
+extern void spe_indent(struct spe_function *p, int spaces);
+extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
+
+
 #endif /* RTASM_PPC_SPE_H */
 
 #ifndef EMIT_
-- 
cgit v1.2.3


From f8bba34d4e12ef4c620cac881a4b697a1e668377 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 18 Sep 2008 01:29:41 -0600
Subject: CELL: finish fragment ops blending (except for unusual D3D modes)

- Added new "macro" functions spe_float_min() and spe_float_max()
  to rtasm_ppc_spe.{ch}.  These emit instructions that cause
  the minimum or maximum of each element in a vector of floats
  to be saved in the destination register.

- Major changes to cell_gen_fragment.c to implement all the blending
  modes (except for the mysterious D3D-based PIPE_BLENDFACTOR_SRC1_COLOR,
  PIPE_BLENDFACTOR_SRC1_ALPHA, PIPE_BLENDFACTOR_INV_SRC1_COLOR, and
  PIPE_BLENDFACTOR_INV_SRC1_ALPHA).

- Some revamping of code in cell_gen_fragment.c: use the new spe_float_min()
  and spe_float_max() functions (instead of expanding these calculations
  inline via macros); create and use an inline utility function for handling
  "optional" register allocation (for the {1,1,1,1} vector, and the
  blend color vectors) instead of expanding with macros; use the Float
  Multiply and Subtract (fnms) instruction to simplify and optimize many
  blending calculations.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  41 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |   8 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 546 ++++++++++++++---------
 3 files changed, 377 insertions(+), 218 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 870ae802c5..12e0826fb9 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -384,7 +384,7 @@ void spe_release_func(struct spe_function *p)
 
 
 /**
- * Alloate a SPE register.
+ * Allocate a SPE register.
  * \return register index or -1 if none left.
  */
 int spe_allocate_available_register(struct spe_function *p)
@@ -646,5 +646,44 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
    }
 }
 
+/* For each 32-bit float element of rA and rB, choose the smaller of the
+ * two, compositing them into the rT register.
+ * 
+ * The Float Compare Greater Than (fcgt) instruction will put 1s into
+ * compare_reg where rA > rB, and 0s where rA <= rB.
+ *
+ * Then the Select Bits (selb) instruction will take bits from rA where
+ * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
+ * where rA <= rB and from rB where rA > rB, which is exactly the
+ * "min" operation.
+ *
+ * The compare_reg could in many cases be the same as rT, unless
+ * rT == rA || rT == rB.  But since this is common in constructions
+ * like "x = min(x, a)", we always allocate a new register to be safe.
+ */
+void 
+spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rA, rB, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
+/* For each 32-bit float element of rA and rB, choose the greater of the
+ * two, compositing them into the rT register.
+ * 
+ * The logic is similar to that of spe_float_min() above; the only
+ * difference is that the registers on spe_selb() have been reversed,
+ * so that the larger of the two is selected instead of the smaller.
+ */
+void 
+spe_float_max(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rB, rA, compare_reg);
+   spe_release_register(p, compare_reg);
+}
 
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 2579045232..4ef05ea27d 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -322,6 +322,14 @@ spe_zero(struct spe_function *p, unsigned rT);
 extern void
 spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
 
+/** rT = float min(rA, rB) */
+extern void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
+/** rT = float max(rA, rB) */
+extern void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
 
 /* Floating-point instructions
  */
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 2c80dd712e..9d25e820ad 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -229,35 +229,26 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    spe_release_register(f, amask_reg);
 }
 
-/* This is a convenient and oft-used sequence.  It chooses
- * the smaller of each element of reg1 and reg2, and combines them
- * into the result register, as follows:
- * 
- * The Float Compare Greater Than (fcgt) instruction will put
- * 1s into compare_reg where reg1 > reg2, and 0s where reg1 <= reg2.
- *
- * Then the Select Bits (selb) instruction will take bits from
- * reg1 where compare_reg is 0, and from reg2 where compare_reg is
- * 1.  Ergo, result_reg will have the bits from reg1 where reg1 <= reg2,
- * and the bits from reg2 where reg1 > reg2, which is exactly the
- * MIN operation.
+/* This pair of functions is used inline to allocate and deallocate
+ * optional constant registers.  Once a constant is discovered to be 
+ * needed, we will likely need it again, so we don't want to deallocate
+ * it and have to allocate and load it again unnecessarily.
  */
-#define FLOAT_VECTOR_MIN(f, result_reg, reg1, reg2) {\
-   int compare_reg = spe_allocate_available_register(f); \
-   spe_fcgt(f, compare_reg, reg1, reg2); \
-   spe_selb(f, result_reg, reg1, reg2, compare_reg); \
-   spe_release_register(f, compare_reg); \
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   *r = spe_allocate_available_register(f);
+   spe_load_float(f, *r, value);
+   *is_already_set = true;
 }
 
-/* The FLOAT_VECTOR_MAX sequence is similar to the FLOAT_VECTOR_MIN 
- * sequence above, except that the registers specified when selecting
- * bits are reversed.
- */
-#define FLOAT_VECTOR_MAX(f, result_reg, reg1, reg2) {\
-   int compare_reg = spe_allocate_available_register(f); \
-   spe_fcgt(f, compare_reg, reg1, reg2); \
-   spe_selb(f, result_reg, reg2, reg1, compare_reg); \
-   spe_release_register(f, compare_reg); \
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    if (!*is_already_set) return;
+    spe_release_register(f, r);
+    *is_already_set = false;
 }
 
 /**
@@ -294,51 +285,15 @@ gen_blend(const struct pipe_blend_state *blend,
 
    int tmp_reg = spe_allocate_available_register(f);
 
-   /* These values might or might not eventually get put into
-    * registers.  We avoid allocating them and setting them until
-    * they're actually needed; then we avoid setting them more than
-    * once, and release them at the end of code generation.
+   /* Optional constant registers we might or might not end up using;
+    * if we do use them, make sure we only allocate them once by
+    * keeping a flag on each one.
     */
-   boolean one_reg_set = false; 
-   int one_reg;
-#define SET_ONE_REG_IF_UNSET(f) if (!one_reg_set) {\
-   one_reg = spe_allocate_available_register(f); \
-   spe_load_float(f, one_reg, 1.0f); \
-   one_reg_set = true; \
-}
-#define RELEASE_ONE_REG_IF_USED(f) if (one_reg_set) {\
-   spe_release_register(f, one_reg); \
-}
-  
-   boolean const_color_set = false;
-   int constR_reg, constG_reg, constB_reg;
-#define SET_CONST_COLOR_IF_UNSET(f, blend_color) if (!const_color_set) {\
-   constR_reg = spe_allocate_available_register(f); \
-   constG_reg = spe_allocate_available_register(f); \
-   constG_reg = spe_allocate_available_register(f); \
-   spe_load_float(f, constR_reg, blend_color->color[0]); \
-   spe_load_float(f, constG_reg, blend_color->color[1]); \
-   spe_load_float(f, constB_reg, blend_color->color[2]); \
-   const_color_set = true;\
-}
-#define RELEASE_CONST_COLOR_IF_USED(f) if (const_color_set) {\
-   spe_release_register(f, constR_reg); \
-   spe_release_register(f, constG_reg); \
-   spe_release_register(f, constB_reg); \
-}
-
-   boolean const_alpha_set = false;
-   int constA_reg;
-#define SET_CONST_ALPHA_IF_UNSET(f, blend_color) if (!const_alpha_set) {\
-   constA_reg = spe_allocate_available_register(f); \
-   spe_load_float(f, constA_reg, blend_color->color[3]); \
-   const_alpha_set = true; \
-}
-#define RELEASE_CONST_ALPHA_IF_USED(f) if (const_alpha_set) {\
-   spe_release_register(f, constA_reg); \
-}
-
-   /* Real code starts here */
+   boolean one_reg_set = false;
+   unsigned int one_reg;
+   boolean constR_reg_set = false, constG_reg_set = false, 
+      constB_reg_set = false, constA_reg_set = false;
+   unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
 
    ASSERT(blend->blend_enable);
 
@@ -419,10 +374,11 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_release_register(f, mask_reg);
    }
 
-
    /*
     * Compute Src RGB terms.  We're actually looking for the value
-    * of (the appropriate RGB factors) * (the incoming source RGB color).
+    * of (the appropriate RGB factors) * (the incoming source RGB color),
+    * because in some cases (like PIPE_BLENDFACTOR_ONE and 
+    * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
     */
    switch (blend->rgb_src_factor) {
    case PIPE_BLENDFACTOR_ONE:
@@ -450,18 +406,13 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - R */
-      spe_fs(f, tmp_reg, one_reg, fragR_reg);
-      /* term = R * tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      /* repeat for G and B */
-      spe_fs(f, tmp_reg, one_reg, fragG_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, fragB_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) 
+       * or in other words term = (R-R*R, G-G*G, B-B*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_DST_COLOR:
       /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
@@ -470,30 +421,22 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
       break;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
-      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - Rfb */
-      spe_fs(f, tmp_reg, one_reg, fbR_reg);
-      /* term = R * tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      /* repeat for G and B */
-      spe_fs(f, tmp_reg, one_reg, fbG_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, fbB_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+       * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - A */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* term = R * tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      /* repeat for G and B with the same (1-A) factor */
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+       * or term = (R-R*A,G-G*A,B-B*A)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_DST_ALPHA:
       /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
@@ -502,19 +445,19 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - A */
-      spe_fs(f, tmp_reg, one_reg, fbA_reg);
-      /* term = R * tmp, G*tmp, and B*tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) 
+       * or term = (R-R*Afb,G-G*Afb,B-B*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_CONST_COLOR:
-      /* We'll need the optional blend color registers */
-      SET_CONST_COLOR_IF_UNSET(f,blend_color)
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
       /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
       spe_fm(f, term1R_reg, fragR_reg, constR_reg);
       spe_fm(f, term1G_reg, fragG_reg, constG_reg);
@@ -522,55 +465,61 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_CONST_ALPHA:
       /* we'll need the optional constant alpha register */
-      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
       /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
       spe_fm(f, term1R_reg, fragR_reg, constA_reg);
       spe_fm(f, term1G_reg, fragG_reg, constA_reg);
       spe_fm(f, term1B_reg, fragB_reg, constA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      /* We need both the optional {1,1,1,1} register, and the optional
-       * constant color registers
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) 
+       * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
        */
-      SET_ONE_REG_IF_UNSET(f)
-      SET_CONST_COLOR_IF_UNSET(f, blend_color)
-      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) */
-      spe_fs(f, tmp_reg, one_reg, constR_reg);
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, constG_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, constB_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      /* We need the optional {1,1,1,1} register and the optional 
-       * constant alpha register
+      /* We need the optional constant alpha register: the factor below
+       * reads constA_reg, not the constant color registers.
+       */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
        */
-      SET_ONE_REG_IF_UNSET(f)
-      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
-      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) */
-      spe_fs(f, tmp_reg, one_reg, constA_reg);
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
       /* We'll need the optional {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
+      setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
       /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
        * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+       * as long as a is positive), but then we'd have to do three
+       * spe_float_min() functions instead of one, so this is simpler.
        */
       /* tmp = 1 - Afb */
       spe_fs(f, tmp_reg, one_reg, fbA_reg);
       /* tmp = min(A,tmp) */
-      FLOAT_VECTOR_MIN(f, tmp_reg, fragA_reg, tmp_reg)
+      spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
       /* term = R*tmp */
       spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
       spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
       spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
       break;
 
-      /* non-OpenGL cases? */
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
    case PIPE_BLENDFACTOR_SRC1_COLOR:
    case PIPE_BLENDFACTOR_SRC1_ALPHA:
    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
@@ -581,132 +530,293 @@ gen_blend(const struct pipe_blend_state *blend,
    }
 
    /*
-    * Compute Src Alpha term
+    * Compute Src Alpha term.  Like the above, we're looking for
+    * the full term A*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term1A_reg, 0.0f);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = A */
       spe_move(f, term1A_reg, fragA_reg);
       break;
+
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = A*A */
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
-      /* XXX more cases */
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = A*(1-A) = A-A*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = A*Afb */
+      spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = A*Ac */
+      spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest RGB terms
+    * Compute Dest RGB term.  Like the above, we're looking for
+    * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->rgb_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
       spe_move(f, term2R_reg, fbR_reg);
       spe_move(f, term2G_reg, fbG_reg);
       spe_move(f, term2B_reg, fbB_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2R_reg);
-      spe_zero(f, term2G_reg);
-      spe_zero(f, term2B_reg);
+      /* factors = (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term2R_reg, 0.0f);
+      spe_load_float(f, term2G_reg, 0.0f);
+      spe_load_float(f, term2B_reg, 0.0f);
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
       spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
       break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) 
+       * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+      break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
       spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-#if 0
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         one_reg = spe_allocate_available_register(f);
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* term = fb * tmp */
-      spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
-      spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
-      spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
-#else
-      /* Compute:  term2x = fbx * (1.0 - fragA)
-       * Which is:  term2x = fbx - fbx * fragA
-       * Use fnms t,a,b,c which computes t=c-a*b
-       */
+      /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+      /* fnms(a,b,c,d) computes a = d - b*c */
       spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
       spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
       spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
-#endif
       break;
-      /* XXX more cases */
-      // GL_ONE_MINUS_SRC_COLOR
-      // GL_DST_COLOR
-      // GL_ONE_MINUS_DST_COLOR
-      // GL_DST_ALPHA
-      // GL_CONSTANT_COLOR
-      // GL_ONE_MINUS_CONSTANT_COLOR
-      // GL_CONSTANT_ALPHA
-      // GL_ONE_MINUS_CONSTANT_ALPHA
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+       * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) 
+       * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+      spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+      spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) 
+       * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant alpha register: the factor below
+       * reads constA_reg, not the constant color registers.
+       */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest Alpha term
+    * Compute Dest Alpha term.  Like the above, we're looking for
+    * the full term Afb*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = Afb */
       spe_move(f, term2A_reg, fbA_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2A_reg);
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term2A_reg, 0.0f);
       break;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = Afb*A */
       spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
       break;
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-#if 0
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         one_reg = spe_allocate_available_register(f);
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* termA = fbA * tmp */
-      spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
-#else
-      /* Compute:  term2A = fbA * (1.0 - fragA)
-       * Which is:  term2A = fbA - fbA * fragA
-       * Use fnms t,a,b,c which computes t=c-a*b
-       */
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
       spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
-#endif
       break;
-      /* XXX more cases */
-      // GL_ONE_MINUS_SRC_COLOR
-      // GL_DST_COLOR
-      // GL_ONE_MINUS_DST_COLOR
-      // GL_DST_ALPHA
-      // GL_CONSTANT_COLOR
-      // GL_ONE_MINUS_CONSTANT_COLOR
-      // GL_CONSTANT_ALPHA
-      // GL_ONE_MINUS_CONSTANT_ALPHA
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = Afb*Afb */
+      spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = Afb*Ac */
+      spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Combine Src/Dest RGB terms
+    * Combine Src/Dest RGB terms as per the blend equation.
     */
    switch (blend->rgb_func) {
    case PIPE_BLEND_ADD:
@@ -725,14 +835,14 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
       break;
    case PIPE_BLEND_MIN:
-      FLOAT_VECTOR_MIN(f, fragR_reg, term1R_reg, term2R_reg)
-      FLOAT_VECTOR_MIN(f, fragG_reg, term1G_reg, term2G_reg)
-      FLOAT_VECTOR_MIN(f, fragB_reg, term1B_reg, term2B_reg)
+      spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
       break;
    case PIPE_BLEND_MAX:
-      FLOAT_VECTOR_MAX(f, fragR_reg, term1R_reg, term2R_reg)
-      FLOAT_VECTOR_MAX(f, fragG_reg, term1G_reg, term2G_reg)
-      FLOAT_VECTOR_MAX(f, fragB_reg, term1B_reg, term2B_reg)
+      spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
       break;
    default:
       ASSERT(0);
@@ -752,10 +862,10 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
       break;
    case PIPE_BLEND_MIN:
-      FLOAT_VECTOR_MIN(f, fragA_reg, term1A_reg, term2A_reg)
+      spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
       break;
    case PIPE_BLEND_MAX:
-      FLOAT_VECTOR_MAX(f, fragA_reg, term1A_reg, term2A_reg)
+      spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
       break;
    default:
       ASSERT(0);
@@ -779,9 +889,11 @@ gen_blend(const struct pipe_blend_state *blend,
    spe_release_register(f, tmp_reg);
 
    /* Free any optional registers that actually got used */
-   RELEASE_ONE_REG_IF_USED(f)
-   RELEASE_CONST_COLOR_IF_USED(f)
-   RELEASE_CONST_ALPHA_IF_USED(f)
+   release_const_register(f, &one_reg_set, one_reg);
+   release_const_register(f, &constR_reg_set, constR_reg);
+   release_const_register(f, &constG_reg_set, constG_reg);
+   release_const_register(f, &constB_reg_set, constB_reg);
+   release_const_register(f, &constA_reg_set, constA_reg);
 }
 
 
-- 
cgit v1.2.3


From a57fbe53dcb54694da9c9b4be1533c9d800079d2 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 19 Sep 2008 01:55:00 -0600
Subject: CELL: add codegen for logic op, color mask

- rtasm_ppc_spe.c, rtasm_ppc_spe.h: added a new macro function
  "spe_load_uint" for loading and splatting unsigned integers
  in a register; it will use "ila" for values 18 bits or less,
  "ilh" for word values that are symmetric across halfwords,
  "ilhu" for values that have zeroes in their bottom halfwords,
  or "ilhu" followed by "iohl" for general 32-bit values.

  Of the 15 color masks of interest, 4 are 18 bits or less,
  2 are symmetric across halfwords, 3 are zero in the bottom
  halfword, and 6 require two instructions to load.

- cell_gen_fragment.c: added full codegen for logic op and
  color mask.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  23 +++-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |   4 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 143 ++++++++++++++++++++++-
 3 files changed, 163 insertions(+), 7 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 12e0826fb9..f60bfba3f5 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -592,11 +592,32 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
 }
 
+void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+{
+   /* If the whole value fits in the lower 18 bits, use ila, which
+    * doesn't sign-extend.  Otherwise, if the two halfwords of
+    * the constant are identical, use ilh.  Otherwise, use ilhu,
+    * followed by iohl only when the low halfword is nonzero.
+    */
+   if ((ui & 0x0003ffff) == ui) {
+      spe_ila(p, rT, ui);
+   }
+   else if ((ui >> 16) == (ui & 0xffff)) {
+      spe_ilh(p, rT, ui & 0xffff);
+   }
+   else {
+      spe_ilhu(p, rT, ui >> 16);
+      if (ui & 0xffff)
+         spe_iohl(p, rT, ui & 0xffff);
+   }
+}
+
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ila(p, rT, 66051);
+   /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
+   spe_ila(p, rT, 0x00010203);
    spe_shufb(p, rT, rA, rA, rT);
 }
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 4ef05ea27d..09400b3fb2 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -302,6 +302,10 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Load/splat immediate unsigned int into rT. */
+extern void
+spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 9d25e820ad..899d8423b2 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -902,8 +902,69 @@ gen_logicop(const struct pipe_blend_state *blend,
             struct spe_function *f,
             int fragRGBA_reg, int fbRGBA_reg)
 {
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+   ASSERT(blend->logicop_enable);
+
+   switch(blend->logicop_func) {
+      case PIPE_LOGICOP_CLEAR: /* 0 */
+         spe_zero(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOR: /* ~(s | d) */
+         spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+         spe_complement(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_INVERT: /* ~d */
+         /* Note that (A nor A) == ~(A|A) == ~A */
+         spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_XOR: /* s ^ d */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NAND: /* ~(s & d) */
+         spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND: /* s & d */
+         spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         spe_complement(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOOP: /* d */
+         spe_move(f, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY: /* s */
+         break;
+      case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR: /* s | d */
+         spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_SET: /* 1 */
+         spe_load_int(f, fragRGBA_reg, 0xffffffff);
+         break;
+      default:
+         ASSERT(0);
+   }
 }
 
 
@@ -912,11 +973,81 @@ gen_colormask(uint colormask,
               struct spe_function *f,
               int fragRGBA_reg, int fbRGBA_reg)
 {
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
-}
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+
+   /* The color mask operation can prevent any set of color
+    * components in the incoming fragment from being written to the frame 
+    * buffer; we do this by replacing the masked components of the 
+    * fragment with the frame buffer values.
+    *
+    * There are only 16 possibilities, with a unique mask for
+    * each of the possibilities.  (Technically, there are only 15
+    * possibilities, since we shouldn't be called for the one mask
+    * that does nothing, but the complete implementation is here
+    * anyway to avoid confusion.)
+    *
+    * We implement this via a constant static array which we'll index 
+    * into to get the correct mask.
+    * 
+    * We're dependent on the mask values being low-order bits,
+    * with particular values for each bit; so we start with a
+    * few assertions, which will fail if any of the values were
+    * to change.
+    */
+   ASSERT(PIPE_MASK_R == 0x1);
+   ASSERT(PIPE_MASK_G == 0x2);
+   ASSERT(PIPE_MASK_B == 0x4);
+   ASSERT(PIPE_MASK_A == 0x8);
 
+   /* Here's the list of all possible colormasks, indexed by the
+    * value of the combined mask specifier.
+    */
+   static const unsigned int colormasks[16] = {
+      0x00000000, /* 0: all colors masked */
+      0xff000000, /* 1: PIPE_MASK_R */
+      0x00ff0000, /* 2: PIPE_MASK_G */
+      0xffff0000, /* 3: PIPE_MASK_R | PIPE_MASK_G */
+      0x0000ff00, /* 4: PIPE_MASK_B */
+      0xff00ff00, /* 5: PIPE_MASK_R | PIPE_MASK_B */
+      0x00ffff00, /* 6: PIPE_MASK_G | PIPE_MASK_B */
+      0xffffff00, /* 7: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B */
+      0x000000ff, /* 8: PIPE_MASK_A */
+      0xff0000ff, /* 9: PIPE_MASK_R | PIPE_MASK_A */
+      0x00ff00ff, /* 10: PIPE_MASK_G | PIPE_MASK_A */
+      0xffff00ff, /* 11: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_A */
+      0x0000ffff, /* 12: PIPE_MASK_B | PIPE_MASK_A */
+      0xff00ffff, /* 13: PIPE_MASK_R | PIPE_MASK_B | PIPE_MASK_A */
+      0x00ffffff, /* 14: PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+      0xffffffff  /* 15: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+   };
+
+   /* Get a temporary register to hold the mask */
+   int colormask_reg = spe_allocate_available_register(f);
+
+   /* Look up the desired mask directly and load it into the mask register.
+    * This will load the same mask into each of the four words in the
+    * mask register.
+    */
+   spe_load_uint(f, colormask_reg, colormasks[colormask]);
+
+   /* Use the mask register to select between the fragment color
+    * values and the frame buffer color values.  Wherever the
+    * mask has a 0 bit, the current frame buffer color should override
+    * the fragment color.  Wherever the mask has a 1 bit, the 
+    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM)
+    * instruction will select bits from its first operand rA wherever the
+    * mask bits rM are 0, and from its second operand rB wherever the
+    * mask bits rM are 1.  That means that the frame buffer color is the
+    * first operand, and the fragment color the second.
+    */
+    spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
 
+    /* Release the temporary register and we're done */
+    spe_release_register(f, colormask_reg);
+}
 
 /**
  * Generate code to pack a quad of float colors into a four 32-bit integers.
@@ -1223,7 +1354,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
-      if (blend->colormask != 0xf) {
+      if (blend->colormask != PIPE_MASK_RGBA) {
          gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
       }
 
-- 
cgit v1.2.3


From 0838b702750d85b0284a97be211fa379e9f8d8d8 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 09:36:29 -0600
Subject: cell: change spe_complement() to take a src and dst reg, like other
 instructions

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      | 14 ++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  4 ++--
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       |  4 ++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c |  4 ++--
 4 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index f60bfba3f5..85280f680a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -623,9 +623,9 @@ spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 
 
 void
-spe_complement(struct spe_function *p, unsigned rT)
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_nor(p, rT, rT, rT);
+   spe_nor(p, rT, rA, rA);
 }
 
 
@@ -667,7 +667,8 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
    }
 }
 
-/* For each 32-bit float element of rA and rB, choose the smaller of the
+/**
+ * For each 32-bit float element of rA and rB, choose the smaller of the
  * two, compositing them into the rT register.
  * 
  * The Float Compare Greater Than (fcgt) instruction will put 1s into
@@ -683,7 +684,7 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
  * like "x = min(x, a)", we always allocate a new register to be safe.
  */
 void 
-spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
 {
    unsigned int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
@@ -691,7 +692,8 @@ spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned
    spe_release_register(p, compare_reg);
 }
 
-/* For each 32-bit float element of rA and rB, choose the greater of the
+/**
+ * For each 32-bit float element of rA and rB, choose the greater of the
  * two, compositing them into the rT register.
  * 
  * The logic is similar to that of spe_float_min() above; the only
@@ -699,7 +701,7 @@ spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned
  * so that the larger of the two is selected instead of the smaller.
  */
 void 
-spe_float_max(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
 {
    unsigned int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 09400b3fb2..8a0d70fdac 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -310,9 +310,9 @@ spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
 
-/** Complement/invert all bits in rT. */
+/** rT = complement_all_bits(rA). */
 extern void
-spe_complement(struct spe_function *p, unsigned rT);
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
 
 /** rT = rA. */
 extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 6f2b89c695..d835aae255 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -924,7 +924,7 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
    /* tmp = (s1_reg == 0) */
    spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
    /* tmp = !tmp */
-   spe_complement(gen->f, tmp_reg);
+   spe_complement(gen->f, tmp_reg, tmp_reg);
    /* exec_mask = exec_mask & tmp */
    spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
 
@@ -944,7 +944,7 @@ emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
    spe_comment(gen->f, -4, "ELSE:");
 
    /* exec_mask = !exec_mask */
-   spe_complement(gen->f, exec_reg);
+   spe_complement(gen->f, exec_reg, exec_reg);
 
    return true;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 899d8423b2..06a9fa102f 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -920,7 +920,7 @@ gen_logicop(const struct pipe_blend_state *blend,
          spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
          break;
       case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
-         spe_complement(f, fragRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
          break;
       case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
          /* andc R, A, B computes R = A & ~B */
@@ -941,7 +941,7 @@ gen_logicop(const struct pipe_blend_state *blend,
          break;
       case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
          spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
-         spe_complement(f, fragRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
          break;
       case PIPE_LOGICOP_NOOP: /* d */
          spe_move(f, fragRGBA_reg, fbRGBA_reg);
-- 
cgit v1.2.3


From 7af5f944e5709920623c766bc572f8d587709270 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 17:45:51 -0600
Subject: gallium: added spe_code_size()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 7 +++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 85280f680a..1c3e21b4c0 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -383,6 +383,13 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+/** Return current code size in bytes. */
+unsigned spe_code_size(const struct spe_function *p)
+{
+   return p->num_inst * SPE_INST_SIZE;
+}
+
+
 /**
  * Allocate a SPE register.
  * \return register index or -1 if none left.
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 8a0d70fdac..4165a971a2 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -72,6 +72,7 @@ struct spe_function
 
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
+extern unsigned spe_code_size(const struct spe_function *p);
 
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
-- 
cgit v1.2.3


From 938e12c1caee7e34fcc6630f17f422ebdd824ec3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 17:06:22 -0600
Subject: gallium: SPU register comments

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 4165a971a2..61c7edeb60 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -40,10 +40,10 @@
 /** number of general-purpose SIMD registers */
 #define SPE_NUM_REGS  128
 
-/** Return Address register */
+/** Return Address register (aka $lr / Link Register) */
 #define SPE_REG_RA  0
 
-/** Stack Pointer register */
+/** Stack Pointer register (aka $sp) */
 #define SPE_REG_SP  1
 
 
-- 
cgit v1.2.3


From afaa53040bd01ca86762e7d7b1a5a65810767921 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 3 Oct 2008 18:00:43 -0600
Subject: CELL: changes to generate SPU code for stenciling

This set of code changes is for stencil code generation
support.  Both one-sided and two-sided stenciling are supported.
In addition to the raw code generation changes, these changes had
to be made elsewhere in the system:

- Added new "register set" feature to the SPE assembly generation.
  A "register set" is a way to allocate multiple registers and free
  them all at the same time, delegating register allocation management
  to the spe_function unit.  It's quite useful in complex register
  allocation schemes (like stenciling).

- Added and improved SPE macro calculations.
  These are operations between registers and unsigned integer
  immediates.  In many cases, the calculation can be performed
  with a single instruction; the macros will generate the
  single instruction if possible, or generate a register load
  and register-to-register operation if not.  These macro
  functions are: spe_load_uint() (which has new ways to
  load a value in a single instruction), spe_and_uint(),
  spe_xor_uint(), spe_compare_equal_uint(), and spe_compare_greater_uint().

- Added facing to fragment generation.  While rendering, the rasterizer
  needs to be able to determine front- and back-facing fragments, in order
  to correctly apply two-sided stencil.  That requires these changes:
  - Added front_winding field to the cell_command_render block, so that
    the state tracker could communicate to the rasterizer what it
    considered to be the front-facing direction.
  - Added fragment facing as an input to the fragment function.
  - Calculated facing is passed during emit_quad().
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        | 246 +++++-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h        |  41 +-
 src/gallium/drivers/cell/common.h                  |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 881 ++++++++++++++++++---
 src/gallium/drivers/cell/ppu/cell_render.c         |   1 +
 src/gallium/drivers/cell/ppu/cell_vbuf.c           |   1 +
 src/gallium/drivers/cell/spu/spu_main.h            |   3 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |  19 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   3 +-
 src/gallium/drivers/cell/spu/spu_render.c          |   4 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  35 +-
 src/gallium/drivers/cell/spu/spu_tri.h             |   2 +-
 12 files changed, 1091 insertions(+), 146 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 491141f190..8a87e9abb1 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -359,14 +359,21 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
+    register unsigned int i;
+
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
 
+    p->set_count = 0;
+    memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
-    p->regs[0] = ~7;
-    p->regs[1] = (1U << (80 - 64)) - 1;
+    p->regs[0] = p->regs[1] = p->regs[2] = 1;
+    for (i = 80; i <= 127; i++) {
+      p->regs[i] = 1;
+    }
 
     p->print = false;
     p->indent = 0;
@@ -398,12 +405,8 @@ int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      const uint64_t mask = (1ULL << (i % 64));
-      const unsigned idx = i / 64;
-
-      assert(idx < 2);
-      if ((p->regs[idx] & mask) != 0) {
-         p->regs[idx] &= ~mask;
+      if (p->regs[i] == 0) {
+         p->regs[i] = 1;
          return i;
       }
    }
@@ -417,31 +420,68 @@ int spe_allocate_available_register(struct spe_function *p)
  */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
-
    assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) != 0);
-
-   p->regs[idx] &= ~(1ULL << bit);
+   assert(p->regs[reg] == 0);
+   p->regs[reg] = 1;
    return reg;
 }
 
 
 /**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated".  Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
+   assert(reg < SPE_NUM_REGS);
+   assert(p->regs[reg] == 1);
 
-   assert(idx < 2);
+   p->regs[reg] = 0;
+}
 
-   assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers.  This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+   register unsigned int i;
+
+   /* Keep track of the set count.  If it ever wraps around to 0, 
+    * we're in trouble.
+    */
+   p->set_count++;
+   assert(p->set_count > 0);
+
+   /* Increment the allocation count of all registers currently
+    * allocated.  Then any registers that are allocated in this set
+    * will be the only ones with a count of 1; they'll all be released
+    * when the register set is released.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]++;
+   }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int i;
+
+   /* If the set count drops below zero, we're in trouble. */
+   assert(p->set_count > 0);
+   p->set_count--;
 
-   p->regs[idx] |= (1ULL << bit);
+   /* Drop the allocation level of all registers.  Any allocated
+    * during this register set will drop to 0 and then become
+    * available.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]--;
+   }
 }
 
 
@@ -603,8 +643,10 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 {
    /* If the whole value is in the lower 18 bits, use ila, which
     * doesn't sign-extend.  Otherwise, if the two halfwords of
-    * the constant are identical, use ilh.  Otherwise, we have
-    * to use ilhu followed by iohl.
+    * the constant are identical, use ilh.  Otherwise, if every byte of
+    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+    * Bytes Immediate (fsmbi) to load the value in a single instruction.
+    * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
    if ((ui & 0xfffc0000) == ui) {
       spe_ila(p, rT, ui);
@@ -612,13 +654,171 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
    else if ((ui >> 16) == (ui & 0xffff)) {
       spe_ilh(p, rT, ui & 0xffff);
    }
+   else if (
+      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+   ) {
+      unsigned int mask = 0;
+      /* fsmbi duplicates each bit in the given mask eight times,
+       * using a 16-bit value to initialize a 16-byte quadword.
+       * Each 4-bit nybble of the mask corresponds to a full word
+       * of the result; look at the value and figure out the mask
+       * (replicated for each word in the quadword), and then
+       * form the "select mask" to get the value.
+       */
+      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+      spe_fsmbi(p, rT, mask);
+   }
    else {
+      /* The general case: this usually uses two instructions, but
+       * may use only one if the low-order 16 bits of each word are 0.
+       */
       spe_ilhu(p, rT, ui >> 16);
       if (ui & 0xffff)
          spe_iohl(p, rT, ui & 0xffff);
    }
 }
 
+/* This function is constructed identically to spe_xor_uint() below.
+ * Changes to one should be made in the other.
+ */
+void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use And Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_andi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+/* This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ */
+void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte 
+    * Immediate (which uses the same constant across each byte), Exclusive 
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to 
+    * 16 bits and uses that across each halfword), or Exclusive Or Word 
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use Exclusive Or Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_xori(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use Exclusive Or Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the Exclusive Or Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 9 bits or less, it fits inside a
+    * Compare Equal Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_ceqi(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_ceq(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
+
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 10 bits or less, it fits inside a
+    * Compare Logical Greater Than Word Immediate instruction.
+    */
+   if ((ui & 0x000003ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 61c7edeb60..cd2e245409 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -53,17 +53,26 @@ struct spe_function
    uint num_inst;
    uint max_inst;
 
-    /**
-     * Mask of used / unused registers
-     *
-     * Each set bit corresponds to an available register.  Each cleared bit
-     * corresponds to an allocated register.
+   /**
+    * The "set count" reflects the number of nested register sets
+    * currently open.  In the unlikely case that the count overflows,
+    * register allocation will start to be confused, which is critical
+    * enough that we check for it.
+    */
+   unsigned char set_count;
+
+   /** 
+    * Flags for used and unused registers.  Each byte corresponds to a
+    * register; a 0 in that byte means that the register is available.
+    * A value of 1 means that the register was allocated in the current
+    * register set.  Any other value N means that the register was allocated
+    * N register sets ago.
      *
      * \sa
      * spe_allocate_register, spe_allocate_available_register,
-     * spe_release_register
+     * spe_allocate_register_set, spe_release_register_set, spe_release_register, 
      */
-    uint64_t regs[SPE_NUM_REGS / 64];
+    unsigned char regs[SPE_NUM_REGS];
 
     boolean print; /**< print/dump instructions as they're emitted? */
     int indent;    /**< number of spaces to indent */
@@ -77,6 +86,8 @@ extern unsigned spe_code_size(const struct spe_function *p);
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
 
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
@@ -307,6 +318,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i);
 extern void
 spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 99329fd8e2..c223bc1744 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -227,6 +227,7 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 653afc235d..f920ae13b4 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,10 +54,12 @@
  * \param ifragZ_reg  register containing integer fragment Z values (in)
  * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
  * \param zmask_reg   register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
  */
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
-               struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
    /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
@@ -132,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
        * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
        */
       spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+      return true;
    }
+
+   return false;
 }
 
 
@@ -238,22 +243,34 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  * it and have to allocate and load it again unnecessarily.
  */
 static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
 {
    if (*is_already_set) return;
    *r = spe_allocate_available_register(f);
-   spe_load_float(f, *r, value);
-   *is_already_set = true;
 }
 
 static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
 {
     if (!*is_already_set) return;
     spe_release_register(f, r);
     *is_already_set = false;
 }
 
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   setup_optional_register(f, is_already_set, r);
+   spe_load_float(f, *r, value);
+}
+
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    release_optional_register(f, is_already_set, r);
+}
+
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
  * \param f          SPE function to append instruction onto.
@@ -1117,6 +1134,633 @@ gen_colormask(struct spe_function *f,
     spe_release_register(f, colormask_reg);
 }
 
+/* Generate SPE code for the stencil comparison.
+ *
+ * This function is similar to gen_depth_test(), above, except that
+ * instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value.  As such, we have
+ * access to the Compare Immediate instructions where we don't in
+ * gen_depth_test().
+ *
+ * The comparison follows the OpenGL convention "reference FUNC stored
+ * value": for example PIPE_FUNC_LESS passes where reference < s.
+ *
+ * \param f                 SPE function to append instructions onto
+ * \param state             stencil state supplying func and ref_value
+ * \param mask_reg          register holding the mask of valid fragments;
+ *                          only read here
+ * \param fbS_reg           register holding the framebuffer stencil
+ *                          values; only read here
+ * \param stencil_pass_reg  register that receives the result
+ *
+ * The value left in stencil_pass_reg is a bitmask of valid fragments
+ * that also passed the stencil test.  The bitmask of valid fragments
+ * that failed would be found in (mask_reg & ~stencil_pass_reg).
+ *
+ * NOTE(review): no stencil value mask is applied to either operand;
+ * confirm whether the stencil state carries one that must be honoured.
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, 
+                 unsigned int mask_reg, unsigned int fbS_reg, 
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the
+    * stencil_pass_reg register, taking into account whether each fragment
+    * was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      /* stencil_pass = mask & (reference == s) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* stencil_pass = mask & ~(reference == s) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* stencil_pass = mask & (reference < s) = mask & (s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GREATER: {
+      /* stencil_pass = mask & (reference > s) */
+      /* There's no convenient Compare Less Than Immediate instruction, so
+       * we'll have to do this one the harder way, by loading a register and
+       * comparing directly.  Compare Logical Greater Than Word (clgt)
+       * treats its operands as unsigned - no sign extension.
+       */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_LEQUAL: {
+      /* stencil_pass = mask & (reference <= s) = mask & ~(reference > s) */
+      /* As above, we have to do this by loading a register */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_GEQUAL:
+      /* stencil_pass = mask & (reference >= s) = mask & ~(s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = mask & 0 = 0.  Nothing may overwrite the zero
+       * afterwards, or NEVER would behave like ALWAYS.
+       */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = mask & 1 = mask */
+      spe_move(f, stencil_pass_reg, mask_reg);
+      break;
+
+   default:
+      /* Unknown comparison function; no code is generated */
+      ASSERT(0);
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply.  It does not
+ * apply any tests.  It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * \param f                  SPE function to append instructions onto
+ * \param stencil_op         one of the PIPE_STENCIL_OP_x operations
+ * \param stencil_ref_value  reference value, used by PIPE_STENCIL_OP_REPLACE
+ * \param stencil_max_value  should be (2^n - 1) where n is the number of
+ *                           bits in the stencil buffer - in other words,
+ *                           it should be usable as a mask
+ * \param fbS_reg            register holding the current stencil values;
+ *                           only read here
+ * \param newS_reg           register that receives the new stencil values;
+ *                           must differ from fbS_reg
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes that the stencil_max_value is of the form
+    * 2^n-1 and can therefore be used as a mask for the valid bits in
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* Select Bits takes the second source where the mask is set and the
+       * first source elsewhere.  Where s == max we must keep the current
+       * (already saturated) value, so fbS_reg goes in the second slot.
+       */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* As for INCR: where s == 0 keep the current value (fbS_reg),
+       * elsewhere take the decremented value.
+       */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values.  For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg, plus their back-facing counterparts), it
+ * either allocates a new register and calculates a new set of values
+ * based on the stencil operation, or it reuses a register allocation
+ * and calculation done for an earlier (matching) operation, or it
+ * reuses the fbS_reg register (if the stencil operation is KEEP, which
+ * doesn't change the stencil buffer).
+ *
+ * A back-facing operation only reuses a front-facing result when the
+ * result cannot depend on the reference value (i.e. the operation is not
+ * REPLACE) or when both faces use the same reference value.
+ *
+ * If two-sided stencil is disabled, the three back-facing outputs are
+ * all set to fbS_reg.
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ *
+ * \param f        SPE function to append instructions onto
+ * \param dsa      depth/stencil/alpha state; stencil[0] must be enabled
+ * \param fbS_reg  register holding the current stencil values
+ * \param fail_reg, zfail_reg, zpass_reg
+ *                 receive the registers holding the front-facing values
+ * \param back_fail_reg, back_zfail_reg, back_zpass_reg
+ *                 receive the registers holding the back-facing values
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+                       unsigned int fbS_reg, 
+                       unsigned int *fail_reg, unsigned int *zfail_reg, 
+                       unsigned int *zpass_reg, unsigned int *back_fail_reg, 
+                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+   unsigned zfail_op, back_zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(dsa->stencil[0].enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes.  In particular, that means that the "zfail_op" (and the backfacing
+    * counterpart, if active) are not considered - a failing stencil test will
+    * trigger the "fail_op", and a passing stencil test will trigger the
+    * "zpass_op".
+    *
+    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+    * we keep them from being calculated.
+    */
+   if (dsa->depth.enabled) {
+      zfail_op = dsa->stencil[0].zfail_op;
+      back_zfail_op = dsa->stencil[1].zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+      back_zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == dsa->stencil[0].fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, zfail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zpass_reg);
+   }
+
+   /* If two-sided stencil is enabled, we have more work to do. */
+   if (!dsa->stencil[1].enabled) {
+      /* This just flags that the registers need not be deallocated later */
+      *back_fail_reg = fbS_reg;
+      *back_zfail_reg = fbS_reg;
+      *back_zpass_reg = fbS_reg;
+   }
+   else {
+      /* A front-facing result may stand in for a back-facing one only if
+       * it would hold the same values.  REPLACE is the one operation whose
+       * result depends on the reference value, so it can only be shared
+       * across faces when both faces use the same reference value.
+       */
+      const boolean same_ref =
+         dsa->stencil[0].ref_value == dsa->stencil[1].ref_value;
+      const boolean share_fail =
+         same_ref || dsa->stencil[1].fail_op != PIPE_STENCIL_OP_REPLACE;
+      const boolean share_zfail =
+         same_ref || back_zfail_op != PIPE_STENCIL_OP_REPLACE;
+      const boolean share_zpass =
+         same_ref || dsa->stencil[1].zpass_op != PIPE_STENCIL_OP_REPLACE;
+
+      /* Same calculations as above, but for the back stencil */
+      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_fail_reg = fbS_reg;
+      }
+      else if (share_fail && dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+         *back_fail_reg = *fail_reg;
+      }
+      else if (share_fail && dsa->stencil[1].fail_op == zfail_op) {
+         *back_fail_reg = *zfail_reg;
+      }
+      else if (share_fail && dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+         *back_fail_reg = *zpass_reg;
+      }
+      else {
+         *back_fail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_fail_reg);
+      }
+
+      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zfail_reg = fbS_reg;
+      }
+      else if (share_zfail && back_zfail_op == dsa->stencil[0].fail_op) {
+         *back_zfail_reg = *fail_reg;
+      }
+      else if (share_zfail && back_zfail_op == zfail_op) {
+         *back_zfail_reg = *zfail_reg;
+      }
+      else if (share_zfail && back_zfail_op == dsa->stencil[0].zpass_op) {
+         *back_zfail_reg = *zpass_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[1].fail_op) {
+         /* Same face, same reference value: always shareable */
+         *back_zfail_reg = *back_fail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, back_zfail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zfail_reg);
+      }
+
+      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zpass_reg = fbS_reg;
+      }
+      else if (share_zpass && dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+         *back_zpass_reg = *fail_reg;
+      }
+      else if (share_zpass && dsa->stencil[1].zpass_op == zfail_op) {
+         *back_zpass_reg = *zfail_reg;
+      }
+      else if (share_zpass && dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+         *back_zpass_reg = *zpass_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+         *back_zpass_reg = *back_fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+         *back_zpass_reg = *back_zfail_reg;
+      }
+      else {
+         /* Allocate the zpass register itself; the values are generated
+          * into *back_zpass_reg on the next line.
+          */
+         *back_zpass_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zpass_reg);
+      }
+   } /* End of calculations for back-facing stencil */
+}
+
+/* Generate SPE code that runs the stencil test (one- or two-sided),
+ * then the depth test if it is enabled, and finally computes the new
+ * stencil values selected by the outcome of those tests.
+ *
+ * \param f           SPE function to append instructions onto
+ * \param dsa         depth/stencil/alpha state
+ * \param facing_reg  register used as the select mask between the front
+ *                    (mask bits 0) and back (mask bits 1) stencil state
+ * \param mask_reg    mask of valid fragments; updated to drop fragments
+ *                    that fail the stencil test (and, via gen_depth_test(),
+ *                    the depth test)
+ * \param fragZ_reg   fragment Z values, passed through to gen_depth_test()
+ * \param fbZ_reg     framebuffer Z values, passed through to gen_depth_test()
+ * \param fbS_reg     framebuffer stencil values; updated in place with the
+ *                    new stencil values
+ * \return true if the generated code could have modified the depth
+ *         and/or stencil values, so that they may need to be written back
+ */
+static boolean
+gen_stencil_depth_test(struct spe_function *f, 
+                       const struct pipe_depth_stencil_alpha_state *dsa, 
+                       const int facing_reg,
+                       const int mask_reg, const int fragZ_reg, 
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = false;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   unsigned int stencil_pass_reg, stencil_fail_reg;
+   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   unsigned int stencil_writemask_reg;
+   unsigned int zmask_reg;
+   unsigned int newS_reg;
+
+   /* Stenciling is quite complex: up to six different configurable stencil 
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely 
+    * be identical, so there's good reason to try to avoid calculating 
+    * the same values more than once (which unfortunately makes the code less 
+    * straightforward).
+    *
+    * To make register management easier, we start a new 
+    * register set; we can release all the registers in the set at
+    * once, and avoid having to keep track of exactly which registers
+    * we allocate.  We can still allocate and free registers as 
+    * desired (if we know we no longer need a register), but we don't
+    * have to spend the complexity to track the more difficult variant
+    * register usage scenarios.
+    */
+   spe_allocate_register_set(f);
+
+   /* Calculate the writemask.  If the writemask is trivial (either
+    * all 0s, meaning that we don't need to calculate any stencil values
+    * because they're not going to change the stencil anyway, or all 1s,
+    * meaning that we have to calculate the stencil values but do not
+    * need to mask them), we can avoid generating code.  Don't forget
+    * that we need to consider backfacing stencil, if enabled.
+    */
+   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Trivial: don't need to calculate stencil values, and don't need to 
+       * write them back to the framebuffer.
+       */
+      need_to_calculate_stencil_values = false;
+      need_to_writemask_stencil_values = false;
+   }
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.  This requires the
+       * back-facing writemask (if two-sided stencil is on) to be all 1s
+       * as well; any other back-facing mask needs the general case.
+       */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = false;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = true;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+         spe_release_register(f, back_write_mask_reg);
+      }
+   }
+
+   /* At least one-sided stenciling must be on.  Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* If two-sided stenciling is on, generate code to run the stencil
+    * test on the backfacing stencil as well, and combine the two results
+    * into the one correct result based on facing.
+    */
+   if (dsa->stencil[1].enabled) {
+      unsigned int temp_reg = spe_allocate_available_register(f);
+      gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+      spe_release_register(f, temp_reg);
+   }
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  We
+    * have to do this before we run a depth test (because the
+    * depth test should not be performed on fragments that failed the
+    * stencil test, and because the depth test will update the 
+    * mask of valid fragments based on the results of the depth test).
+    */
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This code will work for one-sided
+       * and two-sided stenciling (so that we take into account that operations
+       * may match between front and back stencils), and will also take into
+       * account whether the depth test is enabled (if the depth test is off,
+       * we don't need any of the zfail results, because the depth test always
+       * is considered to pass if it is disabled).  Any register value that
+       * does not need to be calculated will come back with the same value
+       * that's in fbS_reg.
+       *
+       * This function will allocate a variant number of registers that
+       * will be released as part of the register set.
+       */
+      gen_get_stencil_values(f, dsa, fbS_reg, 
+         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
+         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
+         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+      /* Tricky, tricky, tricky - the things we do to create optimal
+       * code...
+       *
+       * The various stencil values registers may overlap with each other
+       * and with fbS_reg arbitrarily (as any particular operation is
+       * only calculated once and stored in one register, no matter
+       * how many times it is used).  So we can't change the values 
+       * within those registers directly - if we change a value in a
+       * register that's being referenced by two different calculations,
+       * we've just unwittingly changed the second value as well...
+       *
+       * Avoid this by allocating new registers to hold the results
+       * (there may be 2, if the depth test is off, or 3, if it is on).
+       * These will be released as part of the register set.
+       */
+      if (!dsa->stencil[1].enabled) {
+         /* The easy case: if two-sided stenciling is *not* enabled, we
+          * just use the front-sided values.
+          */
+         stencil_fail_values = front_stencil_fail_values;
+         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+      }
+      else { /* two-sided stencil enabled */
+         /* Allocate new registers for the needed merged values */
+         stencil_fail_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+         if (dsa->depth.enabled) {
+            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+         }
+         else {
+            stencil_pass_depth_fail_values = fbS_reg;
+         }
+         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+      }
+   }
+
+   /* We now have all the stencil values we need.  We also need 
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+      /* Either path below ends up changing fbS_reg (directly, or through
+       * the final writemask select), so the stencil values may need to be
+       * written back regardless of whether a writemask is in use.
+       */
+      modified_buffers = true;
+
+      /* If we need to writemask the stencil values before going into
+       * the stencil buffer, we'll have to use a new register to
+       * hold the new values.  If not, we can just keep using the
+       * current register.
+       */
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_move(f, newS_reg, fbS_reg);
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values */
+      if (stencil_fail_values != fbS_reg) {
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+      }
+
+      /* Same for the stencil pass/depth fail values.  If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+      }
+
+      /* Same for the stencil pass/depth pass mask */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         if (dsa->depth.enabled) {
+            unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+            spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+            spe_release_register(f, stencil_pass_depth_pass_mask);
+         }
+         else {
+            /* With the depth test off, zmask_reg was never allocated and
+             * every fragment counts as passing the depth test, so the
+             * stencil pass mask is already the mask we want.
+             */
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+         }
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in the fbS_reg register passed in.  If we don't need
+       * to writemask, then the results are *already* in the fbS_reg,
+       * so there's nothing more to do.
+       */
+
+      if (need_to_writemask_stencil_values) {
+         /* The Select Bytes command makes a fine writemask.  Where
+          * the mask is 0, the first (original) values are retained,
+          * effectively masking out changes.  Where the mask is 1, the
+          * second (new) values are retained, incorporating changes.
+          */
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * We're all done, except that we've allocated a fair number
+    * of registers that we didn't bother tracking.  Release all
+    * those registers as part of the register set, and go home.
+    */
+   spe_release_register_set(f);
+
+   /* Return true if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
+
+
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
@@ -1156,6 +1800,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
+   const int facing_reg = 13; /* uint */
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1183,6 +1828,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
+   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1195,6 +1841,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
+      spe_comment(f, 0, "Computing tile location in memory");
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
       spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
@@ -1205,124 +1852,164 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
-
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
 
+   /* If we need the stencil buffers (because one- or two-sided stencil is
+    * enabled) or the depth buffer (because the depth test is enabled),
+    * go grab them.  Note that if either one- or two-sided stencil is
+    * enabled, dsa->stencil[0].enabled will be true.
+    */
    if (dsa->depth.enabled || dsa->stencil[0].enabled) {
       const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
       boolean write_depth_stencil;
 
-      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
-      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+      /* We may or may not need to allocate a register for Z or stencil values */
+      boolean fbS_reg_set = false, fbZ_reg_set = false;
+      unsigned int fbS_reg, fbZ_reg = 0;
+
+      spe_comment(f, 0, "Loading Z/stencil tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
       spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
 
-      if (dsa->depth.enabled) {
-         /* Extract Z bits from fbZS_reg into fbZ_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            int mask_reg = spe_allocate_available_register(f);
-            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
-            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
-            spe_release_register(f, mask_reg);
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_rotmi(f, fbZ_reg, fbZS_reg, -8);  /* fbZ = fbZS >> 8 */
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 32-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 16-bit Z values now */
-         }
-         else {
-            ASSERT(0);  /* invalid format */
-         }
-
-         /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 8 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 16 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
-         }
-      }
-      else {
-         /* no Z test, but set Z to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
+      /* From the Z/stencil buffer format, pull out the bits we need for
+       * Z and/or stencil.  We'll also convert the incoming fragment Z
+       * value in fragZ_reg from a floating point value in [0.0..1.0] to
+       * an unsigned integer value with the appropriate resolution.
+       */
+      switch(zs_format) {
+
+         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+         case PIPE_FORMAT_X8Z24_UNORM:
+            if (dsa->depth.enabled) {
+               /* We need the Z part at least */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* four 24-bit Z values in the low-order bits */
+               spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* four 8-bit stencil values in the high-order bits */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+            }
+            break;
+
+         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+         case PIPE_FORMAT_Z24X8_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Z is the upper 24 bits: shift right by 8 into fbZ_reg */
+               spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* 8-bit stencil is the low-order byte: keep only those bits */
+               spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+            }
+            break;
+
+         case PIPE_FORMAT_Z32_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 32-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            }
+            /* No stencil, so can't do anything there */
+            break;
+
+         case PIPE_FORMAT_Z16_UNORM:
+            if (dsa->depth.enabled) {
+               /* XXX Not sure this is correct, but it was here before, so we're
+                * going with it for now
+                */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 16-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+            }
+            /* No stencil */
+            break;
+
+         default:
+            ASSERT(0); /* invalid format */
       }
 
-
+      /* If stencil is enabled, use the stencil-specific code
+       * generator to generate both the stencil and depth (if needed)
+       * tests.  Otherwise, if only depth is enabled, generate
+       * a quick depth test.  The test generators themselves will
+       * report back whether the depth/stencil buffer has to be
+       * written back.
+       */
       if (dsa->stencil[0].enabled) {
-         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX extract with a shift */
-            ASSERT(0);
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* XXX extract with a mask */
-            ASSERT(0);
-         }
-      }
-      else {
-         /* no stencil test, but set to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
-      }
+         /* This will perform the stencil and depth tests, and update
+          * the mask_reg, fbZ_reg, and fbS_reg as required by the
+          * tests.
+          */
+         ASSERT(fbS_reg_set);
+         ASSERT(fbZ_reg_set);
+         spe_comment(f, 0, "Perform stencil test");
 
-      if (dsa->stencil[0].enabled) {
-         /* XXX this may involve depth testing too */
-         // gen_stencil_test(dsa, f, ... );
-         ASSERT(0);
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
-         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_comment(f, 0, "Perform depth test");
+         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
       }
-
-      /* do we need to write Z and/or Stencil back into framebuffer? */
-      write_depth_stencil = (dsa->depth.writemask |
-                             dsa->stencil[0].write_mask |
-                             dsa->stencil[1].write_mask);
+      else {
+         write_depth_stencil = false;
+      }
 
       if (write_depth_stencil) {
          /* Merge latest Z and Stencil values into fbZS_reg.
           * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
           * fbS_reg has four 8-bit Z values in bits [7..0].
           */
+         spe_comment(f, 0, "Storing depth/stencil values");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+            else {
+               spe_move(f, fbZS_reg, fbZ_reg);
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
             spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
             spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
@@ -1341,11 +2028,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
       }
 
-      spe_release_register(f, fbZ_reg);
-      spe_release_register(f, fbS_reg);
+      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+      release_optional_register(f, &fbS_reg_set, fbS_reg);
    }
 
-
    /* Get framebuffer quad/colors.  We'll need these for blending,
     * color masking, and to obey the quad/pixel mask.
     * Load: fbRGBA_reg = memory[color_tile + quad_offset]
@@ -1354,8 +2040,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
     */
    spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
 
-
    if (blend->blend_enable) {
+      spe_comment(f, 0, "Perform blending");
       gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
@@ -1369,19 +2055,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       int rgba_reg = spe_allocate_available_register(f);
 
       /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert fragment colors to framebuffer colors");
       gen_pack_colors(f, color_format,
                       fragR_reg, fragG_reg, fragB_reg, fragA_reg,
                       rgba_reg);
 
       if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
       if (blend->colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
          gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
       }
 
-
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
        *    rgba[i] = rgba[i];
@@ -1393,6 +2081,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       /* Store updated quad in tile:
        * memory[color_tile + quad_offset] = rgba_reg;
        */
+      spe_comment(f, 0, "Store framebuffer colors");
       spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
 
       spe_release_register(f, rgba_reg);
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e..79cb8df82f 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
       struct cell_command_render *render = &cell_global.command[i].render;
       render->prim_type = PIPE_PRIM_TRIANGLES;
       render->num_verts = cell->prim_buffer.num_verts;
+      render->front_winding = cell->rasterizer->front_winding;
       render->vertex_size = cell->vertex_info->size * 4;
       render->xmin = cell->prim_buffer.xmin;
       render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..578ddf62dc 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
+      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 29a305232e..1cd577c23c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -73,7 +73,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask);
+                                      vector unsigned int mask,
+                                      uint facing);
 
 /** Function for running fragment program */
 typedef void (*spu_fragment_program_func)(vector float *inputs,
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f107764fb2..d252fa6dc1 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask)
+                          vector unsigned int mask,
+                          uint facing)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
@@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       /* Form bitmask depending on color buffer format and colormask bits */
       switch (spu.fb.color_format) {
       case PIPE_FORMAT_A8R8G8B8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x00ff0000; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x0000ff00; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0x000000ff; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0xff000000; /* alpha */
          break;
       case PIPE_FORMAT_B8G8R8A8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x0000ff00; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x00ff0000; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0xff000000; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0x000000ff; /* alpha */
          break;
       default:
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index f817abf046..a61689c83a 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask);
+                          vector unsigned int mask,
+                          uint facing);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc98881..82dbeb26b7 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("SPU %u: RENDER done\n",
              spu.init.id);
 }
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 0a8fb56a62..6039cd80b2 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -118,6 +118,8 @@ struct setup_stage {
 
    float oneoverarea;
 
+   uint facing;
+
    uint tx, ty;
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
@@ -274,7 +276,7 @@ eval_z(float x, float y)
  * overall.
  */
 static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
 {
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -344,7 +346,8 @@ emit_quad( int x, int y, mask_t mask )
                              fragZ,
                              soa_frag[0], soa_frag[1],
                              soa_frag[2], soa_frag[3],
-                             mask);
+                             mask,
+                             setup.facing);
          }
 
       }
@@ -379,7 +382,8 @@ emit_quad( int x, int y, mask_t mask )
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask);
+                          mask,
+                          setup.facing);
       }
    }
 }
@@ -483,7 +487,7 @@ static void flush_spans( void )
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
 #if 1
-      emit_quad( x, setup.span.y, calculate_mask( x ) );
+      emit_quad( x, setup.span.y, calculate_mask( x ));
 #endif
    }
 
@@ -902,13 +906,28 @@ static void subtriangle( struct edge *eleft,
    eright->sy += lines;
 }
 
+/** Signed z of cross(v0 - v2, v1 - v2), using only x (index 0) and y (index 1). */
+static float
+determinant( const float *v0,
+             const float *v1,
+             const float *v2 )
+{
+   /* a = v0 - v2 and b = v1 - v2: two triangle edges sharing vertex v2 */
+   const float ax = v0[0] - v2[0], ay = v0[1] - v2[1];
+   const float bx = v1[0] - v2[0], by = v1[1] - v2[1];
+
+   /* the sign flips when the order of v0, v1, v2 is reversed; degenerate
+    * (collinear) input gives 0 */
+   return ax * by - ay * bx;
+}
+
 
 /**
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -919,6 +938,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
+   /* Before we sort vertices, determine the facing of the triangle,
+    * which will be needed for front/back-face stencil application
+    */
+   float det = determinant(v0, v1, v2);
+   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c9..abc3d35160 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
 
 
 #endif /* SPU_TRI_H */
-- 
cgit v1.2.3


From d48a92e88040470f93e2186f8eb23e4797a09860 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 20:44:32 -0600
Subject: cell: implement function calls from shader code.  fslight demo runs
 now.

Used for SIN, COS, EXP2, LOG2, POW instructions.  TEX next.

Fixed some bugs in MIN, MAX, DP3, DP4, DPH instructions.

In rtasm code:
  Special-case spe_lqd(), spe_stqd() functions so they take byte offsets but
  low-order 4 bits are shifted out.  This makes things consistent with SPU
  assembly language conventions.
  Added spe_get_registers_used() function.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  76 ++++++++++--
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  11 +-
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 141 +++++++++++++++--------
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |  30 ++---
 4 files changed, 182 insertions(+), 76 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index c442b1f6aa..9274bc5e3c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -174,9 +174,12 @@ reg_name(int reg)
       return "$lr";
    default:
       {
-         static char buf[10];
-         sprintf(buf, "$%d", reg);
-         return buf;
+         /* cycle through four buffers to handle multiple calls per printf */
+         static char buf[4][10];
+         static int b = 0;
+         b = (b + 1) % 4;
+         sprintf(buf[b], "$%d", reg);
+         return buf[b];
       }
    }
 }
@@ -269,15 +272,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       if (strcmp(name, "spe_lqd") == 0 ||
-           strcmp(name, "spe_stqd") == 0) {
-          printf("%s\t%s, %d(%s)\n",
-                 rem_prefix(name), reg_name(rT), imm, reg_name(rA));
-       }
-       else {
-          printf("%s\t%s, %s, 0x%x\n",
-                 rem_prefix(name), reg_name(rT), reg_name(rA), imm);
-       }
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
@@ -379,6 +375,7 @@ void _name (struct spe_function *p, int imm) \
 #include "rtasm_ppc_spe.h"
 
 
+
 /**
  * Initialize an spe_function.
  * \param code_size  size of instruction buffer to allocate, in bytes.
@@ -513,6 +510,20 @@ void spe_release_register_set(struct spe_function *p)
 }
 
 
+/** Store in used[] each register index in [2, 80) -- the range available to
+ *  callers -- whose p->regs[] entry is nonzero; return how many were stored. */
+unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[])
+{
+   unsigned reg, count = 0;
+   for (reg = 2; reg < 80; reg++) {
+      if (p->regs[reg])
+         used[count++] = reg;
+   }
+   return count;
+}
+
+
 void
 spe_print_code(struct spe_function *p, boolean enable)
 {
@@ -539,6 +550,46 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
 }
 
 
+/**
+ * Load quad word; printed as "lqd rT, offset(rA)" when printing is enabled.
+ * NOTE: offset is in bytes and must be a multiple of 16 (low 4 bits zero)!
+ */
+void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   p->print = FALSE;  /* mute emit_RI10's own printout; we print below */
+   assert(offset % 16 == 0);  /* the >> 4 below discards the low 4 bits */
+   emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
+   p->print = pSave;
+
+   if (p->print) {
+      indent(p);
+      printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
+/**
+ * Store quad word; printed as "stqd rT, offset(rA)" when printing is enabled.
+ * NOTE: offset is in bytes and must be a multiple of 16 (low 4 bits zero)!
+ */
+void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   p->print = FALSE;  /* mute emit_RI10's own printout; we print below */
+   assert(offset % 16 == 0);  /* the >> 4 below discards the low 4 bits */
+   emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
+   p->print = pSave;
+
+   if (p->print) {
+      indent(p);
+      printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
 /**
  * For branch instructions:
  * \param d  if 1, disable interupts if branch is taken
@@ -764,6 +815,7 @@ spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
    spe_release_register(p, tmp_reg);
 }
 
+
 /**
  * This function is constructed identically to spe_and_uint() above.
  * Changes to one should be made in the other.
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index cd2e245409..47dadb343c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -89,6 +89,9 @@ extern void spe_release_register(struct spe_function *p, int reg);
 extern void spe_allocate_register_set(struct spe_function *p);
 extern void spe_release_register_set(struct spe_function *p);
 
+extern unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[]);
+
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
 extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
@@ -128,11 +131,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 /* Memory load / store instructions
  */
-EMIT_RI10(spe_lqd,  0x034);
 EMIT_RR  (spe_lqx,  0x1c4);
 EMIT_RI16(spe_lqa,  0x061);
 EMIT_RI16(spe_lqr,  0x067);
-EMIT_RI10(spe_stqd, 0x024);
 EMIT_RR  (spe_stqx, 0x144);
 EMIT_RI16(spe_stqa, 0x041);
 EMIT_RI16(spe_stqr, 0x047);
@@ -290,6 +291,12 @@ EMIT_RI16(spe_brz,       0x040);
 EMIT_RI16(spe_brhnz,     0x046);
 EMIT_RI16(spe_brhz,      0x044);
 
+extern void
+spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
+extern void
+spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
 extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3065869d04..640ebcadbb 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -84,6 +84,8 @@ struct codegen
    /** Index of execution mask register */
    int exec_mask_reg;
 
+   int frame_size;  /**< Stack frame size, in words */
+
    struct spe_function *f;
    boolean error;
 };
@@ -208,7 +210,7 @@ get_src_reg(struct codegen *gen,
             reg = get_itemp(gen);
             reg_is_itemp = TRUE;
             /* Load:  reg = memory[(machine_reg) + offset] */
-            spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+            spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
          }
          break;
       case TGSI_FILE_IMMEDIATE:
@@ -221,7 +223,7 @@ get_src_reg(struct codegen *gen,
             reg = get_itemp(gen);
             reg_is_itemp = TRUE;
             /* Load:  reg = memory[(machine_reg) + offset] */
-            spe_lqd(gen->f, reg, gen->constants_reg, offset);
+            spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
          }
          break;
       default:
@@ -325,6 +327,7 @@ store_dest_reg(struct codegen *gen,
       }
       else {
          /* we're not inside a condition or loop: do nothing special */
+
       }
       break;
    case TGSI_FILE_OUTPUT:
@@ -337,17 +340,17 @@ store_dest_reg(struct codegen *gen,
             /* First read the current value from memory:
              * Load:  curval = memory[(machine_reg) + offset]
              */
-            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
             /* Mix curval with newvalue according to exec mask:
              * d[i] = mask_reg[i] ? value_reg : d_reg
              */
             spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
             /* Store: memory[(machine_reg) + offset] = curval */
-            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
          }
          else {
             /* Store: memory[(machine_reg) + offset] = reg */
-            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
          }
       }
       break;
@@ -357,6 +360,41 @@ store_dest_reg(struct codegen *gen,
 }
 
 
+
+/** Emit function entry code: save $lr, save the old $sp, then lower $sp. */
+static void
+emit_prologue(struct codegen *gen)
+{
+   struct spe_function *f = gen->f;
+
+   gen->frame_size = 256+128; /* XXX temporary */
+   spe_comment(f, -4, "Function prologue:");
+
+   /* stqd $lr,16($sp) -- store $lr 16 bytes above the incoming $sp */
+   spe_stqd(f, SPE_REG_RA, SPE_REG_SP, 16);
+   /* stqd $sp,-frameSize($sp) -- store old $sp where the new $sp will point */
+   spe_stqd(f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+   /* ai $sp,$sp,-frameSize -- move $sp down by the frame size */
+   spe_ai(f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+}
+
+
+/** Emit function exit code: raise $sp by the frame size, reload $lr, return. */
+static void
+emit_epilogue(struct codegen *gen)
+{
+   struct spe_function *f = gen->f;
+
+   spe_comment(f, -4, "Function epilogue:");
+   /* ai $sp,$sp,frameSize -- restore the stack pointer */
+   spe_ai(f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+   /* lqd $lr,16($sp) -- reload $lr from 16 bytes above the restored $sp */
+   spe_lqd(f, SPE_REG_RA, SPE_REG_SP, 16);
+   /* bi $lr -- return from function call (d = 0, e = 0) */
+   spe_bi(f, SPE_REG_RA, 0, 0);
+}
+
+
 static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
@@ -588,6 +626,7 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
    int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
    int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
    int tmp_reg = get_itemp(gen);
+
    /* t = x0 * x1 */
    spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
 
@@ -603,7 +642,9 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
 
@@ -623,6 +664,7 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
    int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
    int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
    int tmp_reg = get_itemp(gen);
+
    /* t = x0 * x1 */
    spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
 
@@ -643,6 +685,8 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
          store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
@@ -683,6 +727,8 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
          store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
@@ -1112,9 +1158,6 @@ emit_function_call(struct codegen *gen,
    uint addr;
    int ch;
 
-   /* XXX temporary value */
-   const int frameSize = 64; /* stack frame (activation record) size */
-
    assert(num_args <= 3);
 
    /* lookup function address */
@@ -1136,48 +1179,45 @@ emit_function_call(struct codegen *gen,
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         int s_regs[3];
-         uint a;
+         int s_regs[3], d_reg;
+         ubyte usedRegs[SPE_NUM_REGS];
+         uint a, i, numUsed;
+
          for (a = 0; a < num_args; a++) {
             s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
          }
+         d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
 
-         /* Basically:
-          * save registers on stack
-          * move parameters to registers 3, 4, 5...
-          * call function
-          * save return value (reg 3)
-          * restore registers from stack
-          */
+         numUsed = spe_get_registers_used(gen->f, usedRegs);
+         assert(numUsed < gen->frame_size / 16 - 32);
 
-         /* XXX hack: load first function param */
-         spe_move(gen->f, 3, s_regs[0]);
-
-         /* save $lr on stack     # stqd $lr,16($sp) */
-         spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-         /* save stack pointer    # stqd $sp,-frameSize($sp) */
-         spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
-
-         /* XXX save registers to stack here */
+         /* save registers to stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            int offset = 2 + i;
+            spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
 
-         /* adjust stack pointer  # ai $sp,$sp,-frameSize */
-         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
+         /* setup function arguments */
+         for (a = 0; a < num_args; a++) {
+            spe_move(gen->f, 3 + a, s_regs[a]);
+         }
 
          /* branch to function, save return addr */
          spe_brasl(gen->f, SPE_REG_RA, addr);
 
-         /* restore stack pointer # ai $sp,$sp,frameSize */
-         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, frameSize);
-
-         /* XXX restore registers from stack here */
-
-         /* restore $lr           # lqd $lr,16($sp) */
-         spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-
-         /* XXX hack: save function's return value */
+         /* save function's return value */
          spe_move(gen->f, d_reg, 3);
 
+         /* restore registers from stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            if (reg != d_reg) {
+               int offset = 2 + i;
+               spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+            }
+         }
+
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -1202,10 +1242,11 @@ emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
          int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
 
          /* d = (s1 > s2) ? s1 : s2 */
-         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+         spe_fcgt(gen->f, tmp_reg, s1_reg, s2_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1230,10 +1271,11 @@ emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
          int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
 
          /* d = (s2 > s1) ? s1 : s2 */
-         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+         spe_fcgt(gen->f, tmp_reg, s2_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1346,8 +1388,7 @@ static boolean
 emit_END(struct codegen *gen)
 {
    spe_comment(gen->f, -4, "END:");
-   /* return from function call */
-   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+   emit_epilogue(gen);
    return true;
 }
 
@@ -1420,6 +1461,10 @@ emit_instruction(struct codegen *gen,
       return emit_function_call(gen, inst, "spu_sin", 1);
    case TGSI_OPCODE_POW:
       return emit_function_call(gen, inst, "spu_pow", 2);
+   case TGSI_OPCODE_EXPBASE2:
+      return emit_function_call(gen, inst, "spu_exp2", 1);
+   case TGSI_OPCODE_LOGBASE2:
+      return emit_function_call(gen, inst, "spu_log2", 1);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
@@ -1532,6 +1577,7 @@ emit_declaration(struct cell_context *cell,
 }
 
 
+
 /**
  * Translate TGSI shader code to SPE instructions.  This is done when
  * the state tracker gives us a new shader (via pipe->create_fs_state()).
@@ -1571,12 +1617,14 @@ cell_gen_fragment_program(struct cell_context *cell,
 
    tgsi_parse_init(&parse, tokens);
 
+   emit_prologue(&gen);
+
    while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
       tgsi_parse_token(&parse);
 
       switch (parse.FullToken.Token.Type) {
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-         if (!emit_immediate(&gen,  &parse.FullToken.FullImmediate))
+         if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
             gen.error = true;
          break;
 
@@ -1595,7 +1643,6 @@ cell_gen_fragment_program(struct cell_context *cell,
       }
    }
 
-
    if (gen.error) {
       /* terminate the SPE code */
       return emit_END(&gen);
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 566df7f59e..18969005b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -73,8 +73,8 @@ emit_matrix_transpose(struct spe_function *p,
    int col3;
 
 
-   spe_lqd(p, shuf_hi, shuf_ptr, 3);
-   spe_lqd(p, shuf_lo, shuf_ptr, 4);
+   spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+   spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
    spe_shufb(p, t1, row0, row2, shuf_hi);
    spe_shufb(p, t2, row0, row2, shuf_lo);
 
@@ -122,13 +122,13 @@ emit_matrix_transpose(struct spe_function *p,
     */
    switch (count) {
    case 4:
-      spe_stqd(p, col3, dest_ptr, 3);
+      spe_stqd(p, col3, dest_ptr, 3 * 16);
    case 3:
-      spe_stqd(p, col2, dest_ptr, 2);
+      spe_stqd(p, col2, dest_ptr, 2 * 16);
    case 2:
-      spe_stqd(p, col1, dest_ptr, 1);
+      spe_stqd(p, col1, dest_ptr, 1 * 16);
    case 1:
-      spe_stqd(p, col0, dest_ptr, 0);
+      spe_stqd(p, col0, dest_ptr, 0 * 16);
    }
 
 
@@ -166,17 +166,17 @@ emit_fetch(struct spe_function *p,
    float scale_signed = 0.0;
    float scale_unsigned = 0.0;
 
-   spe_lqd(p, v0, in_ptr, 0 + offset[0]);
-   spe_lqd(p, v1, in_ptr, 1 + offset[0]);
-   spe_lqd(p, v2, in_ptr, 2 + offset[0]);
-   spe_lqd(p, v3, in_ptr, 3 + offset[0]);
+   spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+   spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+   spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+   spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
    offset[0] += 4;
    
    switch (bytes) {
    case 1:
       scale_signed = 1.0f / 127.0f;
       scale_unsigned = 1.0f / 255.0f;
-      spe_lqd(p, tmp, shuf_ptr, 1);
+      spe_lqd(p, tmp, shuf_ptr, 1 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -185,7 +185,7 @@ emit_fetch(struct spe_function *p,
    case 2:
       scale_signed = 1.0f / 32767.0f;
       scale_unsigned = 1.0f / 65535.0f;
-      spe_lqd(p, tmp, shuf_ptr, 2);
+      spe_lqd(p, tmp, shuf_ptr, 2 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -241,11 +241,11 @@ emit_fetch(struct spe_function *p,
 
    switch (count) {
    case 1:
-      spe_stqd(p, float_zero, out_ptr, 1);
+      spe_stqd(p, float_zero, out_ptr, 1 * 16);
    case 2:
-      spe_stqd(p, float_zero, out_ptr, 2);
+      spe_stqd(p, float_zero, out_ptr, 2 * 16);
    case 3:
-      spe_stqd(p, float_one, out_ptr, 3);
+      spe_stqd(p, float_one, out_ptr, 3 * 16);
    }
 
    if (float_zero != -1) {
-- 
cgit v1.2.3


From d3403b5482ee1c0faa0f42b8782ee3093a2f7b5e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:57:57 -0600
Subject: cell: add emit_RI10s() which does range checking on the 10-bit signed
 immediate field

This type of checking should be expanded to cover more instructions...
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 16 ++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 24 ++++++++++++++----------
 2 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index d0bacd08a6..dea1aed032 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -278,6 +278,16 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
+/** As above, but do range checking on signed immediate value */
+static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
+                       unsigned rA, int imm, const char *name)
+{
+    assert(imm <= 511);
+    assert(imm >= -512);
+    emit_RI10(p, op, rT, rA, imm, name);
+}
+
+
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
 		      int imm, const char *name)
 {
@@ -354,6 +364,12 @@ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
    emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
+#define EMIT_RI10s(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+   emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__);             \
+}
+
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 47dadb343c..d6a3c02f20 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -119,6 +119,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT_RI10(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
 			   int imm)
+#define EMIT_RI10s(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
 #define EMIT_RI16(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, int imm)
 #define EMIT_RI18(_name, _op) \
@@ -163,7 +166,7 @@ EMIT_RI16(spe_fsmbi, 0x065);
 EMIT_RR  (spe_ah,      0x0c8);
 EMIT_RI10(spe_ahi,     0x01d);
 EMIT_RR  (spe_a,       0x0c0);
-EMIT_RI10(spe_ai,      0x01c);
+EMIT_RI10s(spe_ai,      0x01c);
 EMIT_RR  (spe_sfh,     0x048);
 EMIT_RI10(spe_sfhi,    0x00d);
 EMIT_RR  (spe_sf,      0x040);
@@ -201,19 +204,19 @@ EMIT_R   (spe_xshw,    0x2ae);
 EMIT_R   (spe_xswd,    0x2a6);
 EMIT_RR  (spe_and,     0x0c1);
 EMIT_RR  (spe_andc,    0x2c1);
-EMIT_RI10(spe_andbi,   0x016);
-EMIT_RI10(spe_andhi,   0x015);
-EMIT_RI10(spe_andi,    0x014);
+EMIT_RI10s(spe_andbi,   0x016);
+EMIT_RI10s(spe_andhi,   0x015);
+EMIT_RI10s(spe_andi,    0x014);
 EMIT_RR  (spe_or,      0x041);
 EMIT_RR  (spe_orc,     0x2c9);
-EMIT_RI10(spe_orbi,    0x006);
-EMIT_RI10(spe_orhi,    0x005);
-EMIT_RI10(spe_ori,     0x004);
+EMIT_RI10s(spe_orbi,    0x006);
+EMIT_RI10s(spe_orhi,    0x005);
+EMIT_RI10s(spe_ori,     0x004);
 EMIT_R   (spe_orx,     0x1f0);
 EMIT_RR  (spe_xor,     0x241);
-EMIT_RI10(spe_xorbi,   0x026);
-EMIT_RI10(spe_xorhi,   0x025);
-EMIT_RI10(spe_xori,    0x024);
+EMIT_RI10s(spe_xorbi,   0x026);
+EMIT_RI10s(spe_xorhi,   0x025);
+EMIT_RI10s(spe_xori,    0x024);
 EMIT_RR  (spe_nand,    0x0c9);
 EMIT_RR  (spe_nor,     0x049);
 EMIT_RR  (spe_eqv,     0x249);
@@ -422,6 +425,7 @@ EMIT_R   (spe_wrch,       0x10d);
 #undef EMIT_RI7
 #undef EMIT_RI8
 #undef EMIT_RI10
+#undef EMIT_RI10s
 #undef EMIT_RI16
 #undef EMIT_RI18
 #undef EMIT_I16
-- 
cgit v1.2.3


From 7f15e34cfadbeb460d22f9549511694c2bd27495 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 12 Nov 2008 11:01:40 -0700
Subject: cell: fix typo in EMIT_ macro

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d6a3c02f20..4cde080a2c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -100,7 +100,7 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #endif /* RTASM_PPC_SPE_H */
 
 #ifndef EMIT_
-#define EMIT_(name, _op) \
+#define EMIT_(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT)
 #define EMIT_R(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA)
-- 
cgit v1.2.3


From 1cd15f03706f921f3a9995a4ee860b91496f4bd2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 12 Nov 2008 11:05:34 -0700
Subject: cell: move semicolons to silence warnings w/ other compilers

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 378 ++++++++++++++--------------
 1 file changed, 189 insertions(+), 189 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 4cde080a2c..f1500cef29 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -101,198 +101,198 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 #ifndef EMIT_
 #define EMIT_(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT)
+    extern void _name (struct spe_function *p, unsigned rT);
 #define EMIT_R(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA)
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA);
 #define EMIT_RR(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   unsigned rB)
+                       unsigned rB);
 #define EMIT_RRR(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   unsigned rB, unsigned rC)
+                       unsigned rB, unsigned rC);
 #define EMIT_RI7(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI8(_name, _op, bias) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI10(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI10s(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI16(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm)
+    extern void _name (struct spe_function *p, unsigned rT, int imm);
 #define EMIT_RI18(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm)
+    extern void _name (struct spe_function *p, unsigned rT, int imm);
 #define EMIT_I16(_name, _op) \
-    extern void _name (struct spe_function *p, int imm)
+    extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
 #endif /* EMIT_ */
 
 
 /* Memory load / store instructions
  */
-EMIT_RR  (spe_lqx,  0x1c4);
-EMIT_RI16(spe_lqa,  0x061);
-EMIT_RI16(spe_lqr,  0x067);
-EMIT_RR  (spe_stqx, 0x144);
-EMIT_RI16(spe_stqa, 0x041);
-EMIT_RI16(spe_stqr, 0x047);
-EMIT_RI7 (spe_cbd,  0x1f4);
-EMIT_RR  (spe_cbx,  0x1d4);
-EMIT_RI7 (spe_chd,  0x1f5);
-EMIT_RI7 (spe_chx,  0x1d5);
-EMIT_RI7 (spe_cwd,  0x1f6);
-EMIT_RI7 (spe_cwx,  0x1d6);
-EMIT_RI7 (spe_cdd,  0x1f7);
-EMIT_RI7 (spe_cdx,  0x1d7);
+EMIT_RR  (spe_lqx,  0x1c4)
+EMIT_RI16(spe_lqa,  0x061)
+EMIT_RI16(spe_lqr,  0x067)
+EMIT_RR  (spe_stqx, 0x144)
+EMIT_RI16(spe_stqa, 0x041)
+EMIT_RI16(spe_stqr, 0x047)
+EMIT_RI7 (spe_cbd,  0x1f4)
+EMIT_RR  (spe_cbx,  0x1d4)
+EMIT_RI7 (spe_chd,  0x1f5)
+EMIT_RI7 (spe_chx,  0x1d5)
+EMIT_RI7 (spe_cwd,  0x1f6)
+EMIT_RI7 (spe_cwx,  0x1d6)
+EMIT_RI7 (spe_cdd,  0x1f7)
+EMIT_RI7 (spe_cdx,  0x1d7)
 
 
 /* Constant formation instructions
  */
-EMIT_RI16(spe_ilh,   0x083);
-EMIT_RI16(spe_ilhu,  0x082);
-EMIT_RI16(spe_il,    0x081);
-EMIT_RI18(spe_ila,   0x021);
-EMIT_RI16(spe_iohl,  0x0c1);
-EMIT_RI16(spe_fsmbi, 0x065);
+EMIT_RI16(spe_ilh,   0x083)
+EMIT_RI16(spe_ilhu,  0x082)
+EMIT_RI16(spe_il,    0x081)
+EMIT_RI18(spe_ila,   0x021)
+EMIT_RI16(spe_iohl,  0x0c1)
+EMIT_RI16(spe_fsmbi, 0x065)
 
 
 /* Integer and logical instructions
  */
-EMIT_RR  (spe_ah,      0x0c8);
-EMIT_RI10(spe_ahi,     0x01d);
-EMIT_RR  (spe_a,       0x0c0);
-EMIT_RI10s(spe_ai,      0x01c);
-EMIT_RR  (spe_sfh,     0x048);
-EMIT_RI10(spe_sfhi,    0x00d);
-EMIT_RR  (spe_sf,      0x040);
-EMIT_RI10(spe_sfi,     0x00c);
-EMIT_RR  (spe_addx,    0x340);
-EMIT_RR  (spe_cg,      0x0c2);
-EMIT_RR  (spe_cgx,     0x342);
-EMIT_RR  (spe_sfx,     0x341);
-EMIT_RR  (spe_bg,      0x042);
-EMIT_RR  (spe_bgx,     0x343);
-EMIT_RR  (spe_mpy,     0x3c4);
-EMIT_RR  (spe_mpyu,    0x3cc);
-EMIT_RI10(spe_mpyi,    0x074);
-EMIT_RI10(spe_mpyui,   0x075);
-EMIT_RRR (spe_mpya,    0x00c);
-EMIT_RR  (spe_mpyh,    0x3c5);
-EMIT_RR  (spe_mpys,    0x3c7);
-EMIT_RR  (spe_mpyhh,   0x3c6);
-EMIT_RR  (spe_mpyhha,  0x346);
-EMIT_RR  (spe_mpyhhu,  0x3ce);
-EMIT_RR  (spe_mpyhhau, 0x34e);
-EMIT_R   (spe_clz,     0x2a5);
-EMIT_R   (spe_cntb,    0x2b4);
-EMIT_R   (spe_fsmb,    0x1b6);
-EMIT_R   (spe_fsmh,    0x1b5);
-EMIT_R   (spe_fsm,     0x1b4);
-EMIT_R   (spe_gbb,     0x1b2);
-EMIT_R   (spe_gbh,     0x1b1);
-EMIT_R   (spe_gb,      0x1b0);
-EMIT_RR  (spe_avgb,    0x0d3);
-EMIT_RR  (spe_absdb,   0x053);
-EMIT_RR  (spe_sumb,    0x253);
-EMIT_R   (spe_xsbh,    0x2b6);
-EMIT_R   (spe_xshw,    0x2ae);
-EMIT_R   (spe_xswd,    0x2a6);
-EMIT_RR  (spe_and,     0x0c1);
-EMIT_RR  (spe_andc,    0x2c1);
-EMIT_RI10s(spe_andbi,   0x016);
-EMIT_RI10s(spe_andhi,   0x015);
-EMIT_RI10s(spe_andi,    0x014);
-EMIT_RR  (spe_or,      0x041);
-EMIT_RR  (spe_orc,     0x2c9);
-EMIT_RI10s(spe_orbi,    0x006);
-EMIT_RI10s(spe_orhi,    0x005);
-EMIT_RI10s(spe_ori,     0x004);
-EMIT_R   (spe_orx,     0x1f0);
-EMIT_RR  (spe_xor,     0x241);
-EMIT_RI10s(spe_xorbi,   0x026);
-EMIT_RI10s(spe_xorhi,   0x025);
-EMIT_RI10s(spe_xori,    0x024);
-EMIT_RR  (spe_nand,    0x0c9);
-EMIT_RR  (spe_nor,     0x049);
-EMIT_RR  (spe_eqv,     0x249);
-EMIT_RRR (spe_selb,    0x008);
-EMIT_RRR (spe_shufb,   0x00b);
+EMIT_RR  (spe_ah,      0x0c8)
+EMIT_RI10(spe_ahi,     0x01d)
+EMIT_RR  (spe_a,       0x0c0)
+EMIT_RI10s(spe_ai,      0x01c)
+EMIT_RR  (spe_sfh,     0x048)
+EMIT_RI10(spe_sfhi,    0x00d)
+EMIT_RR  (spe_sf,      0x040)
+EMIT_RI10(spe_sfi,     0x00c)
+EMIT_RR  (spe_addx,    0x340)
+EMIT_RR  (spe_cg,      0x0c2)
+EMIT_RR  (spe_cgx,     0x342)
+EMIT_RR  (spe_sfx,     0x341)
+EMIT_RR  (spe_bg,      0x042)
+EMIT_RR  (spe_bgx,     0x343)
+EMIT_RR  (spe_mpy,     0x3c4)
+EMIT_RR  (spe_mpyu,    0x3cc)
+EMIT_RI10(spe_mpyi,    0x074)
+EMIT_RI10(spe_mpyui,   0x075)
+EMIT_RRR (spe_mpya,    0x00c)
+EMIT_RR  (spe_mpyh,    0x3c5)
+EMIT_RR  (spe_mpys,    0x3c7)
+EMIT_RR  (spe_mpyhh,   0x3c6)
+EMIT_RR  (spe_mpyhha,  0x346)
+EMIT_RR  (spe_mpyhhu,  0x3ce)
+EMIT_RR  (spe_mpyhhau, 0x34e)
+EMIT_R   (spe_clz,     0x2a5)
+EMIT_R   (spe_cntb,    0x2b4)
+EMIT_R   (spe_fsmb,    0x1b6)
+EMIT_R   (spe_fsmh,    0x1b5)
+EMIT_R   (spe_fsm,     0x1b4)
+EMIT_R   (spe_gbb,     0x1b2)
+EMIT_R   (spe_gbh,     0x1b1)
+EMIT_R   (spe_gb,      0x1b0)
+EMIT_RR  (spe_avgb,    0x0d3)
+EMIT_RR  (spe_absdb,   0x053)
+EMIT_RR  (spe_sumb,    0x253)
+EMIT_R   (spe_xsbh,    0x2b6)
+EMIT_R   (spe_xshw,    0x2ae)
+EMIT_R   (spe_xswd,    0x2a6)
+EMIT_RR  (spe_and,     0x0c1)
+EMIT_RR  (spe_andc,    0x2c1)
+EMIT_RI10s(spe_andbi,   0x016)
+EMIT_RI10s(spe_andhi,   0x015)
+EMIT_RI10s(spe_andi,    0x014)
+EMIT_RR  (spe_or,      0x041)
+EMIT_RR  (spe_orc,     0x2c9)
+EMIT_RI10s(spe_orbi,    0x006)
+EMIT_RI10s(spe_orhi,    0x005)
+EMIT_RI10s(spe_ori,     0x004)
+EMIT_R   (spe_orx,     0x1f0)
+EMIT_RR  (spe_xor,     0x241)
+EMIT_RI10s(spe_xorbi,   0x026)
+EMIT_RI10s(spe_xorhi,   0x025)
+EMIT_RI10s(spe_xori,    0x024)
+EMIT_RR  (spe_nand,    0x0c9)
+EMIT_RR  (spe_nor,     0x049)
+EMIT_RR  (spe_eqv,     0x249)
+EMIT_RRR (spe_selb,    0x008)
+EMIT_RRR (spe_shufb,   0x00b)
 
 
 /* Shift and rotate instructions
  */
-EMIT_RR  (spe_shlh,      0x05f);
-EMIT_RI7 (spe_shlhi,     0x07f);
-EMIT_RR  (spe_shl,       0x05b);
-EMIT_RI7 (spe_shli,      0x07b);
-EMIT_RR  (spe_shlqbi,    0x1db);
-EMIT_RI7 (spe_shlqbii,   0x1fb);
-EMIT_RR  (spe_shlqby,    0x1df);
-EMIT_RI7 (spe_shlqbyi,   0x1ff);
-EMIT_RR  (spe_shlqbybi,  0x1cf);
-EMIT_RR  (spe_roth,      0x05c);
-EMIT_RI7 (spe_rothi,     0x07c);
-EMIT_RR  (spe_rot,       0x058);
-EMIT_RI7 (spe_roti,      0x078);
-EMIT_RR  (spe_rotqby,    0x1dc);
-EMIT_RI7 (spe_rotqbyi,   0x1fc);
-EMIT_RR  (spe_rotqbybi,  0x1cc);
-EMIT_RR  (spe_rotqbi,    0x1d8);
-EMIT_RI7 (spe_rotqbii,   0x1f8);
-EMIT_RR  (spe_rothm,     0x05d);
-EMIT_RI7 (spe_rothmi,    0x07d);
-EMIT_RR  (spe_rotm,      0x059);
-EMIT_RI7 (spe_rotmi,     0x079);
-EMIT_RR  (spe_rotqmby,   0x1dd);
-EMIT_RI7 (spe_rotqmbyi,  0x1fd);
-EMIT_RR  (spe_rotqmbybi, 0x1cd);
-EMIT_RR  (spe_rotqmbi,   0x1c9);
-EMIT_RI7 (spe_rotqmbii,  0x1f9);
-EMIT_RR  (spe_rotmah,    0x05e);
-EMIT_RI7 (spe_rotmahi,   0x07e);
-EMIT_RR  (spe_rotma,     0x05a);
-EMIT_RI7 (spe_rotmai,    0x07a);
+EMIT_RR  (spe_shlh,      0x05f)
+EMIT_RI7 (spe_shlhi,     0x07f)
+EMIT_RR  (spe_shl,       0x05b)
+EMIT_RI7 (spe_shli,      0x07b)
+EMIT_RR  (spe_shlqbi,    0x1db)
+EMIT_RI7 (spe_shlqbii,   0x1fb)
+EMIT_RR  (spe_shlqby,    0x1df)
+EMIT_RI7 (spe_shlqbyi,   0x1ff)
+EMIT_RR  (spe_shlqbybi,  0x1cf)
+EMIT_RR  (spe_roth,      0x05c)
+EMIT_RI7 (spe_rothi,     0x07c)
+EMIT_RR  (spe_rot,       0x058)
+EMIT_RI7 (spe_roti,      0x078)
+EMIT_RR  (spe_rotqby,    0x1dc)
+EMIT_RI7 (spe_rotqbyi,   0x1fc)
+EMIT_RR  (spe_rotqbybi,  0x1cc)
+EMIT_RR  (spe_rotqbi,    0x1d8)
+EMIT_RI7 (spe_rotqbii,   0x1f8)
+EMIT_RR  (spe_rothm,     0x05d)
+EMIT_RI7 (spe_rothmi,    0x07d)
+EMIT_RR  (spe_rotm,      0x059)
+EMIT_RI7 (spe_rotmi,     0x079)
+EMIT_RR  (spe_rotqmby,   0x1dd)
+EMIT_RI7 (spe_rotqmbyi,  0x1fd)
+EMIT_RR  (spe_rotqmbybi, 0x1cd)
+EMIT_RR  (spe_rotqmbi,   0x1c9)
+EMIT_RI7 (spe_rotqmbii,  0x1f9)
+EMIT_RR  (spe_rotmah,    0x05e)
+EMIT_RI7 (spe_rotmahi,   0x07e)
+EMIT_RR  (spe_rotma,     0x05a)
+EMIT_RI7 (spe_rotmai,    0x07a)
 
 
 /* Compare, branch, and halt instructions
  */
-EMIT_RR  (spe_heq,       0x3d8);
-EMIT_RI10(spe_heqi,      0x07f);
-EMIT_RR  (spe_hgt,       0x258);
-EMIT_RI10(spe_hgti,      0x04f);
-EMIT_RR  (spe_hlgt,      0x2d8);
-EMIT_RI10(spe_hlgti,     0x05f);
-EMIT_RR  (spe_ceqb,      0x3d0);
-EMIT_RI10(spe_ceqbi,     0x07e);
-EMIT_RR  (spe_ceqh,      0x3c8);
-EMIT_RI10(spe_ceqhi,     0x07d);
-EMIT_RR  (spe_ceq,       0x3c0);
-EMIT_RI10(spe_ceqi,      0x07c);
-EMIT_RR  (spe_cgtb,      0x250);
-EMIT_RI10(spe_cgtbi,     0x04e);
-EMIT_RR  (spe_cgth,      0x248);
-EMIT_RI10(spe_cgthi,     0x04d);
-EMIT_RR  (spe_cgt,       0x240);
-EMIT_RI10(spe_cgti,      0x04c);
-EMIT_RR  (spe_clgtb,     0x2d0);
-EMIT_RI10(spe_clgtbi,    0x05e);
-EMIT_RR  (spe_clgth,     0x2c8);
-EMIT_RI10(spe_clgthi,    0x05d);
-EMIT_RR  (spe_clgt,      0x2c0);
-EMIT_RI10(spe_clgti,     0x05c);
-EMIT_I16 (spe_br,        0x064);
-EMIT_I16 (spe_bra,       0x060);
-EMIT_RI16(spe_brsl,      0x066);
-EMIT_RI16(spe_brasl,     0x062);
-EMIT_RI16(spe_brnz,      0x042);
-EMIT_RI16(spe_brz,       0x040);
-EMIT_RI16(spe_brhnz,     0x046);
-EMIT_RI16(spe_brhz,      0x044);
+EMIT_RR  (spe_heq,       0x3d8)
+EMIT_RI10(spe_heqi,      0x07f)
+EMIT_RR  (spe_hgt,       0x258)
+EMIT_RI10(spe_hgti,      0x04f)
+EMIT_RR  (spe_hlgt,      0x2d8)
+EMIT_RI10(spe_hlgti,     0x05f)
+EMIT_RR  (spe_ceqb,      0x3d0)
+EMIT_RI10(spe_ceqbi,     0x07e)
+EMIT_RR  (spe_ceqh,      0x3c8)
+EMIT_RI10(spe_ceqhi,     0x07d)
+EMIT_RR  (spe_ceq,       0x3c0)
+EMIT_RI10(spe_ceqi,      0x07c)
+EMIT_RR  (spe_cgtb,      0x250)
+EMIT_RI10(spe_cgtbi,     0x04e)
+EMIT_RR  (spe_cgth,      0x248)
+EMIT_RI10(spe_cgthi,     0x04d)
+EMIT_RR  (spe_cgt,       0x240)
+EMIT_RI10(spe_cgti,      0x04c)
+EMIT_RR  (spe_clgtb,     0x2d0)
+EMIT_RI10(spe_clgtbi,    0x05e)
+EMIT_RR  (spe_clgth,     0x2c8)
+EMIT_RI10(spe_clgthi,    0x05d)
+EMIT_RR  (spe_clgt,      0x2c0)
+EMIT_RI10(spe_clgti,     0x05c)
+EMIT_I16 (spe_br,        0x064)
+EMIT_I16 (spe_bra,       0x060)
+EMIT_RI16(spe_brsl,      0x066)
+EMIT_RI16(spe_brasl,     0x062)
+EMIT_RI16(spe_brnz,      0x042)
+EMIT_RI16(spe_brz,       0x040)
+EMIT_RI16(spe_brhnz,     0x046)
+EMIT_RI16(spe_brhz,      0x044)
 
 extern void
 spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
@@ -375,46 +375,46 @@ spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
 
 /* Floating-point instructions
  */
-EMIT_RR  (spe_fa,         0x2c4);
-EMIT_RR  (spe_dfa,        0x2cc);
-EMIT_RR  (spe_fs,         0x2c5);
-EMIT_RR  (spe_dfs,        0x2cd);
-EMIT_RR  (spe_fm,         0x2c6);
-EMIT_RR  (spe_dfm,        0x2ce);
-EMIT_RRR (spe_fma,        0x00e);
-EMIT_RR  (spe_dfma,       0x35c);
-EMIT_RRR (spe_fnms,       0x00d);
-EMIT_RR  (spe_dfnms,      0x35e);
-EMIT_RRR (spe_fms,        0x00f);
-EMIT_RR  (spe_dfms,       0x35d);
-EMIT_RR  (spe_dfnma,      0x35f);
-EMIT_R   (spe_frest,      0x1b8);
-EMIT_R   (spe_frsqest,    0x1b9);
-EMIT_RR  (spe_fi,         0x3d4);
-EMIT_RI8 (spe_csflt,      0x1da, 155);
-EMIT_RI8 (spe_cflts,      0x1d8, 173);
-EMIT_RI8 (spe_cuflt,      0x1db, 155);
-EMIT_RI8 (spe_cfltu,      0x1d9, 173);
-EMIT_R   (spe_frds,       0x3b9);
-EMIT_R   (spe_fesd,       0x3b8);
-EMIT_RR  (spe_dfceq,      0x3c3);
-EMIT_RR  (spe_dfcmeq,     0x3cb);
-EMIT_RR  (spe_dfcgt,      0x2c3);
-EMIT_RR  (spe_dfcmgt,     0x2cb);
-EMIT_RI7 (spe_dftsv,      0x3bf);
-EMIT_RR  (spe_fceq,       0x3c2);
-EMIT_RR  (spe_fcmeq,      0x3ca);
-EMIT_RR  (spe_fcgt,       0x2c2);
-EMIT_RR  (spe_fcmgt,      0x2ca);
-EMIT_R   (spe_fscrwr,     0x3ba);
-EMIT_    (spe_fscrrd,     0x398);
+EMIT_RR  (spe_fa,         0x2c4)
+EMIT_RR  (spe_dfa,        0x2cc)
+EMIT_RR  (spe_fs,         0x2c5)
+EMIT_RR  (spe_dfs,        0x2cd)
+EMIT_RR  (spe_fm,         0x2c6)
+EMIT_RR  (spe_dfm,        0x2ce)
+EMIT_RRR (spe_fma,        0x00e)
+EMIT_RR  (spe_dfma,       0x35c)
+EMIT_RRR (spe_fnms,       0x00d)
+EMIT_RR  (spe_dfnms,      0x35e)
+EMIT_RRR (spe_fms,        0x00f)
+EMIT_RR  (spe_dfms,       0x35d)
+EMIT_RR  (spe_dfnma,      0x35f)
+EMIT_R   (spe_frest,      0x1b8)
+EMIT_R   (spe_frsqest,    0x1b9)
+EMIT_RR  (spe_fi,         0x3d4)
+EMIT_RI8 (spe_csflt,      0x1da, 155)
+EMIT_RI8 (spe_cflts,      0x1d8, 173)
+EMIT_RI8 (spe_cuflt,      0x1db, 155)
+EMIT_RI8 (spe_cfltu,      0x1d9, 173)
+EMIT_R   (spe_frds,       0x3b9)
+EMIT_R   (spe_fesd,       0x3b8)
+EMIT_RR  (spe_dfceq,      0x3c3)
+EMIT_RR  (spe_dfcmeq,     0x3cb)
+EMIT_RR  (spe_dfcgt,      0x2c3)
+EMIT_RR  (spe_dfcmgt,     0x2cb)
+EMIT_RI7 (spe_dftsv,      0x3bf)
+EMIT_RR  (spe_fceq,       0x3c2)
+EMIT_RR  (spe_fcmeq,      0x3ca)
+EMIT_RR  (spe_fcgt,       0x2c2)
+EMIT_RR  (spe_fcmgt,      0x2ca)
+EMIT_R   (spe_fscrwr,     0x3ba)
+EMIT_    (spe_fscrrd,     0x398)
 
 
 /* Channel instructions
  */
-EMIT_R   (spe_rdch,       0x00d);
-EMIT_R   (spe_rdchcnt,    0x00f);
-EMIT_R   (spe_wrch,       0x10d);
+EMIT_R   (spe_rdch,       0x00d)
+EMIT_R   (spe_rdchcnt,    0x00f)
+EMIT_R   (spe_wrch,       0x10d)
 
 
 #ifdef UNDEF_EMIT_MACROS
-- 
cgit v1.2.3


From 2c29a6896a4a026ed3568db9caf90f422b711d8b Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 13 Nov 2008 11:22:12 -0700
Subject: CELL: fix stencil twiddling, stencil invert

Many stencil tests were failing because of a failure to read the
stencil buffer, due to "twiddling" (or "untwiddling") "an unsupported
texture format".  This is fixed for the case of a stencil/Z S8Z24 format
(which twiddles just like the 32-bit color formats).

tests/stencilwrap.c was failing on the GL_INVERT test, because
the emitted code for "spe_xori" turned out not to be an actual
"xori" instruction, but rather a "stqd" instruction, because
of a typo in the rtasm code.  This is now fixed, and
tests/stencilwrap now works.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 6 +++---
 src/gallium/drivers/cell/ppu/cell_texture.c | 6 ++++--
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index f1500cef29..7c211ffc51 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -214,9 +214,9 @@ EMIT_RI10s(spe_orhi,    0x005)
 EMIT_RI10s(spe_ori,     0x004)
 EMIT_R   (spe_orx,     0x1f0)
 EMIT_RR  (spe_xor,     0x241)
-EMIT_RI10s(spe_xorbi,   0x026)
-EMIT_RI10s(spe_xorhi,   0x025)
-EMIT_RI10s(spe_xori,    0x024)
+EMIT_RI10s(spe_xorbi,   0x046)
+EMIT_RI10s(spe_xorhi,   0x045)
+EMIT_RI10s(spe_xori,    0x044)
 EMIT_RR  (spe_nand,    0x0c9)
 EMIT_RR  (spe_nor,     0x049)
 EMIT_RR  (spe_eqv,     0x249)
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index ae88d06912..47cd9605c8 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -314,6 +314,7 @@ cell_twiddle_texture(struct pipe_screen *screen,
    switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
    case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
       {
          int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
          int offset = bufWidth * bufHeight * 4 * surface->face;
@@ -337,7 +338,7 @@ cell_twiddle_texture(struct pipe_screen *screen,
       }
       break;
    default:
-      printf("Cell: twiddle unsupported texture format\n");
+      printf("Cell: twiddle unsupported texture format 0x%x\n", ct->base.format);
       ;
    }
 
@@ -363,6 +364,7 @@ cell_untwiddle_texture(struct pipe_screen *screen,
    switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
    case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
       {
          int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
          int offset = surface->stride * texHeight * 4 * surface->face;
@@ -382,7 +384,7 @@ cell_untwiddle_texture(struct pipe_screen *screen,
    default:
       {
          ct->untiled_data[level] = NULL;
-         printf("Cell: untwiddle unsupported texture format\n");
+         printf("Cell: untwiddle unsupported texture format 0x%x\n", ct->base.format);
       }
    }
 
-- 
cgit v1.2.3


From 11fc390f6478526d4f0bdb4b7e628284da31b3b9 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 21 Nov 2008 11:42:14 -0700
Subject: CELL: use variant-length fragment ops programs

This is a set of changes that optimizes the memory use of fragment
operation programs (by using and transmitting only as much memory as is
needed for the fragment ops programs, instead of maximal sizes), as well
as eliminates the dependency on hard-coded maximal program sizes.  State
that is not dependent on fragment facing (i.e. that isn't using
two-sided stenciling) will only save and transmit a single
fragment operation program, instead of two identical programs.

- Added the ability to emit a LNOP (No Operation (Load)) instruction.
  This is used to pad the generated fragment operations programs to
  a multiple of 8 bytes, which is necessary for proper operation of
  the dual instruction pipeline, and also required for proper SPU-side
  decoding.

- Added the ability to allocate and manage a variant-length
  struct cell_command_fragment_ops.  This structure now puts the
  generated function field at the end, where it can be as large
  as necessary.

- On the PPU side, we now combine the generated front-facing and
  back-facing code into a single variant-length buffer (and only use one
  if the two sets of code are identical) for transmission to the SPU.

- On the SPU side, we pull the correct sizes out of the buffer,
  allocate a new code buffer if the one we have isn't large enough,
  and save the code to that buffer.  The buffer is deallocated when
  the SPU exits.

- Commented out the emit_fetch() static function, which was not being used.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |   7 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  11 ++-
 src/gallium/auxiliary/util/u_memory.h            |   2 +
 src/gallium/drivers/cell/common.h                |  31 +++++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c |   7 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |  77 ++++++++++++++--
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |   3 +
 src/gallium/drivers/cell/spu/spu_command.c       | 111 +++++++++++++++++------
 src/gallium/drivers/cell/spu/spu_command.h       |  32 ++++++-
 src/gallium/drivers/cell/spu/spu_main.c          |  15 +--
 src/gallium/drivers/cell/spu/spu_main.h          |   4 +-
 11 files changed, 232 insertions(+), 68 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 1bd9f1c8dd..b9a75ae559 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -341,7 +341,11 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-
+#define EMIT(_name, _op) \
+void _name (struct spe_function *p) \
+{ \
+   emit_RR(p, _op, 0, 0, 0, __FUNCTION__); \
+}
 
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
@@ -713,7 +717,6 @@ hbrr;
 #if 0
 stop;
 EMIT_RR  (spe_stopd, 0x140);
-EMIT_    (spe_lnop,  0x001);
 EMIT_    (spe_nop,   0x201);
 sync;
 EMIT_    (spe_dsync, 0x003);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7c211ffc51..f9ad2acacd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -99,7 +99,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 #endif /* RTASM_PPC_SPE_H */
 
-#ifndef EMIT_
+#ifndef EMIT
+#define EMIT(_name, _op) \
+    extern void _name (struct spe_function *p);
 #define EMIT_(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT);
 #define EMIT_R(_name, _op) \
@@ -129,7 +131,7 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT_I16(_name, _op) \
     extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
-#endif /* EMIT_ */
+#endif /* EMIT */
 
 
 /* Memory load / store instructions
@@ -294,6 +296,10 @@ EMIT_RI16(spe_brz,       0x040)
 EMIT_RI16(spe_brhnz,     0x046)
 EMIT_RI16(spe_brhz,      0x044)
 
+/* Control instructions
+ */
+EMIT     (spe_lnop,      0x001)
+
 extern void
 spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
 
@@ -418,6 +424,7 @@ EMIT_R   (spe_wrch,       0x10d)
 
 
 #ifdef UNDEF_EMIT_MACROS
+#undef EMIT
 #undef EMIT_
 #undef EMIT_R
 #undef EMIT_RR
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index 857102719d..1a6b596421 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -151,6 +151,8 @@ REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
 
 #define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
 
+/* CALLOC a struct T plus more_size extra bytes for a variable-length trailing member. */
+#define CALLOC_VARIANT_LENGTH_STRUCT(T,more_size)   ((struct T *) CALLOC(1, sizeof(struct T) + (more_size)))
 
 /**
  * Return memory on given byte alignment
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index a670ed3c6e..98554d7f52 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -121,11 +121,6 @@
 #define CELL_DEBUG_CMD                  (1 << 5)
 #define CELL_DEBUG_CACHE                (1 << 6)
 
-/** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 128
-
-
-
 #define CELL_FENCE_IDLE      0
 #define CELL_FENCE_EMITTED   1
 #define CELL_FENCE_SIGNALLED 2
@@ -153,18 +148,36 @@ struct cell_command_fence
 
 /**
  * Command to specify per-fragment operations state and generated code.
- * Note that the dsa, blend, blend_color fields are really only needed
+ * Note that this is a variant-length structure, allocated with as 
+ * much memory as needed to hold the generated code; the "code"
+ * field *must* be the last field in the structure.  Also, the entire
+ * length of the structure (including the variant code field) must be
+ * a multiple of 8 bytes; we require that this structure itself be
+ * a multiple of 8 bytes, and that the generated code also be a multiple
+ * of 8 bytes.
+ *
+ * Also note that the dsa, blend, blend_color fields are really only needed
  * for the fallback/C per-pixel code.  They're not used when we generate
- * dynamic SPU fragment code (which is the normal case).
+ * dynamic SPU fragment code (which is the normal case), and will eventually
+ * be removed from this structure.
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+
+   /* Fields for the fallback case */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
-   unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS];
-   unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS];
+
+   /* Fields for the generated SPU code */
+   unsigned total_code_size;
+   unsigned front_code_index;
+   unsigned back_code_index;
+   /* this field has variant length, and must be the last field in 
+    * the structure
+    */
+   unsigned code[0];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 82336d6635..2c64eb1bcc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1776,7 +1776,10 @@ gen_stencil_depth_test(struct spe_function *f,
  * \param cell  the rendering context (in)
  * \param facing whether the generated code is for front-facing or 
  *              back-facing fragments
- * \param f     the generated function (out)
+ * \param f     the generated function (in/out); on input, the function
+ *              must already have been initialized.  On exit, the
+ *              fragment ops instructions have been appended to whatever
+ *              instructions the function already contained.
  */
 void
 cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f)
@@ -1808,8 +1811,6 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
-   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-
    if (cell->debug_flags & CELL_DEBUG_ASM) {
       spe_print_code(f, true);
       spe_indent(f, 8);
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 031b27f11f..0a0af81f53 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -76,30 +76,86 @@ lookup_fragment_ops(struct cell_context *cell)
     */
    if (!ops) {
       struct spe_function spe_code_front, spe_code_back;
+      unsigned int facing_dependent, total_code_size;
 
       if (0)
          debug_printf("**** Create New Fragment Ops\n");
 
-      /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-      spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      /* Prepare the buffer that will hold the generated code.  The
+       * "0" passed in for the size means that the SPE code will
+       * use a default size.
+       */
+      spe_init_func(&spe_code_front, 0);
+      spe_init_func(&spe_code_back, 0);
 
-      /* generate new code.  Always generate new code for both front-facing
+      /* Generate new code.  Always generate new code for both front-facing
        * and back-facing fragments, even if it's the same code in both
        * cases.
        */
       cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
       cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
 
-      /* alloc new fragment ops command */
-      ops = CALLOC_STRUCT(cell_command_fragment_ops);
+      /* Make sure the code is a multiple of 8 bytes long; this is
+       * required to ensure that the dual pipe instruction alignment
+       * is correct.  It's also important for the SPU unpacking,
+       * which assumes 8-byte boundaries.
+       */
+      unsigned int front_code_size = spe_code_size(&spe_code_front);
+      while (front_code_size % 8 != 0) {
+         spe_lnop(&spe_code_front);
+         front_code_size = spe_code_size(&spe_code_front);
+      }
+      unsigned int back_code_size = spe_code_size(&spe_code_back);
+      while (back_code_size % 8 != 0) {
+         spe_lnop(&spe_code_back);
+         back_code_size = spe_code_size(&spe_code_back);
+      }
 
+      /* Determine whether the code we generated is facing-dependent, by
+       * determining whether the generated code is different for the front-
+       * and back-facing fragments.
+       */
+      if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) {
+         /* Code is identical; only need one copy. */
+         facing_dependent = 0;
+         total_code_size = front_code_size;
+      }
+      else {
+         /* Code is different for front-facing and back-facing fragments.
+          * Need to send both copies.
+          */
+         facing_dependent = 1;
+         total_code_size = front_code_size + back_code_size;
+      }
+
+      /* alloc new fragment ops command.  Note that this structure
+       * has variant length based on the total code size required.
+       */
+      ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
       /* populate the new cell_command_fragment_ops object */
       ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front));
-      memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back));
+      ops->total_code_size = total_code_size;
+      ops->front_code_index = 0;
+      memcpy(ops->code, spe_code_front.store, front_code_size);
+      if (facing_dependent) {
+        /* We have separate front- and back-facing code.  Append the
+         * back-facing code to the buffer.  Be careful because the code
+         * size is in bytes, but the buffer is of unsigned elements.
+         */
+        ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]);
+        memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size);
+      }
+      else {
+        /* Use the same code for front- and back-facing fragments */
+        ops->back_code_index = ops->front_code_index;
+      }
+
+      /* Set the fields for the fallback case.  Note that these fields
+       * (and the whole fallback case) will eventually go away.
+       */
       ops->dsa = *cell->depth_stencil;
       ops->blend = *cell->blend;
+      ops->blend_color = cell->blend_color;
 
       /* insert cell_command_fragment_ops object into keymap/cache */
       util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
@@ -200,9 +256,10 @@ cell_emit_state(struct cell_context *cell)
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
       struct cell_command_fragment_ops *fops, *fops_cmd;
-      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+      /* Note that cell_command_fragment_ops is a variant-sized record */
       fops = lookup_fragment_ops(cell);
-      memcpy(fops_cmd, fops, sizeof(*fops));
+      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+      memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 18969005b0..9cba537d9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -145,6 +145,8 @@ emit_matrix_transpose(struct spe_function *p,
 }
 
 
+#if 0
+/* This appears to not be used currently */
 static void
 emit_fetch(struct spe_function *p,
 	   unsigned in_ptr, unsigned *offset,
@@ -256,6 +258,7 @@ emit_fetch(struct spe_function *p,
       spe_release_register(p, float_one);
    }
 }
+#endif
 
 
 void cell_update_vertex_fetch(struct draw_context *draw)
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d5faf4e3aa..8500d19754 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -210,45 +210,72 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
-   static int warned = 0;
-
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info (for fallback case only) */
+
+   /* Copy state info (for fallback case only - this will eventually
+    * go away when the fallback case goes away)
+    */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
    memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
 
-   /* Parity twist!  For now, always use the fallback code by default,
-    * only switching to codegen when specifically requested.  This
-    * allows us to develop freely without risking taking down the
-    * branch.
-    *
-    * Later, the parity of this check will be reversed, so that
-    * codegen is *always* used, unless we specifically indicate that
-    * we don't want it.
-    *
-    * Eventually, the option will be removed completely, because in
-    * final code we'll always use codegen and won't even provide the
-    * raw state records that the fallback code requires.
+   /* Make sure the SPU knows which buffers it's expected to read when
+    * it's told to pull tiles.
     */
-   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front;
-      spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back;
-   }
-   else {
-      /* otherwise, the default fallback code remains in place */
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
+
+   /* If we're forcing the fallback code to be used (for debug purposes),
+    * install that.  Otherwise install the incoming SPU code.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) {
+      static unsigned int warned = 0;
       if (!warned) {
          fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
          warned = 1;
       }
+      /* The following two lines aren't really necessary if you
+       * know the debug flags won't change during a run, and if you
+       * know that the function pointers are initialized correctly.
+       * We set them here to allow a person to change the debug
+       * flags during a run (from inside a debugger).
+       */
+      spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+      spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+      return;
    }
 
-   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
-}
+   /* Make sure the SPU code buffer is large enough to hold the incoming code.
+    * Note that we *don't* use align_malloc() and align_free(), because
+    * those utility functions are *not* available in SPU code.
+    * */
+   if (spu.fragment_ops_code_size < fops->total_code_size) {
+      if (spu.fragment_ops_code != NULL) {
+         free(spu.fragment_ops_code);
+      }
+      spu.fragment_ops_code_size = fops->total_code_size;
+      spu.fragment_ops_code = malloc(fops->total_code_size);
+      if (spu.fragment_ops_code == NULL) {
+         /* Allocation failed: reset the buffer state and use the fallback. */
+         fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%u bytes) - using fallback\n", fops->total_code_size);
+         spu.fragment_ops_code = NULL;
+         spu.fragment_ops_code_size = 0;
+         spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+         spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+         return;
+      }
+   }
 
+   /* Copy the SPU code from the command buffer to the spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size);
+
+   /* Set the pointers for the front-facing and back-facing fragments
+    * to the specified offsets within the code.  Note that if the
+    * front-facing and back-facing code are the same, they'll have
+    * the same offset.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index];
+   spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index];
+}
 
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
@@ -588,7 +615,8 @@ cmd_batch(uint opcode)
             struct cell_command_fragment_ops *fops
                = (struct cell_command_fragment_ops *) &buffer[pos];
             cmd_state_fragment_ops(fops);
-            pos += sizeof(*fops) / 8;
+            /* This is a variant-sized command */
+            pos += (sizeof(*fops) + fops->total_code_size)/ 8;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_PROGRAM:
@@ -756,3 +784,32 @@ command_loop(void)
    if (spu.init.debug_flags & CELL_DEBUG_CACHE)
       spu_dcache_report();
 }
+
+/* Initialize this module; we manage the fragment ops buffer here. */
+void
+spu_command_init(void)
+{
+   /* Install default/fallback fragment processing function.
+    * This will normally be overridden by a code-gen'd function
+    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+
+   /* Set up the basic empty buffer for code-gen'ed fragment ops */
+   spu.fragment_ops_code = NULL;
+   spu.fragment_ops_code_size = 0;
+}
+
+void
+spu_command_close(void)
+{
+   /* Deallocate the code-gen buffer for fragment ops, and reset the
+    * fragment ops functions to their initial setting (just to leave
+    * things in a good state).  free(NULL) is a no-op, so the pointer
+    * needs no guard; spu_command_init() then clears the stale pointer
+    * and size.
+    */
+   free(spu.fragment_ops_code);
+   spu_command_init();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
index 853e9aa549..83dcdade28 100644
--- a/src/gallium/drivers/cell/spu/spu_command.h
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -1,7 +1,35 @@
-
-
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 extern void
 command_loop(void);
 
+extern void
+spu_command_init(void);
 
+extern void
+spu_command_close(void);
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 7033f6037d..97c86d194d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -58,17 +58,8 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
-
-   /* Install default/fallback fragment processing function.
-    * This will normally be overriden by a code-gen'd function
-    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
-    */
-   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
-   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
 }
 
-
-
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -91,11 +82,11 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0);
+   ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0);
    ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
+   spu_command_init();
 
    D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
@@ -120,5 +111,7 @@ main(main_param_t speid, main_param_t argp)
 
    command_loop();
 
+   spu_command_close();
+
    return 0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 24cf7d77ce..33767e7c51 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -169,8 +169,8 @@ struct spu_global
    ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
-   uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-   uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   uint *fragment_ops_code;
+   uint fragment_ops_code_size;
    /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
    spu_fragment_ops_func fragment_ops[2];
 
-- 
cgit v1.2.3


From c4a782041b19cb4a08712384b19be25b79acba3c Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Sun, 11 Jan 2009 14:22:00 -0700
Subject: cell: datatype clean-ups in SPE rtasm

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 123 ++++++++++++++--------------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h |  81 +++++++++---------
 2 files changed, 99 insertions(+), 105 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 071bc2015c..53a0e722cf 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -213,8 +213,8 @@ emit_instruction(struct spe_function *p, uint32_t inst_bits)
 
 
-static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB, const char *name)
+static void emit_RR(struct spe_function *p, unsigned op, int rT,
+		    int rA, int rB, const char *name)
 {
     union spe_inst_RR inst;
     inst.inst.op = op;
@@ -230,8 +230,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
-                     unsigned rA, unsigned rB, unsigned rC, const char *name)
+static void emit_RRR(struct spe_function *p, unsigned op, int rT,
+                     int rA, int rB, int rC, const char *name)
 {
     union spe_inst_RRR inst;
     inst.inst.op = op;
@@ -248,8 +248,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm, const char *name)
+static void emit_RI7(struct spe_function *p, unsigned op, int rT,
+		     int rA, int imm, const char *name)
 {
     union spe_inst_RI7 inst;
     inst.inst.op = op;
@@ -266,8 +266,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
 
 
-static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm, const char *name)
+static void emit_RI8(struct spe_function *p, unsigned op, int rT,
+		     int rA, int imm, const char *name)
 {
     union spe_inst_RI8 inst;
     inst.inst.op = op;
@@ -284,8 +284,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
 
 
-static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
-		      unsigned rA, int imm, const char *name)
+static void emit_RI10(struct spe_function *p, unsigned op, int rT,
+		      int rA, int imm, const char *name)
 {
     union spe_inst_RI10 inst;
     inst.inst.op = op;
@@ -302,8 +302,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
 
 
 /** As above, but do range checking on signed immediate value */
-static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
-                       unsigned rA, int imm, const char *name)
+static void emit_RI10s(struct spe_function *p, unsigned op, int rT,
+                       int rA, int imm, const char *name)
 {
     assert(imm <= 511);
     assert(imm >= -512);
@@ -311,7 +311,7 @@ static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
+static void emit_RI16(struct spe_function *p, unsigned op, int rT,
 		      int imm, const char *name)
 {
     union spe_inst_RI16 inst;
@@ -326,7 +326,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
+static void emit_RI18(struct spe_function *p, unsigned op, int rT,
 		      int imm, const char *name)
 {
     union spe_inst_RI18 inst;
@@ -348,61 +348,61 @@ void _name (struct spe_function *p) \
 }
 
 #define EMIT_(_name, _op) \
-void _name (struct spe_function *p, unsigned rT) \
+void _name (struct spe_function *p, int rT) \
 { \
    emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
 }
 
 #define EMIT_R(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA) \
+void _name (struct spe_function *p, int rT, int rA) \
 { \
    emit_RR(p, _op, rT, rA, 0, __FUNCTION__);                 \
 }
 
 #define EMIT_RR(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
+void _name (struct spe_function *p, int rT, int rA, int rB) \
 { \
    emit_RR(p, _op, rT, rA, rB, __FUNCTION__);                \
 }
 
 #define EMIT_RRR(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
+void _name (struct spe_function *p, int rT, int rA, int rB, int rC) \
 { \
    emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__);           \
 }
 
 #define EMIT_RI7(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+void _name (struct spe_function *p, int rT, int rA, int imm) \
 { \
    emit_RI7(p, _op, rT, rA, imm, __FUNCTION__);              \
 }
 
 #define EMIT_RI8(_name, _op, bias) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+void _name (struct spe_function *p, int rT, int rA, int imm) \
 { \
    emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__);       \
 }
 
 #define EMIT_RI10(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+void _name (struct spe_function *p, int rT, int rA, int imm) \
 { \
    emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI10s(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+void _name (struct spe_function *p, int rT, int rA, int imm) \
 { \
    emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI16(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, int imm) \
+void _name (struct spe_function *p, int rT, int imm) \
 { \
    emit_RI16(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_RI18(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, int imm) \
+void _name (struct spe_function *p, int rT, int imm) \
 { \
    emit_RI18(p, _op, rT, imm, __FUNCTION__);                 \
 }
@@ -424,7 +424,7 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
-    unsigned int i;
+    uint i;
 
     if (!code_size)
        code_size = 64;
@@ -503,6 +503,7 @@ int spe_allocate_register(struct spe_function *p, int reg)
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
+   assert(reg >= 0);
    assert(reg < SPE_NUM_REGS);
    assert(p->regs[reg] == 1);
 
@@ -517,7 +518,7 @@ void spe_release_register(struct spe_function *p, int reg)
  */
 void spe_allocate_register_set(struct spe_function *p)
 {
-   unsigned int i;
+   uint i;
 
    /* Keep track of the set count.  If it ever wraps around to 0, 
     * we're in trouble.
@@ -538,7 +539,7 @@ void spe_allocate_register_set(struct spe_function *p)
 
 void spe_release_register_set(struct spe_function *p)
 {
-   unsigned int i;
+   uint i;
 
    /* If the set count drops below zero, we're in trouble. */
    assert(p->set_count > 0);
@@ -599,7 +600,7 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
  * Load quad word.
  * NOTE: offset is in bytes and the least significant 4 bits must be zero!
  */
-void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+void spe_lqd(struct spe_function *p, int rT, int rA, int offset)
 {
    const boolean pSave = p->print;
 
@@ -624,7 +625,7 @@ void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
  * Store quad word.
  * NOTE: offset is in bytes and the least significant 4 bits must be zero!
  */
-void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+void spe_stqd(struct spe_function *p, int rT, int rA, int offset)
 {
    const boolean pSave = p->print;
 
@@ -653,51 +654,51 @@ void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
  */
 
 /** Branch Indirect to address in rA */
-void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
+void spe_bi(struct spe_function *p, int rA, int d, int e)
 {
    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Interupt Return */
-void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
+void spe_iret(struct spe_function *p, int rA, int d, int e)
 {
    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link on external data */
-void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
+void spe_bisled(struct spe_function *p, int rT, int rA, int d,
 		int e)
 {
    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link.  Save PC in rT, jump to rA. */
-void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
+void spe_bisl(struct spe_function *p, int rT, int rA, int d,
 		int e)
 {
    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
-void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+void spe_biz(struct spe_function *p, int rT, int rA, int d, int e)
 {
    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
-void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+void spe_binz(struct spe_function *p, int rT, int rA, int d, int e)
 {
    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
-void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+void spe_bihz(struct spe_function *p, int rT, int rA, int d, int e)
 {
    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
-void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+void spe_bihnz(struct spe_function *p, int rT, int rA, int d, int e)
 {
    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
@@ -733,7 +734,7 @@ EMIT_R   (spe_mtspr, 0x10c);
 
 
 void
-spe_load_float(struct spe_function *p, unsigned rT, float x)
+spe_load_float(struct spe_function *p, int rT, float x)
 {
    if (x == 0.0f) {
       spe_il(p, rT, 0x0);
@@ -760,7 +761,7 @@ spe_load_float(struct spe_function *p, unsigned rT, float x)
 
 
 void
-spe_load_int(struct spe_function *p, unsigned rT, int i)
+spe_load_int(struct spe_function *p, int rT, int i)
 {
    if (-32768 <= i && i <= 32767) {
       spe_il(p, rT, i);
@@ -772,7 +773,7 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
 }
 
-void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+void spe_load_uint(struct spe_function *p, int rT, uint ui)
 {
    /* If the whole value is in the lower 18 bits, use ila, which
     * doesn't sign-extend.  Otherwise, if the two halfwords of
@@ -793,7 +794,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
       ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
       ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
    ) {
-      unsigned int mask = 0;
+      uint mask = 0;
       /* fsmbi duplicates each bit in the given mask eight times,
        * using a 16-bit value to initialize a 16-byte quadword.
        * Each 4-bit nybble of the mask corresponds to a full word
@@ -822,7 +823,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
  * Changes to one should be made in the other.
  */
 void
-spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+spe_and_uint(struct spe_function *p, int rT, int rA, uint ui)
 {
    /* If we can, emit a single instruction, either And Byte Immediate
     * (which uses the same constant across each byte), And Halfword Immediate
@@ -832,7 +833,7 @@ spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
     *
     * Otherwise, we'll need to use a temporary register.
     */
-   unsigned int tmp;
+   uint tmp;
 
    /* If the upper 23 bits are all 0s or all 1s, sign extension
     * will work and we can use And Word Immediate
@@ -863,7 +864,7 @@ spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
    }
 
    /* Otherwise, we'll have to use a temporary register. */
-   unsigned int tmp_reg = spe_allocate_available_register(p);
+   int tmp_reg = spe_allocate_available_register(p);
    spe_load_uint(p, tmp_reg, ui);
    spe_and(p, rT, rA, tmp_reg);
    spe_release_register(p, tmp_reg);
@@ -875,7 +876,7 @@ spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
  * Changes to one should be made in the other.
  */
 void
-spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+spe_xor_uint(struct spe_function *p, int rT, int rA, uint ui)
 {
    /* If we can, emit a single instruction, either Exclusive Or Byte 
     * Immediate (which uses the same constant across each byte), Exclusive 
@@ -885,7 +886,7 @@ spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
     *
     * Otherwise, we'll need to use a temporary register.
     */
-   unsigned int tmp;
+   uint tmp;
 
    /* If the upper 23 bits are all 0s or all 1s, sign extension
     * will work and we can use Exclusive Or Word Immediate
@@ -916,14 +917,14 @@ spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
    }
 
    /* Otherwise, we'll have to use a temporary register. */
-   unsigned int tmp_reg = spe_allocate_available_register(p);
+   int tmp_reg = spe_allocate_available_register(p);
    spe_load_uint(p, tmp_reg, ui);
    spe_xor(p, rT, rA, tmp_reg);
    spe_release_register(p, tmp_reg);
 }
 
 void
-spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+spe_compare_equal_uint(struct spe_function *p, int rT, int rA, uint ui)
 {
    /* If the comparison value is 9 bits or less, it fits inside a
     * Compare Equal Word Immediate instruction.
@@ -933,7 +934,7 @@ spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigne
    }
    /* Otherwise, we're going to have to load a word first. */
    else {
-      unsigned int tmp_reg = spe_allocate_available_register(p);
+      int tmp_reg = spe_allocate_available_register(p);
       spe_load_uint(p, tmp_reg, ui);
       spe_ceq(p, rT, rA, tmp_reg);
       spe_release_register(p, tmp_reg);
@@ -941,7 +942,7 @@ spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigne
 }
 
 void
-spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+spe_compare_greater_uint(struct spe_function *p, int rT, int rA, uint ui)
 {
    /* If the comparison value is 10 bits or less, it fits inside a
     * Compare Logical Greater Than Word Immediate instruction.
@@ -951,7 +952,7 @@ spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsig
    }
    /* Otherwise, we're going to have to load a word first. */
    else {
-      unsigned int tmp_reg = spe_allocate_available_register(p);
+      int tmp_reg = spe_allocate_available_register(p);
       spe_load_uint(p, tmp_reg, ui);
       spe_clgt(p, rT, rA, tmp_reg);
       spe_release_register(p, tmp_reg);
@@ -959,10 +960,10 @@ spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsig
 }
 
 void
-spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
+spe_splat(struct spe_function *p, int rT, int rA)
 {
    /* Use a temporary, just in case rT == rA */
-   unsigned int tmp_reg = spe_allocate_available_register(p);
+   int tmp_reg = spe_allocate_available_register(p);
    /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
    spe_ila(p, tmp_reg, 0x00010203);
    spe_shufb(p, rT, rA, rA, tmp_reg);
@@ -971,14 +972,14 @@ spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 
 
 void
-spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
+spe_complement(struct spe_function *p, int rT, int rA)
 {
    spe_nor(p, rT, rA, rA);
 }
 
 
 void
-spe_move(struct spe_function *p, unsigned rT, unsigned rA)
+spe_move(struct spe_function *p, int rT, int rA)
 {
    /* Use different instructions depending on the instruction address
     * to take advantage of the dual pipelines.
@@ -991,14 +992,14 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA)
 
 
 void
-spe_zero(struct spe_function *p, unsigned rT)
+spe_zero(struct spe_function *p, int rT)
 {
    spe_xor(p, rT, rT, rT);
 }
 
 
 void
-spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+spe_splat_word(struct spe_function *p, int rT, int rA, int word)
 {
    assert(word >= 0);
    assert(word <= 3);
@@ -1038,9 +1039,9 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
  * like "x = min(x, a)", we always allocate a new register to be safe.
  */
 void 
-spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+spe_float_min(struct spe_function *p, int rT, int rA, int rB)
 {
-   unsigned int compare_reg = spe_allocate_available_register(p);
+   int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
    spe_selb(p, rT, rA, rB, compare_reg);
    spe_release_register(p, compare_reg);
@@ -1055,9 +1056,9 @@ spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
  * so that the larger of the two is selected instead of the smaller.
  */
 void 
-spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+spe_float_max(struct spe_function *p, int rT, int rA, int rB)
 {
-   unsigned int compare_reg = spe_allocate_available_register(p);
+   int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
    spe_selb(p, rT, rB, rA, compare_reg);
    spe_release_register(p, compare_reg);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index f9ad2acacd..65d9c77415 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -79,9 +79,9 @@ struct spe_function
 };
 
 
-extern void spe_init_func(struct spe_function *p, unsigned code_size);
+extern void spe_init_func(struct spe_function *p, uint code_size);
 extern void spe_release_func(struct spe_function *p);
-extern unsigned spe_code_size(const struct spe_function *p);
+extern uint spe_code_size(const struct spe_function *p);
 
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
@@ -89,8 +89,7 @@ extern void spe_release_register(struct spe_function *p, int reg);
 extern void spe_allocate_register_set(struct spe_function *p);
 extern void spe_release_register_set(struct spe_function *p);
 
-extern unsigned
-spe_get_registers_used(const struct spe_function *p, ubyte used[]);
+extern uint spe_get_registers_used(const struct spe_function *p, ubyte used[]);
 
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
@@ -103,31 +102,25 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT(_name, _op) \
     extern void _name (struct spe_function *p);
 #define EMIT_(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT);
+    extern void _name (struct spe_function *p, int rT);
 #define EMIT_R(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA);
+    extern void _name (struct spe_function *p, int rT, int rA);
 #define EMIT_RR(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       unsigned rB);
+    extern void _name (struct spe_function *p, int rT, int rA, int rB);
 #define EMIT_RRR(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       unsigned rB, unsigned rC);
+    extern void _name (struct spe_function *p, int rT, int rA, int rB, int rC);
 #define EMIT_RI7(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       int imm);
+    extern void _name (struct spe_function *p, int rT, int rA, int imm);
 #define EMIT_RI8(_name, _op, bias) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       int imm);
+    extern void _name (struct spe_function *p, int rT, int rA, int imm);
 #define EMIT_RI10(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       int imm);
+    extern void _name (struct spe_function *p, int rT, int rA, int imm);
 #define EMIT_RI10s(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       int imm);
+    extern void _name (struct spe_function *p, int rT, int rA, int imm);
 #define EMIT_RI16(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm);
+    extern void _name (struct spe_function *p, int rT, int imm);
 #define EMIT_RI18(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm);
+    extern void _name (struct spe_function *p, int rT, int imm);
 #define EMIT_I16(_name, _op) \
     extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
@@ -301,82 +294,82 @@ EMIT_RI16(spe_brhz,      0x044)
 EMIT     (spe_lnop,      0x001)
 
 extern void
-spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+spe_lqd(struct spe_function *p, int rT, int rA, int offset);
 
 extern void
-spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+spe_stqd(struct spe_function *p, int rT, int rA, int offset);
 
-extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
-extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
-extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_bi(struct spe_function *p, int rA, int d, int e);
+extern void spe_iret(struct spe_function *p, int rA, int d, int e);
+extern void spe_bisled(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_bisl(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_biz(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_biz(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_binz(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_binz(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_bihz(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_bihnz(struct spe_function *p, int rT, int rA,
     int d, int e);
 
 
 /** Load/splat immediate float into rT. */
 extern void
-spe_load_float(struct spe_function *p, unsigned rT, float x);
+spe_load_float(struct spe_function *p, int rT, float x);
 
 /** Load/splat immediate int into rT. */
 extern void
-spe_load_int(struct spe_function *p, unsigned rT, int i);
+spe_load_int(struct spe_function *p, int rT, int i);
 
 /** Load/splat immediate unsigned int into rT. */
 extern void
-spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+spe_load_uint(struct spe_function *p, int rT, uint ui);
 
 /** And immediate value into rT. */
 extern void
-spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+spe_and_uint(struct spe_function *p, int rT, int rA, uint ui);
 
 /** Xor immediate value into rT. */
 extern void
-spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+spe_xor_uint(struct spe_function *p, int rT, int rA, uint ui);
 
 /** Compare equal with immediate value. */
 extern void
-spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+spe_compare_equal_uint(struct spe_function *p, int rT, int rA, uint ui);
 
 /** Compare greater with immediate value. */
 extern void
-spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+spe_compare_greater_uint(struct spe_function *p, int rT, int rA, uint ui);
 
 /** Replicate word 0 of rA across rT. */
 extern void
-spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
+spe_splat(struct spe_function *p, int rT, int rA);
 
 /** rT = complement_all_bits(rA). */
 extern void
-spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
+spe_complement(struct spe_function *p, int rT, int rA);
 
 /** rT = rA. */
 extern void
-spe_move(struct spe_function *p, unsigned rT, unsigned rA);
+spe_move(struct spe_function *p, int rT, int rA);
 
 /** rT = {0,0,0,0}. */
 extern void
-spe_zero(struct spe_function *p, unsigned rT);
+spe_zero(struct spe_function *p, int rT);
 
 /** rT = splat(rA, word) */
 extern void
-spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+spe_splat_word(struct spe_function *p, int rT, int rA, int word);
 
 /** rT = float min(rA, rB) */
 extern void
-spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+spe_float_min(struct spe_function *p, int rT, int rA, int rB);
 
 /** rT = float max(rA, rB) */
 extern void
-spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+spe_float_max(struct spe_function *p, int rT, int rA, int rB);
 
 
 /* Floating-point instructions
-- 
cgit v1.2.3