From 13f46fa1b9c3009395a0d7f30ebef127f5937451 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:44:24 +0100
Subject: draw: don't assume output buffer pointer is aligned (cherry picked
 from commit 23cc303994eb630c56b1224dfdac51dcea41ed03)

---
 src/gallium/auxiliary/draw/draw_vs_aos_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index dd79bc799a..39f75b50b7 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -338,7 +338,7 @@ static void emit_store_R32G32B32A32( struct aos_compilation *cp,
 				     struct x86_reg dst_ptr,
 				     struct x86_reg dataXMM )
 {
-   sse_movaps(cp->func, dst_ptr, dataXMM);
+   sse_movups(cp->func, dst_ptr, dataXMM);
 }
 
 static void emit_store_R32G32B32( struct aos_compilation *cp, 
-- 
cgit v1.2.3


From 05a8f203cdea768466e5faf1dec4155e1e945c78 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 11:56:57 -0600
Subject: gallium: fix the test in vs_exec_prepare() to avoid redundant
 bindings

Fixes regressions seen in progs/samples/prim.c, progs/demos/ray.c
---
 src/gallium/auxiliary/draw/draw_vs_exec.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 79a19d6be2..13d4fcfdbf 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -46,7 +46,6 @@
 struct exec_vertex_shader {
    struct draw_vertex_shader base;
    struct tgsi_exec_machine *machine;
-   const struct tgsi_token *machine_tokens;
 };
 
 static struct exec_vertex_shader *exec_vertex_shader( struct draw_vertex_shader *vs )
@@ -66,12 +65,11 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
    /* Specify the vertex program to interpret/execute.
     * Avoid rebinding when possible.
     */
-   if (evs->machine_tokens != shader->state.tokens) {
+   if (evs->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(evs->machine,
                                     shader->state.tokens,
                                     PIPE_MAX_SAMPLERS,
                                     NULL /*samplers*/ );
-      evs->machine_tokens = shader->state.tokens;
    }
 }
 
-- 
cgit v1.2.3


From e0c6653a5fda956119239ef921daf1e3b950dfc8 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 10:35:38 -0600
Subject: cell: implement many more PPC instructions for code gen

---
 src/gallium/auxiliary/rtasm/Makefile    |   1 +
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 603 ++++++++++++++++++++++++++++++--
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 141 +++++++-
 3 files changed, 704 insertions(+), 41 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index 39b8a4dbd7..252dc5274a 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -7,6 +7,7 @@ C_SOURCES = \
 	rtasm_cpu.c \
 	rtasm_execmem.c \
 	rtasm_x86sse.c \
+	rtasm_ppc.c \
 	rtasm_ppc_spe.c
 
 include ../../Makefile.template
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 534a23568d..4a94ed0460 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -23,10 +23,19 @@
 
 /**
  * PPC code generation.
+ * For reference, see http://www.power.org/resources/reading/PowerISA_V2.05.pdf
+ * ABI info: http://www.cs.utsa.edu/~whaley/teach/cs6463FHPO/LEC/lec12_ho.pdf
+ *
+ * Other PPC refs:
+ * http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF778525699600719DF2
+ * http://www.ibm.com/developerworks/eserver/library/es-archguide-v2.html
+ * http://www.freescale.com/files/product/doc/MPCFPE32B.pdf
+ *
  * \author Brian Paul
  */
 
 
+#include <stdio.h>
 #include "util/u_memory.h"
 #include "pipe/p_debug.h"
 #include "rtasm_ppc.h"
@@ -35,30 +44,125 @@
 void
 ppc_init_func(struct ppc_function *p, unsigned max_inst)
 {
-    p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
-    p->num_inst = 0;
-    p->max_inst = max_inst;
-    p->vec_used = ~0;
+   uint i;
+
+   p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
+   p->num_inst = 0;
+   p->max_inst = max_inst;
+   p->fp_used = ~0x0;
+   p->vec_used = ~0x0;
+
+   /* only allow using gp registers 7..12 for now */
+   p->reg_used = 0x0;
+   for (i = 7; i < 13; i++)
+      p->reg_used |= (1 << i);
 }
 
 
 void
 ppc_release_func(struct ppc_function *p)
 {
-    assert(p->num_inst <= p->max_inst);
-    if (p->store != NULL) {
-        align_free(p->store);
-    }
-    p->store = NULL;
+   assert(p->num_inst <= p->max_inst);
+   if (p->store != NULL) {
+      align_free(p->store);
+   }
+   p->store = NULL;
+}
+
+
+void (*ppc_get_func(struct ppc_function *p))(void)
+{
+#if 0
+   DUMP_END();
+   if (DISASSEM && p->store)
+      debug_printf("disassemble %p %p\n", p->store, p->csr);
+
+   if (p->store == p->error_overflow)
+      return (void (*)(void)) NULL;
+   else
+#endif
+      return (void (*)(void)) p->store;
+}
+
+
+void
+ppc_dump_func(const struct ppc_function *p)
+{
+   uint i;
+   for (i = 0; i < p->num_inst; i++) {
+      debug_printf("%3u: 0x%08x\n", i, p->store[i]);
+   }
+}
+
+
+/**
+ * Allocate a general purpose register (a set bit in reg_used means "free").
+ * \return lowest free register index, or -1 if none left.
+ */
+int
+ppc_allocate_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_REGS; i++) {
+      const uint32_t mask = 1u << i;   /* unsigned shift: defined for i == 31 */
+      if ((p->reg_used & mask) != 0) {
+         p->reg_used &= ~mask;         /* clear the bit: register now in use */
+         return i;
+      }
+   }
+   return -1;
 }
 
 
 /**
- * Alloate a vector register.
+ * Mark the given general purpose register as "unallocated" (sets its bit).
+ */
+void
+ppc_release_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_REGS);   /* negative shift count is UB */
+   assert((p->reg_used & (1u << reg)) == 0); /* must currently be allocated */
+   p->reg_used |= (1u << reg);
+}
+
+
+/**
+ * Allocate a floating point register (a set bit in fp_used means "free").
  * \return register index or -1 if none left.
  */
 int
-ppc_allocate_vec_register(struct ppc_function *p, int reg)
+ppc_allocate_fp_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_FP_REGS; i++) {
+      const uint32_t mask = 1u << i;   /* unsigned shift: defined for i == 31 */
+      if ((p->fp_used & mask) != 0) {
+         p->fp_used &= ~mask;          /* clear the bit: register now in use */
+         return i;
+      }
+   }
+   return -1;
+}
+
+
+/**
+ * Mark the given floating point register as "unallocated" (sets its bit).
+ */
+void
+ppc_release_fp_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_FP_REGS); /* negative shift count is UB */
+   assert((p->fp_used & (1u << reg)) == 0);   /* must currently be allocated */
+   p->fp_used |= (1u << reg);
+}
+
+
+/**
+ * Allocate a vector register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p)
 {
    unsigned i;
    for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
@@ -68,7 +172,6 @@ ppc_allocate_vec_register(struct ppc_function *p, int reg)
          return i;
       }
    }
-
    return -1;
 }
 
@@ -81,7 +184,6 @@ ppc_release_vec_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_VEC_REGS);
    assert((p->vec_used & (1 << reg)) == 0);
-
    p->vec_used |= (1 << reg);
 }
 
@@ -98,6 +200,20 @@ union vx_inst {
    } inst;
 };
 
+static inline void
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vx_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+
 union vxr_inst {
    uint32_t bits;
    struct {
@@ -110,6 +226,21 @@ union vxr_inst {
    } inst;
 };
 
+static inline void
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vxr_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.rC = 0;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+
 union va_inst {
    uint32_t bits;
    struct {
@@ -122,49 +253,204 @@ union va_inst {
    } inst;
 };
 
-
 static inline void
-emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
 {
-   union vx_inst inst;
+   union va_inst inst;
    inst.inst.op = 4;
    inst.inst.vD = vD;
    inst.inst.vA = vA;
    inst.inst.vB = vB;
+   inst.inst.vC = vC;
    inst.inst.op2 = op2;
    p->store[p->num_inst++] = inst.bits;
    assert(p->num_inst <= p->max_inst);
 };
 
-static inline void
-emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+
+union i_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned li:24;
+      unsigned aa:1;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void
+emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
 {
-   union vxr_inst inst;
-   inst.inst.op = 4;
-   inst.inst.vD = vD;
-   inst.inst.vA = vA;
-   inst.inst.vB = vB;
-   inst.inst.rC = 0;
+   union i_inst inst;
+   inst.inst.op = op;
+   inst.inst.li = li;
+   inst.inst.aa = aa;
+   inst.inst.lk = lk;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+}
+
+
+union xl_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned bo:5;
+      unsigned bi:5;
+      unsigned unused:3;
+      unsigned bh:2;
+      unsigned op2:10;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void
+emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
+        uint op2, uint lk)
+{
+   union xl_inst inst;
+   inst.inst.op = op;
+   inst.inst.bo = bo;
+   inst.inst.bi = bi;
+   inst.inst.unused = 0x0;
+   inst.inst.bh = bh;
    inst.inst.op2 = op2;
+   inst.inst.lk = lk;
    p->store[p->num_inst++] = inst.bits;
    assert(p->num_inst <= p->max_inst);
+}
+
+static INLINE void
+dump_xl(const char *name, uint inst)
+{
+   union xl_inst i;
+
+   i.bits = inst;
+   debug_printf("%s = 0x%08x\n", name, inst);
+   debug_printf(" op: %d 0x%x\n", i.inst.op, i.inst.op);
+   debug_printf(" bo: %d 0x%x\n", i.inst.bo, i.inst.bo);
+   debug_printf(" bi: %d 0x%x\n", i.inst.bi, i.inst.bi);
+   debug_printf(" unused: %d 0x%x\n", i.inst.unused, i.inst.unused);
+   debug_printf(" bh: %d 0x%x\n", i.inst.bh, i.inst.bh);
+   debug_printf(" op2: %d 0x%x\n", i.inst.op2, i.inst.op2);
+   debug_printf(" lk: %d 0x%x\n", i.inst.lk, i.inst.lk);
+}
+
+
+union x_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vrs:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned op2:10;
+      unsigned unused:1;
+   } inst;
 };
 
-static inline void
-emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+static INLINE void
+emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
 {
-   union va_inst inst;
-   inst.inst.op = 4;
-   inst.inst.vD = vD;
-   inst.inst.vA = vA;
-   inst.inst.vB = vB;
-   inst.inst.vC = vC;
+   union x_inst inst;
+   inst.inst.op = op;
+   inst.inst.vrs = vrs;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
    inst.inst.op2 = op2;
+   inst.inst.unused = 0x0;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+}
+
+
+union d_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned si:16;
+   } inst;
+};
+
+static inline void
+emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
+{
+   union d_inst inst;
+   assert(si >= -32768);
+   assert(si <= 32767);
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.si = (unsigned) (si & 0xffff);
    p->store[p->num_inst++] = inst.bits;
    assert(p->num_inst <= p->max_inst);
 };
 
 
+union a_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned frt:5;
+      unsigned fra:5;
+      unsigned frb:5;
+      unsigned unused:5;
+      unsigned op2:5;
+      unsigned rc:1;
+   } inst;
+};
+
+static inline void
+emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
+       uint rc)
+{
+   union a_inst inst;             /* fields are packed into one 32-bit word */
+   inst.inst.op = op;
+   inst.inst.frt = frt;
+   inst.inst.fra = fra;
+   inst.inst.frb = frb;
+   inst.inst.unused = 0x0;
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   p->store[p->num_inst++] = inst.bits;   /* append to instruction buffer */
+   assert(p->num_inst <= p->max_inst);
+}
+
+
+union xo_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned oe:1;
+      unsigned op2:9;
+      unsigned rc:1;
+   } inst;
+};
+
+static INLINE void
+emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
+        uint op2, uint rc)
+{
+   union xo_inst inst;
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
+   inst.inst.oe = oe;
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+}
+
+
+
+
 
 /**
  ** float vector arithmetic
@@ -172,7 +458,7 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
 
 /** vector float add */
 void
-ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB)
+ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
    emit_vx(p, 10, vD, vA, vB);
 }
@@ -198,11 +484,11 @@ ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
    emit_vx(p, 1034, vD, vA, vB);
 }
 
-/** vector float mult add */
+/** vector float mult add: vD = vA * vB + vC */
 void
 ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
 {
-   emit_va(p, 46, vD, vA, vB, vC);
+   emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
 }
 
 /** vector float compare greater than */
@@ -282,13 +568,26 @@ ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
    emit_vx(p, 586, vD, 0, vB);
 }
 
+/** vector store: store vR at mem[vA+vB] */
+void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 231);
+}
+
+/** vector load: vR = mem[vA+vB] */
+void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 103);
+}
+
 
 
 /**
- ** bitwise operations
+ ** vector bitwise operations
  **/
 
-
 /** vector and */
 void
 ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
@@ -324,6 +623,14 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
    emit_vx(p, 1220, vD, vA, vB);
 }
 
+/** Pseudo-instruction: vector move */
+void
+ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
+{
+   ppc_vor(p, vD, vA, vA);
+}
+
+
 
 /**
  ** Vector shuffle / select / splat / etc
@@ -363,3 +670,225 @@ ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
 {
    emit_vx(p, 652, vD, imm, vB);
 }
+
+/** vector splat signed immediate word (imm must fit in 5 signed bits) */
+void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm)
+{
+   assert(imm >= -16);
+   assert(imm <= 15);   /* 5-bit two's complement range is -16..15 */
+   emit_vx(p, 908, vD, imm, 0);
+}
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 388, vD, vA, vB);
+}
+
+
+
+
+/**
+ ** integer arithmetic
+ **/
+
+/** rt = ra + imm */
+void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 14, rt, ra, imm);
+}
+
+/** rt = ra + (imm << 16) */
+void
+ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 15, rt, ra, imm);
+}
+
+/** rt = ra + rb */
+void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_xo(p, 31, rt, ra, rb, 0, 266, 0);
+}
+
+/** rt = ra AND rb */
+void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 28);  /* note argument order */
+}
+
+/** rt = ra AND imm */
+void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 28, ra, rt, imm);  /* note argument order */
+}
+
+/** rt = ra OR rb */
+void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 444);  /* note argument order */
+}
+
+/** rt = ra OR imm */
+void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 24, ra, rt, imm);  /* note argument order */
+}
+
+/** rt = ra XOR rb */
+void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 316);  /* note argument order */
+}
+
+/** rt = ra XOR imm */
+void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 26, ra, rt, imm);  /* note argument order */
+}
+
+/** pseudo instruction: move: rt = ra */
+void
+ppc_mr(struct ppc_function *p, uint rt, uint ra)
+{
+   ppc_or(p, rt, ra, ra);
+}
+
+/** pseudo instruction: load immediate: rt = imm */
+void
+ppc_li(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addi(p, rt, 0, imm);
+}
+
+/** rt = imm << 16 */
+void
+ppc_lis(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addis(p, rt, 0, imm);
+}
+
+/** rt = imm (32-bit constant built from two 16-bit halves: lis, then ori) */
+void
+ppc_load_int(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_lis(p, rt, (imm >> 16));          /* rt = upper half of imm << 16 */
+   ppc_ori(p, rt, rt, ((imm & 0xffff) ^ 0x8000) - 0x8000); /* same low 16 bits, wrapped into emit_d()'s asserted -32768..32767 range */
+}
+
+
+
+
+/**
+ ** integer load/store
+ **/
+
+/** store rs at memory[(ra)+d],
+ * then update ra = (ra)+d
+ */
+void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 37, rs, ra, d);
+}
+
+/** store rs at memory[(ra)+d] */
+void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 36, rs, ra, d);
+}
+
+/** Load rt = mem[(ra)+d];  then set high 32 bits to zero. */
+void
+ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d)
+{
+   emit_d(p, 32, rt, ra, d);
+}
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+/** add: frt = fra + frb */
+void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 21, 0);
+}
+
+/** sub: frt = fra - frb */
+void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 20, 0);
+}
+
+/** convert to int: rt = (int) fra */
+void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint fra)
+{
+   emit_x(p, 63, rt, 0, fra, 15);
+}
+
+/** store frs at mem[(ra)+offset] */
+void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset)
+{
+   emit_d(p, 52, frs, ra, offset);
+}
+
+/** store frs at mem[(ra)+(rb)] */
+void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb)
+{
+   emit_x(p, 31, frs, ra, rb, 983);
+}
+
+/** load frt = mem[(ra)+offset] */
+void
+ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset)
+{
+   emit_d(p, 48, frt, ra, offset);
+}
+
+
+
+
+
+/**
+ ** branch instructions
+ **/
+
+/** BLR: Branch to link register, i.e. an unconditional bclr (p. 36) */
+void
+ppc_blr(struct ppc_function *p)
+{
+   emit_xl(p, 19, BRANCH_COND_ALWAYS, 0, BRANCH_HINT_SUB_RETURN, 16, 0);
+}
+
+/** Branch Conditional to Link Register (p. 36) */
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg)
+{
+   emit_xl(p, 19, condOp, condReg, branchHint, 16, 0);
+}
+
+/** Pseudo instruction: return from subroutine */
+void
+ppc_return(struct ppc_function *p)
+{
+   ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0);
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index ed14e943df..6370b60494 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -36,27 +36,46 @@
 
 #define PPC_INST_SIZE 4  /**< 4 bytes / instruction */
 
+#define PPC_NUM_REGS 32
+#define PPC_NUM_FP_REGS 32
 #define PPC_NUM_VEC_REGS 32
 
+/** Stack pointer register */
+#define PPC_REG_SP 1
+
+/** Branch conditions */
+#define BRANCH_COND_ALWAYS       0x14  /* binary 1z1zz (z=ignored) */
+
+/** Branch hints */
+#define BRANCH_HINT_SUB_RETURN   0x0   /* binary 00 */
+
 
 struct ppc_function
 {
    uint32_t *store;  /**< instruction buffer */
    uint num_inst;
    uint max_inst;
-   uint32_t vec_used;   /** used/free vector registers bitmask */
    uint32_t reg_used;   /** used/free general-purpose registers bitmask */
+   uint32_t fp_used;   /** used/free floating point registers bitmask */
+   uint32_t vec_used;   /** used/free vector registers bitmask */
 };
 
 
 extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
 extern void ppc_release_func(struct ppc_function *p);
-
-extern int ppc_allocate_vec_register(struct ppc_function *p, int reg);
+extern void (*ppc_get_func( struct ppc_function *p ))( void );
+extern void ppc_dump_func(const struct ppc_function *p);
+
+extern int ppc_allocate_register(struct ppc_function *p);
+extern void ppc_release_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_fp_register(struct ppc_function *p);
+extern void ppc_release_fp_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_vec_register(struct ppc_function *p);
 extern void ppc_release_vec_register(struct ppc_function *p, int reg);
 
 
+
 /**
  ** float vector arithmetic
  **/
@@ -126,9 +145,18 @@ extern void
 ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
 
 
+/** vector store: store vR at mem[vA+vB] */
+extern void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+/** vector load: vR = mem[vA+vB] */
+extern void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+
 
 /**
- ** bitwise operations
+ ** vector bitwise operations
  **/
 
 
@@ -152,6 +180,10 @@ ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
 extern void
 ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 
+/** Pseudo-instruction: vector move */
+extern void
+ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
+
 
 /**
  ** Vector shuffle / select / splat / etc
@@ -177,5 +209,106 @@ ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
 extern void
 ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
 
+/** vector splat signed immediate word */
+extern void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm);
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+extern void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+
+/**
+ ** scalar arithmetic
+ **/
+
+extern void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_mr(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_li(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_lis(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_load_int(struct ppc_function *p, uint rt, int imm);
+
+
+
+/**
+ ** scalar load/store
+ **/
+
+extern void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_lwz(struct ppc_function *p, uint rs, uint ra, int d);
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+extern void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset);
+
+extern void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb);
+
+
+
+/**
+ ** branch instructions
+ **/
+
+extern void
+ppc_blr(struct ppc_function *p);
+
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg);
+
+extern void
+ppc_return(struct ppc_function *p);
+
 
 #endif /* RTASM_PPC_H */
-- 
cgit v1.2.3


From 049f57f86a2cb8ff08fba819c581a034ca7ea52c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:06:39 -0600
Subject: gallium: added ppc_lvewx()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 7 +++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 4 ++++
 2 files changed, 11 insertions(+)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 4a94ed0460..aaec2d2191 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -582,6 +582,13 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
    emit_x(p, 31, vR, vA, vB, 103);
 }
 
+/** load vector element word: vR = mem_word[vA+vB] */
+void
+ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 71);
+}
+
 
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 6370b60494..53d5746dc8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -153,6 +153,10 @@ ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
+/** load vector element word: vR = mem_word[vA+vB] */
+extern void
+ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
 
 
 /**
-- 
cgit v1.2.3


From 70f4ad44985e3ec6dabc1b0e55a5bf85803a4cd4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:07:35 -0600
Subject: gallium: TGSI to PPC code generation

Based on the TGSI to SSE2 code generator.
Incomplete and lots of SSE stuff still hanging around but the basic dozen
or so TGSI opcodes are functioning.
---
 src/gallium/auxiliary/tgsi/Makefile   |    1 +
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 2781 +++++++++++++++++++++++++++++++++
 src/gallium/auxiliary/tgsi/tgsi_ppc.h |   48 +
 3 files changed, 2830 insertions(+)
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_ppc.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_ppc.h

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
index c7155a9316..d7df9490cf 100644
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -11,6 +11,7 @@ C_SOURCES = \
 	tgsi_info.c \
 	tgsi_iterate.c \
 	tgsi_parse.c \
+	tgsi_ppc.c \
 	tgsi_scan.c \
 	tgsi_sse2.c \
 	tgsi_text.c \
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
new file mode 100644
index 0000000000..112e736523
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -0,0 +1,2781 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI to PowerPC code generation.
+ */
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_debug.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_sse.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi_exec.h"
+#include "tgsi_ppc.h"
+#include "rtasm/rtasm_ppc.h"
+
+
+/* for 1/sqrt()
+ *
+ * This costs about 100fps (close to 10%) in gears:
+ */
+#define HIGH_PRECISION 1
+
+#define FAST_MATH 1
+
+
+#define FOR_EACH_CHANNEL( CHAN )\
+   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
+
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
+#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C
+
+#define TEMP_R0   TGSI_EXEC_TEMP_R0
+#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+
+
+/**
+ * Context/state used during code gen.
+ */
+struct gen_context
+{
+   struct ppc_function *f;
+   int inputs_reg;    /**< register pointing to input params */
+   int outputs_reg;   /**< register pointing to output params */
+   int temps_reg;     /**< register pointing to temporary "registers" */
+   int immed_reg;     /**< register pointing to immediates buffer */
+   int const_reg;     /**< register pointing to constants buffer */
+};
+
+
+
+#if 0000
+
+/**
+ * X86 utility functions.
+ */
+
+static struct x86_reg
+make_xmm(
+   unsigned xmm )
+{
+   return x86_make_reg(
+      file_XMM,
+      (enum x86_reg_name) xmm );
+}
+
+/**
+ * X86 register mapping helpers.
+ */
+
+static struct x86_reg
+get_const_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_CX );
+}
+
+static struct x86_reg
+get_input_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_AX );
+}
+
+static struct x86_reg
+get_output_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DX );
+}
+
+static struct x86_reg
+get_temp_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_BX );
+}
+
+static struct x86_reg
+get_coef_base( void )
+{
+   return get_output_base();
+}
+
+static struct x86_reg
+get_immediate_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DI );
+}
+
+
+/**
+ * Data access helpers.
+ */
+
+
+static struct x86_reg
+get_immediate(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_immediate_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
+get_const(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_const_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
+get_input(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_input_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_output(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_output_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_temp(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_temp_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_coef(
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   return x86_make_disp(
+      get_coef_base(),
+      ((vec * 3 + member) * 4 + chan) * 4 );
+}
+
+
+static void
+emit_ret(
+   struct x86_function  *func )
+{
+   x86_ret( func );
+}
+
+#endif
+
+/**
+ * Data fetch helpers.
+ */
+
+#if 00
+/**
+ * Copy a shader constant to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src const buffer index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_const(
+   struct x86_function *func,
+   uint xmm,
+   int vec,
+   uint chan,
+   uint indirect,
+   uint indirectFile,
+   int indirectIndex )
+{
+   if (indirect) {
+      struct x86_reg r0 = get_input_base();
+      struct x86_reg r1 = get_output_base();
+      uint i;
+
+      assert( indirectFile == TGSI_FILE_ADDRESS );
+      assert( indirectIndex == 0 );
+
+      x86_push( func, r0 );
+      x86_push( func, r1 );
+
+      for (i = 0; i < QUAD_SIZE; i++) {
+         x86_lea( func, r0, get_const( vec, chan ) );
+         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
+
+         /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
+          */
+         x86_add( func, r1, r1 );
+         x86_add( func, r1, r1 );
+         x86_add( func, r1, r1 );
+         x86_add( func, r1, r1 );
+
+         x86_add( func, r0, r1 );
+         x86_mov( func, r1, x86_deref( r0 ) );
+         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
+      }
+
+      x86_pop( func, r1 );
+      x86_pop( func, r0 );
+
+      sse_movaps(
+         func,
+         make_xmm( xmm ),
+         get_temp( TEMP_R0, CHAN_X ) );
+   }
+   else {
+      assert( vec >= 0 );
+
+      sse_movss(
+         func,
+         make_xmm( xmm ),
+         get_const( vec, chan ) );
+      sse_shufps(
+         func,
+         make_xmm( xmm ),
+         make_xmm( xmm ),
+         SHUF( 0, 0, 0, 0 ) );
+   }
+}
+
+static void
+emit_immediate(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_immediate( vec, chan ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+
+/**
+ * Copy a shader input to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src input attrib
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_inputf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      make_xmm( xmm ),
+      get_input( vec, chan ) );
+}
+
+/**
+ * Store an xmm register to a shader output
+ * \param xmm  the source xmm register
+ * \param vec  the dest output attrib
+ * \param chan  dest channel to store (X, Y, Z or W)
+ */
+static void
+emit_output(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      get_output( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+/**
+ * Copy a shader temporary to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src temp register
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_tempf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movaps(
+      func,
+      make_xmm( xmm ),
+      get_temp( vec, chan ) );
+}
+
+/**
+ * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
+ * \param xmm  the destination xmm register
+ * \param vec  the src input/attribute coefficient index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ * \param member  0=a0, 1=dadx, 2=dady
+ */
+static void
+emit_coef(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_coef( vec, chan, member ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+/**
+ * Data store helpers.
+ */
+
+static void
+emit_inputs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      get_input( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_temps(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movaps(
+      func,
+      get_temp( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_addrs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   assert( vec == 0 );
+
+   emit_temps(
+      func,
+      xmm,
+      vec + TGSI_EXEC_TEMP_ADDR,
+      chan );
+}
+
+/**
+ * Coefficient fetch helpers.
+ */
+
+static void
+emit_coef_a0(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      0 );
+}
+
+static void
+emit_coef_dadx(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      1 );
+}
+
+static void
+emit_coef_dady(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      2 );
+}
+#endif
+
+
+/**
+ * Function call helpers.
+ */
+
+#if 00
+/**
+ * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be 
+ * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
+ * that the stack pointer is 16 byte aligned, as expected.
+ */
+static void
+emit_func_call_dst(
+   struct x86_function *func,
+   unsigned xmm_save,
+   unsigned xmm_dst,
+   void (PIPE_CDECL *code)() )
+{
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   unsigned i, n, xmm;
+   unsigned xmm_mask;
+   
+   /* Bitmask of the xmm registers to save */
+   xmm_mask = (1 << xmm_save) - 1;
+   xmm_mask &= ~(1 << xmm_dst);
+
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 0 ),
+      make_xmm( xmm_dst ) );
+
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_AX) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_CX) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_DX) );
+   
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i))
+         ++n;
+   
+   x86_sub_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+            make_xmm( xmm ) );
+         ++n;
+      }
+   
+   x86_lea(
+      func,
+      ecx,
+      get_temp( TEMP_R0, 0 ) );
+   
+   x86_push( func, ecx );
+   x86_mov_reg_imm( func, ecx, (unsigned long) code );
+   x86_call( func, ecx );
+   x86_pop(func, ecx );
+   
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            make_xmm( xmm ),
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+         ++n;
+      }
+   
+   x86_add_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+
+   /* Restore GP registers in a reverse order.
+    */
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_DX) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_CX) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_AX) );
+
+   sse_movaps(
+      func,
+      make_xmm( xmm_dst ),
+      get_temp( TEMP_R0, 0 ) );
+}
+
+static void
+emit_func_call_dst_src(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst,
+   unsigned xmm_src,
+   void (PIPE_CDECL *code)() )
+{
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 1 ),
+      make_xmm( xmm_src ) );
+
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      code );
+}
+
+/*
+ * Fast SSE2 implementation of special math functions.
+ */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+exp2f4(__m128 x)
+{
+   __m128i ipart;
+   __m128 fpart, expipart, expfpart;
+
+   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+   /* ipart = int(x - 0.5) */
+   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+   /* fpart = x - ipart */
+   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+   /* expipart = (float) (1 << ipart) */
+   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+   return _mm_mul_ps(expipart, expfpart);
+}
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+log2f4(__m128 x)
+{
+   __m128i expmask = _mm_set1_epi32(0x7f800000);
+   __m128i mantmask = _mm_set1_epi32(0x007fffff);
+   __m128 one = _mm_set1_ps(1.0f);
+
+   __m128i i = _mm_castps_si128(x);
+
+   /* exp = (float) exponent(x) */
+   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+   /* mant = (float) mantissa(x) */
+   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+   __m128 logmant;
+
+   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
+    * These coefficients can be generated with
+    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+    */
+#if LOG_POLY_DEGREE == 6
+   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+   return _mm_add_ps(logmant, exp);
+}
+
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+   return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+
+/**
+ * Low-level instruction translators.
+ */
+
+static void
+emit_abs(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse_andps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_7FFFFFFF_I,
+         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
+}
+
+static void
+emit_add(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   sse_addps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void PIPE_CDECL
+cos4f(
+   float *store )
+{
+   store[0] = cosf( store[0] );
+   store[1] = cosf( store[1] );
+   store[2] = cosf( store[2] );
+   store[3] = cosf( store[3] );
+}
+
+static void
+emit_cos(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save, 
+      xmm_dst,
+      cos4f );
+}
+
+static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
+ex24f(
+   float *store )
+{
+   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+}
+
+static void
+emit_ex2(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      ex24f );
+}
+
+static void
+emit_f2it(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse2_cvttps2dq(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ) );
+}
+
+static void PIPE_CDECL
+flr4f(
+   float *store )
+{
+   store[0] = floorf( store[0] );
+   store[1] = floorf( store[1] );
+   store[2] = floorf( store[2] );
+   store[3] = floorf( store[3] );
+}
+
+static void
+emit_flr(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      flr4f );
+}
+
+static void PIPE_CDECL
+frc4f(
+   float *store )
+{
+   store[0] -= floorf( store[0] );
+   store[1] -= floorf( store[1] );
+   store[2] -= floorf( store[2] );
+   store[3] -= floorf( store[3] );
+}
+
+static void
+emit_frc(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      frc4f );
+}
+
+static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
+lg24f(
+   float *store )
+{
+   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+}
+
+static void
+emit_lg2(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      lg24f );
+}
+
+static void
+emit_MOV(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   sse_movups(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_mul (struct x86_function *func,
+          unsigned xmm_dst,
+          unsigned xmm_src)
+{
+   sse_mulps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_neg(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse_xorps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_80000000_I,
+         TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
+pow4f(
+   float *store )
+{
+#if 1
+   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
+#else
+   store[0] = powf( store[0], store[4] );
+   store[1] = powf( store[1], store[5] );
+   store[2] = powf( store[2], store[6] );
+   store[3] = powf( store[3], store[7] );
+#endif
+}
+
+static void
+emit_pow(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_func_call_dst_src(
+      func,
+      xmm_save,
+      xmm_dst,
+      xmm_src,
+      pow4f );
+}
+
+static void
+emit_rcp (
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.  Need to either emit a proper divide or use the
+    * iterative technique described below in emit_rsqrt().
+    */
+   sse2_rcpps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_rsqrt(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+#if HIGH_PRECISION
+   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
+    * implementations, it is possible to improve its precision at
+    * fairly low cost, using a newton/raphson step, as below:
+    * 
+    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+    *
+    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+    */
+   {
+      struct x86_reg dst = make_xmm( xmm_dst );
+      struct x86_reg src = make_xmm( xmm_src );
+      struct x86_reg tmp0 = make_xmm( 2 );
+      struct x86_reg tmp1 = make_xmm( 3 );
+
+      assert( xmm_dst != xmm_src );
+      assert( xmm_dst != 2 && xmm_dst != 3 );
+      assert( xmm_src != 2 && xmm_src != 3 );
+
+      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
+      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
+      sse_rsqrtps( func, tmp1, src  );
+      sse_mulps(   func, src,  tmp1 );
+      sse_mulps(   func, dst,  tmp1 );
+      sse_mulps(   func, src,  tmp1 );
+      sse_subps(   func, tmp0, src  );
+      sse_mulps(   func, dst,  tmp0 );
+   }
+#else
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.
+    */
+   sse_rsqrtps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+#endif
+}
+
+static void
+emit_setsign(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse_orps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_80000000_I,
+         TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+static void PIPE_CDECL
+sin4f(
+   float *store )
+{
+   store[0] = sinf( store[0] );
+   store[1] = sinf( store[1] );
+   store[2] = sinf( store[2] );
+   store[3] = sinf( store[3] );
+}
+
+static void
+emit_sin (struct x86_function *func,
+          unsigned xmm_save, 
+          unsigned xmm_dst)
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      sin4f );
+}
+
+static void
+emit_sub(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   sse_subps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+#endif
+
+
+/** Register fetch: emit code loading one channel of a TGSI source register.
+ * vec_reg receives the value for channel chan_index of reg (after swizzling).
+ * ZERO/ONE extended swizzles and sign modes emit no code yet (see #if 0). */
+static void
+emit_fetch(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_src_register *reg,
+           const unsigned chan_index)
+{
+   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+            ppc_li(gen->f, offset_reg, offset);
+            /* load one word; lvewx puts it in the slot picked by the address: slot 'swizzle', assuming a 16-byte aligned const buffer */
+            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
+            /* splat word[swizzle] (the only defined slot) across vector */
+            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+#if 0
+      emit_tempf(
+         func,
+         xmm,
+         TGSI_EXEC_TEMP_00000000_I,
+         TGSI_EXEC_TEMP_00000000_C );
+#endif
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+#if 0
+      emit_tempf(
+         func,
+         xmm,
+         TEMP_ONE_I,
+         TEMP_ONE_C );
+#endif
+      break;
+
+   default:
+      assert( 0 );
+   }
+
+#if 0
+   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      emit_abs( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      emit_setsign( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      emit_neg( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+#endif
+}
+
+#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
+   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+
+
+
+/**
+ * Register store.
+ */
+static void
+emit_store(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           unsigned chan_index)
+{
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+   case TGSI_FILE_TEMPORARY:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+#if 0
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+#endif
+   default:
+      assert( 0 );
+   }
+
+#if 0
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+#endif
+}
+
+
+#define STORE( GEN, INST, XMM, INDEX, CHAN )\
+   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+
+
+
+#if 000
+/**
+ * High-level instruction translators.
+ */
+
+static void
+emit_kil(
+   struct x86_function *func,
+   const struct tgsi_full_src_register *reg )
+{
+   unsigned uniquemask;
+   unsigned registers[4];
+   unsigned nextregister = 0;
+   unsigned firstchan = ~0;
+   unsigned chan_index;
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      unsigned swizzle;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle(
+         reg,
+         chan_index );
+
+      /* check if the component has not been already tested */
+      if( !(uniquemask & (1 << swizzle)) ) {
+         uniquemask |= 1 << swizzle;
+
+         /* allocate register */
+         registers[chan_index] = nextregister;
+         emit_fetch(
+            func,
+            nextregister,
+            reg,
+            chan_index );
+         nextregister++;
+
+         /* mark the first channel used */
+         if( firstchan == ~0 ) {
+            firstchan = chan_index;
+         }
+      }
+   }
+
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      if( uniquemask & (1 << chan_index) ) {
+         sse_cmpps(
+            func,
+            make_xmm( registers[chan_index] ),
+            get_temp(
+               TGSI_EXEC_TEMP_00000000_I,
+               TGSI_EXEC_TEMP_00000000_C ),
+            cc_LessThan );
+
+         if( chan_index == firstchan ) {
+            sse_pmovmskb(
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               make_xmm( registers[chan_index] ) );
+         }
+         else {
+            sse_pmovmskb(
+               func,
+               x86_make_reg( file_REG32, reg_DX ),
+               make_xmm( registers[chan_index] ) );
+            x86_or(
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               x86_make_reg( file_REG32, reg_DX ) );
+         }
+      }
+   }
+
+   x86_or(
+      func,
+      get_temp(
+         TGSI_EXEC_TEMP_KILMASK_I,
+         TGSI_EXEC_TEMP_KILMASK_C ),
+      x86_make_reg( file_REG32, reg_AX ) );
+
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+}
+
+
+static void
+emit_kilp(
+   struct x86_function *func )
+{
+   /* XXX todo / fix me */
+}
+
+
+static void
+emit_setcc(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst,
+   enum sse_cc cc )
+{
+   unsigned chan_index;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      FETCH( func, *inst, 0, 0, chan_index );
+      FETCH( func, *inst, 1, 1, chan_index );
+      sse_cmpps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 1 ),
+         cc );
+      sse_andps(
+         func,
+         make_xmm( 0 ),
+         get_temp(
+            TEMP_ONE_I,
+            TEMP_ONE_C ) );
+      STORE( func, *inst, 0, 0, chan_index );
+   }
+}
+/* For each enabled dst0 channel: select src1 where src0 compares LessThan the 00000000 temp, else src2, via mask/and/andn/or. */
+static void
+emit_cmp(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst )
+{
+   unsigned chan_index;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      FETCH( func, *inst, 0, 0, chan_index );   /* XMM[0] = SrcReg[0] channel */
+      FETCH( func, *inst, 1, 1, chan_index );   /* XMM[1] = SrcReg[1] channel */
+      FETCH( func, *inst, 2, 2, chan_index );   /* XMM[2] = SrcReg[2] channel */
+      sse_cmpps(   /* XMM[0] = mask of lanes where XMM[0] < the 00000000 temp (presumably 0.0) */
+         func,
+         make_xmm( 0 ),
+         get_temp(
+            TGSI_EXEC_TEMP_00000000_I,
+            TGSI_EXEC_TEMP_00000000_C ),
+         cc_LessThan );
+      sse_andps(   /* XMM[1] &= mask: keep src1 where the compare held */
+         func,
+         make_xmm( 1 ),
+         make_xmm( 0 ) );
+      sse_andnps(   /* XMM[0] = ~mask & XMM[2]: keep src2 elsewhere (presumably x86 ANDNPS semantics) */
+         func,
+         make_xmm( 0 ),
+         make_xmm( 2 ) );
+      sse_orps(   /* XMM[0] |= XMM[1]: merge the two selections */
+         func,
+         make_xmm( 0 ),
+         make_xmm( 1 ) );
+      STORE( func, *inst, 0, 0, chan_index );   /* dst0 channel = XMM[0] */
+   }
+}
+#endif
+
+/* Emit a per-channel unary op (MOV/ABS/FLOOR/FRAC/EXPBASE2/LOGBASE2): fetch src0 into v0, transform v0 in place, store it to dst0. */
+static void
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0]; must target the allocated register, not a literal 0 */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ABS:
+         /* turn off the most significant bit of each vector float word */
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
+            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
+            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_FLOOR:
+         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
+         break;
+      case TGSI_OPCODE_FRAC:
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
+            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_EXPBASE2:
+         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
+         break;
+      case TGSI_OPCODE_LOGBASE2:
+         /* XXX this may be broken! */
+         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
+         break;
+      case TGSI_OPCODE_MOV:
+         /* nothing */
+         break;
+      default:
+         assert(0);   /* emit_instruction() only routes the opcodes above here */
+      }
+      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+}
+
+/* Emit a per-channel binary op (ADD/SUB/MUL/MIN/MAX): v2 = op(src0, src1) for each enabled dst0 channel, then store v2. */
+static void
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         ppc_vaddfp(gen->f, v2, v0, v1);      /* v2 = v0 + v1 */
+         break;
+      case TGSI_OPCODE_SUB:
+         ppc_vsubfp(gen->f, v2, v0, v1);      /* v2 = v0 - v1 */
+         break;
+      case TGSI_OPCODE_MUL:
+         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
+         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 (v2 was just zeroed, so a plain multiply) */
+         break;
+      case TGSI_OPCODE_MIN:
+         ppc_vminfp(gen->f, v2, v0, v1);      /* v2 = min(v0, v1) */
+         break;
+      case TGSI_OPCODE_MAX:
+         ppc_vmaxfp(gen->f, v2, v0, v1);      /* v2 = max(v0, v1) */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+/* Emit DP3/DP4/DPH: accumulate src0*src1 over X, Y, Z (plus W for DP4, or plus src1.W for DPH) in v2, then store v2 to each enabled dst0 channel. */
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+
+   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
+
+   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
+   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
+   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
+   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
+      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+   }
+   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 (same result for every enabled channel) */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+/* Emit a per-channel three-operand op: MAD (src0 * src1 + src2) or LRP (src0 * (src1 - src2) + src2), storing v3 to dst0. */
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v3 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:
+         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_LRP:
+         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
+         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v3);
+}
+
+/* Dispatch one TGSI instruction to the matching emit_* helper; returns 1 if it was handled (or is END), 0 if the opcode is unsupported. */
+static int
+emit_instruction(struct gen_context *gen,
+                 struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_FLOOR:
+   case TGSI_OPCODE_FRAC:
+   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_LOGBASE2:
+      emit_unaryop(gen, inst);
+      break;
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      emit_binop(gen, inst);
+      break;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_LRP:
+      emit_triop(gen, inst);
+      break;
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+      emit_dotprod(gen, inst);
+      break;
+   case TGSI_OPCODE_END:
+      /* normal end */
+      return 1;
+   default:
+      return 0;   /* unsupported opcode */
+   }
+
+#if 0   /* disabled: the x86/SSE-style code below (it uses 'func' and sse_* helpers) is never compiled */
+   unsigned chan_index;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_f2it( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         emit_tempf(
+            func,
+            0,
+            TEMP_ONE_I,
+            TEMP_ONE_C);
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+            STORE( func, *inst, 0, 0, CHAN_X );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+            STORE( func, *inst, 0, 0, CHAN_W );
+         }
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            sse_maxps(
+               func,
+               make_xmm( 0 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            STORE( func, *inst, 0, 0, CHAN_Y );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+            /* XMM[1] = SrcReg[0].yyyy */
+            FETCH( func, *inst, 1, 0, CHAN_Y );
+            /* XMM[1] = max(XMM[1], 0) */
+            sse_maxps(
+               func,
+               make_xmm( 1 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            /* XMM[2] = SrcReg[0].wwww */
+            FETCH( func, *inst, 2, 0, CHAN_W );
+            /* XMM[2] = min(XMM[2], 128.0) */
+            sse_minps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_128_I,
+                  TGSI_EXEC_TEMP_128_C ) );
+            /* XMM[2] = max(XMM[2], -128.0) */
+            sse_maxps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_MINUS_128_I,
+                  TGSI_EXEC_TEMP_MINUS_128_C ) );
+            emit_pow( func, 3, 1, 2 );
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            sse_xorps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 2 ) );
+            sse_cmpps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 0 ),
+               cc_LessThanEqual );
+            sse_andps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 1 ) );
+            STORE( func, *inst, 2, 0, CHAN_Z );
+         }
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rcp( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rsqrt( func, 1, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 1, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            emit_MOV( func, 1, 0 );
+            emit_flr( func, 2, 1 );
+            /* dst.x = ex2(floor(src.x)) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
+               emit_MOV( func, 2, 1 );
+               emit_ex2( func, 3, 2 );
+               STORE( func, *inst, 2, 0, CHAN_X );
+            }
+            /* dst.y = src.x - floor(src.x) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+               emit_MOV( func, 2, 0 );
+               emit_sub( func, 2, 1 );
+               STORE( func, *inst, 2, 0, CHAN_Y );
+            }
+         }
+         /* dst.z = ex2(src.x) */
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            emit_ex2( func, 3, 0 );
+            STORE( func, *inst, 0, 0, CHAN_Z );
+         }
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_LOG:
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_abs( func, 0 );
+         emit_MOV( func, 1, 0 );
+         emit_lg2( func, 2, 1 );
+         /* dst.z = lg2(abs(src.x)) */
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            STORE( func, *inst, 1, 0, CHAN_Z );
+         }
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            emit_flr( func, 2, 1 );
+            /* dst.x = floor(lg2(abs(src.x))) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
+               STORE( func, *inst, 1, 0, CHAN_X );
+            }
+            /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+               emit_ex2( func, 2, 1 );
+               emit_rcp( func, 1, 1 );
+               emit_mul( func, 0, 1 );
+               STORE( func, *inst, 0, 0, CHAN_Y );
+            }
+         }
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_add( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP4:
+   /* TGSI_OPCODE_DOT4 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul(func, 1, 2 );
+      emit_add(func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_W );
+      FETCH( func, *inst, 2, 1, CHAN_W );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_tempf(
+            func,
+            0,
+            TEMP_ONE_I,
+            TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 1, 1, CHAN_Y );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         FETCH( func, *inst, 0, 0, CHAN_Z );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+         FETCH( func, *inst, 0, 1, CHAN_W );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         sse_minps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         sse_maxps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      emit_setcc( func, inst, cc_LessThan );
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      emit_setcc( func, inst, cc_NotLessThan );
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_sub( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_sub( func, 1, 2 );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CND0:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+   /* TGSI_OPCODE_DP2A */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_frc( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_flr( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+   /* TGSI_OPCODE_EX2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_ex2( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_lg2( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+   /* TGSI_OPCODE_POW */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_pow( func, 0, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+   /* TGSI_OPCODE_XPD */
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( func, *inst, 1, 1, CHAN_Z );
+         FETCH( func, *inst, 3, 0, CHAN_Z );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 4, 1, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_MOV( func, 2, 0 );
+         emit_mul( func, 2, 1 );
+         emit_MOV( func, 5, 3 );
+         emit_mul( func, 5, 4 );
+         emit_sub( func, 2, 5 );
+         STORE( func, *inst, 2, 0, CHAN_X );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 2, 1, CHAN_X );
+         FETCH( func, *inst, 5, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         emit_mul( func, 3, 2 );
+         emit_mul( func, 1, 5 );
+         emit_sub( func, 3, 1 );
+         STORE( func, *inst, 3, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         emit_mul( func, 5, 4 );
+         emit_mul( func, 0, 2 );
+         emit_sub( func, 5, 0 );
+         STORE( func, *inst, 5, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TEMP_ONE_I,
+	    TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ABS:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_abs( func, 0) ;
+
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RCC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 1, CHAN_W );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_cos( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DDY:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_KILP:
+      /* predicated kill */
+      emit_kilp( func );
+      return 0; /* XXX fix me */
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* conditional kill */
+      emit_kil( func, &inst->FullSrcRegisters[0] );
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SGT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_sin( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SNE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_STR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TEX:
+      if (0) {
+	 /* Disable dummy texture code: 
+	  */
+	 emit_tempf(
+	    func,
+	    0,
+	    TEMP_ONE_I,
+	    TEMP_ONE_C );
+	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+	    STORE( func, *inst, 0, 0, chan_index );
+	 }
+      }
+      else {
+	 return 0;
+      }
+      break;
+
+   case TGSI_OPCODE_TXD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_X2D:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CAL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RET:
+      emit_ret( func );
+      break;
+
+   case TGSI_OPCODE_END:
+      break;
+
+   case TGSI_OPCODE_SSG:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CMP:
+      emit_cmp (func, inst);
+      break;
+
+   case TGSI_OPCODE_SCS:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_cos( func, 0, 0 );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_sin( func, 0, 0 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TGSI_EXEC_TEMP_00000000_I,
+	    TGSI_EXEC_TEMP_00000000_C );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TEMP_ONE_I,
+	    TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_TXB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NRM:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DIV:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DP2:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRK:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_IF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_REP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PUSHA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_POPA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_I2F:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NOT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_AND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_OR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_MOD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_XOR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SAD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CONT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      return 0;
+      break;
+
+   default:
+      return 0;
+   }
+#endif
+   
+   return 1;   /* the opcode was handled by one of the helpers above */
+}
+/* Handle one TGSI declaration. Currently emits nothing: the only handling (for TGSI_FILE_INPUT) is compiled out with #if 0. */
+static void
+emit_declaration(
+   struct ppc_function *func,
+   struct tgsi_full_declaration *decl )
+{
+   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+#if 0   /* disabled: per-channel input setup for each declared register, selected by interpolation mode */
+      unsigned first, last, mask;
+      unsigned i, j;
+
+      first = decl->DeclarationRange.First;
+      last = decl->DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      for( i = first; i <= last; i++ ) {
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( mask & (1 << j) ) {   /* only channels present in the usage mask */
+               switch( decl->Declaration.Interpolate ) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  emit_coef_a0( func, 0, i, j );
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_LINEAR:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_coef_a0( func, 4, i, j );
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_PERSPECTIVE:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+                  emit_coef_a0( func, 5, i, j );
+                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               default:
+                  assert( 0 );
+		  break;
+               }
+            }
+         }
+      }
+#endif
+   }
+}
+/* Disabled (#if 0): emits an x86 loop that reads 16 bytes from each of four records spaced 'stride' apart, shuffles them, and writes 64 bytes to the SoA buffer per iteration. */
+#if 0
+static void aos_to_soa( struct x86_function *func, 
+                        uint arg_aos,
+                        uint arg_soa, 
+                        uint arg_num, 
+                        uint arg_stride )
+{
+   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
+   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
+   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
+   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
+   int inner_loop;
+
+
+   /* Save EBX */
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );   /* arg_* are argument indices passed to x86_fn_arg() */
+   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
+   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
+   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
+
+   /* do */
+   inner_loop = x86_get_label( func );
+   {
+      x86_push( func, aos_input );   /* remember the record base; it is stepped by stride below */
+      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_pop( func, aos_input );    /* back to the first record's base */
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );   /* copies, since each shufps below overwrites its first operand */
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
+      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
+      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
+      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
+
+      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );   /* unaligned stores of the four shuffled vectors */
+      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
+      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
+      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
+
+      /* Advance to next input */
+      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
+      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
+   }
+   /* while --num_inputs */
+   x86_dec( func, num_inputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, aos_input );
+}
+#endif
+
+#if 0
+static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+   struct x86_reg soa_output;
+   struct x86_reg aos_output;
+   struct x86_reg num_outputs;
+   struct x86_reg temp;
+   int inner_loop;
+
+   soa_output = x86_make_reg( file_REG32, reg_AX );
+   aos_output = x86_make_reg( file_REG32, reg_BX );
+   num_outputs = x86_make_reg( file_REG32, reg_CX );
+   temp = x86_make_reg( file_REG32, reg_DX );
+
+   /* Save EBX */
+   x86_push( func, aos_output );
+
+   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
+   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
+   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+
+   /* do */
+   inner_loop = x86_get_label( func );
+   {
+      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
+      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
+      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
+      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
+      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
+      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
+      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
+
+      x86_mov( func, temp, x86_fn_arg( func, stride ) );
+      x86_push( func, aos_output );
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_pop( func, aos_output );
+
+      /* Advance to next output */
+      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
+      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
+   }
+   /* while --num_outputs */
+   x86_dec( func, num_outputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, aos_output );
+}
+#endif
+
+
+static void
+emit_prologue(struct ppc_function *func)
+{
+   /* XXX set up stack frame */
+}
+
+
+static void
+emit_epilogue(struct ppc_function *func)
+{
+   ppc_return(func);
+   /* XXX restore prev stack frame -- NOTE(review): presumably that code must be emitted before ppc_return() */
+}
+
+
+
+/**
+ * Translate a TGSI vertex/fragment shader to PPC code.
+ * \param tokens  the TGSI input shader
+ * \param func  the output PPC code/function
+ * \param immediates  buffer to place immediates, later passed to PPC func
+ * \param do_swizzles  only referenced by code that is currently #if 0'd out
+ * \return TRUE for success, FALSE if translation failed
+ */
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *func,
+              float (*immediates)[4],
+              boolean do_swizzles )
+{
+   struct tgsi_parse_context parse;
+   /*boolean instruction_phase = FALSE;*/
+   unsigned ok = 1;
+   uint num_immediates = 0;
+   struct gen_context gen;
+
+   util_init_math();
+
+   tgsi_parse_init( &parse, tokens );
+
+   gen.f = func;
+   gen.inputs_reg = 3;   /* first function param */
+   gen.outputs_reg = 4;  /* second function param */
+   gen.temps_reg = 5;    /* ... */
+   gen.immed_reg = 6;
+   gen.const_reg = 7;
+
+   emit_prologue(func);
+
+   /*
+    * Different function args for vertex/fragment shaders:
+    */
+#if 0
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /* DECLARATION phase, do not load output argument. */
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      /* skipping outputs argument here */
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_coef_base(),
+         x86_fn_arg( func, 5 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 6 ) );
+   }
+   else {
+      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
+
+      if (do_swizzles)
+         aos_to_soa( func, 
+                     6,         /* aos_input */
+                     1,         /* machine->input */
+                     7,         /* num_inputs */
+                     8 );       /* input_stride */
+
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      x86_mov(
+         func,
+         get_output_base(),
+         x86_fn_arg( func, 2 ) );
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 5 ) );
+   }
+#endif
+
+   while (!tgsi_parse_end_of_tokens(&parse) && ok) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            emit_declaration(func, &parse.FullToken.FullDeclaration );
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+#if 0
+            if( !instruction_phase ) {
+               /* INSTRUCTION phase, overwrite coeff with output. */
+               instruction_phase = TRUE;
+               x86_mov(
+                  func,
+                  get_output_base(),
+                  x86_fn_arg( func, 2 ) );
+            }
+#endif
+         }
+
+         ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
+
+	 if (!ok) {
+	    debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
+	 }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* splat each immediate component into a float[4] vector for SoA */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            float *imm = (float *) immediates;
+            uint i;
+            assert(size <= 4);
+            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
+            for (i = 0; i < size; i++) {
+               const float value =
+                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+               imm[num_immediates * 4 + 0] = 
+               imm[num_immediates * 4 + 1] = 
+               imm[num_immediates * 4 + 2] = 
+               imm[num_immediates * 4 + 3] = value;
+               num_immediates++;
+            }
+         }
+         break;
+
+      default:
+	 ok = 0;
+         assert( 0 );
+      }
+   }
+
+#if 0
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
+      if (do_swizzles)
+         soa_to_aos( func, 9, 2, 10, 11 );
+   }
+#endif
+
+   emit_epilogue(func);
+
+   tgsi_parse_free( &parse );
+
+   return ok;
+}
+
+#endif /* PIPE_ARCH_PPC */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
new file mode 100644
index 0000000000..7cd2bf9aff
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
@@ -0,0 +1,48 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_PPC_H
+#define TGSI_PPC_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+struct tgsi_token;
+struct ppc_function;
+
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *function,
+              float (*immediates)[4],
+              boolean do_swizzles);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_PPC_H */
-- 
cgit v1.2.3


From b7da4c3dc199ee382bb9924ac86a3485deccc62d Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:08:45 -0600
Subject: gallium: PPC vertex shader support

Works, but dead code lingering, debug code present, etc.
---
 src/gallium/auxiliary/draw/Makefile      |   1 +
 src/gallium/auxiliary/draw/draw_vs.c     |   5 +-
 src/gallium/auxiliary/draw/draw_vs.h     |   4 +
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 270 +++++++++++++++++++++++++++++++
 4 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 src/gallium/auxiliary/draw/draw_vs_ppc.c

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
index f2e36a89e9..bdbf5a08ed 100644
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -40,6 +40,7 @@ C_SOURCES = \
 	draw_vs_aos_machine.c \
 	draw_vs_exec.c \
 	draw_vs_llvm.c \
+	draw_vs_ppc.c  \
 	draw_vs_sse.c 
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 34adbd49b0..7f305304ff 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -85,7 +85,10 @@ draw_create_vertex_shader(struct draw_context *draw,
    if (!vs) {
       vs = draw_create_vs_sse( draw, shader );
       if (!vs) {
-         vs = draw_create_vs_exec( draw, shader );
+         vs = draw_create_vs_ppc( draw, shader );
+         if (!vs) {
+            vs = draw_create_vs_exec( draw, shader );
+         }
       }
    }
 
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 68c24abad3..89ae158751 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -157,6 +157,10 @@ struct draw_vertex_shader *
 draw_create_vs_sse(struct draw_context *draw,
 		   const struct pipe_shader_state *templ);
 
+struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+		   const struct pipe_shader_state *templ);
+
 struct draw_vertex_shader *
 draw_create_vs_llvm(struct draw_context *draw,
 		    const struct pipe_shader_state *templ);
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
new file mode 100644
index 0000000000..a096ad49b8
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -0,0 +1,270 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_config.h"
+
+#include "draw_vs.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+#include "rtasm/rtasm_cpu.h"
+#include "rtasm/rtasm_ppc.h"
+#include "tgsi/tgsi_ppc.h"
+#include "tgsi/tgsi_parse.h"
+
+
+
+typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
+                                             float (*outputs)[4][4],
+                                             float (*temps)[4][4],
+                                             float (*immeds)[4][4],
+                                             float (*consts)[4]);
+
+#if 0
+   const struct tgsi_exec_vector *input,
+   struct tgsi_exec_vector *output,
+   float (*constant)[4],        /* 3 */
+   struct tgsi_exec_vector *temporary, /* 4 */
+   float (*immediates)[4],      /* 5 */
+   const float (*aos_input)[4], /* 6 */
+   uint num_inputs,             /* 7 */
+   uint input_stride,           /* 8 */
+   float (*aos_output)[4],      /* 9 */
+   uint num_outputs,            /* 10 */
+   uint output_stride );        /* 11 */
+#endif
+
+struct draw_ppc_vertex_shader {
+   struct draw_vertex_shader base;
+   struct ppc_function ppc_program;
+
+   codegen_function func;
+   
+   struct tgsi_exec_machine *machine;
+};
+
+
+static void
+vs_ppc_prepare( struct draw_vertex_shader *base,
+		struct draw_context *draw )
+{
+}
+
+
+
+/* Simplified vertex shader interface for the pt paths.  Given the
+ * complexity of code-generating all the above operations together,
+ * it's time to try doing all the other stuff separately.
+ */
+static void
+vs_ppc_run_linear( struct draw_vertex_shader *base,
+		   const float (*input)[4],
+		   float (*output)[4],
+		   const float (*constants)[4],
+		   unsigned count,
+		   unsigned input_stride,
+		   unsigned output_stride )
+{
+   struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
+   struct tgsi_exec_machine *machine = shader->machine;
+   unsigned int i;
+
+#define MAX_VERTICES 4
+
+   /* loop over verts */
+   for (i = 0; i < count; i += MAX_VERTICES) {
+      const uint max_vertices = MIN2(MAX_VERTICES, count - i);
+      float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB;
+      float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB;
+      float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB;
+      uint attr;
+
+      /* convert (up to) four input verts to SoA format */
+      for (attr = 0; attr < base->info.num_inputs; attr++) {
+         const float *vIn = (const float *) input;
+         uint vert;
+         for (vert = 0; vert < max_vertices; vert++) {
+#if 0
+            if (attr==0)
+               printf("Input v%d a%d: %f %f %f %f\n",
+                      vert, attr, vIn[0], vIn[1], vIn[2], vIn[3]);
+#endif
+            inputs_soa[attr][0][vert] = vIn[attr * 4 + 0];
+            inputs_soa[attr][1][vert] = vIn[attr * 4 + 1];
+            inputs_soa[attr][2][vert] = vIn[attr * 4 + 2];
+            inputs_soa[attr][3][vert] = vIn[attr * 4 + 3];
+            vIn += input_stride / 4;
+         }
+      }
+
+      /* run compiled shader
+       */
+#if 0
+      shader->func(machine->Inputs,
+		   machine->Outputs,
+		   (float (*)[4])constants,
+		   machine->Temps,
+		   (float (*)[4])shader->base.immediates,
+                   input,
+                   base->info.num_inputs,
+                   input_stride,
+                   output,
+                   base->info.num_outputs,
+                   output_stride );
+#else
+      shader->func(inputs_soa, outputs_soa, temps_soa,
+		   (float (*)[4][4]) shader->base.immediates,
+		   (float (*)[4]) constants);
+
+      /*output[0][0] = input[0][0] * 0.5;*/
+#endif
+
+      /* convert (up to) four output verts from SoA back to AoS format */
+      for (attr = 0; attr < base->info.num_outputs; attr++) {
+         float *vOut = (float *) output;
+         uint vert;
+         for (vert = 0; vert < max_vertices; vert++) {
+            vOut[attr * 4 + 0] = outputs_soa[attr][0][vert];
+            vOut[attr * 4 + 1] = outputs_soa[attr][1][vert];
+            vOut[attr * 4 + 2] = outputs_soa[attr][2][vert];
+            vOut[attr * 4 + 3] = outputs_soa[attr][3][vert];
+#if 0
+            if (attr==0)
+               printf("Output v%d a%d: %f %f %f %f\n",
+                      vert, attr, vOut[0], vOut[1], vOut[2], vOut[3]);
+#endif
+            vOut += output_stride / 4;
+         }
+      }
+
+      /* advance to next group of four input/output verts */
+      input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
+      output = (float (*)[4])((char *)output + output_stride * max_vertices);
+   }
+}
+
+
+
+
+static void
+vs_ppc_delete( struct draw_vertex_shader *base )
+{
+   struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
+   
+   ppc_release_func( &shader->ppc_program );
+
+   align_free( (void *) shader->base.immediates );
+
+   FREE( (void*) shader->base.state.tokens );
+   FREE( shader );
+}
+
+
+struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+                          const struct pipe_shader_state *templ)
+{
+   struct draw_ppc_vertex_shader *vs;
+
+   vs = CALLOC_STRUCT( draw_ppc_vertex_shader );
+   if (vs == NULL) 
+      return NULL;
+
+   /* private copy of the tokens; released by vs_ppc_delete() or at fail: */
+   vs->base.state.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!vs->base.state.tokens)
+      goto fail;
+
+   tgsi_scan_shader(templ->tokens, &vs->base.info);
+
+   vs->base.draw = draw;
+#if 0
+   if (1)
+      vs->base.create_varient = draw_vs_varient_aos_ppc;
+   else
+#endif
+      vs->base.create_varient = draw_vs_varient_generic;
+   vs->base.prepare = vs_ppc_prepare;
+   vs->base.run_linear = vs_ppc_run_linear;
+   vs->base.delete = vs_ppc_delete;
+   
+   vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * 4 *
+                                      sizeof(float), 16);
+   if (!vs->base.immediates)   /* tgsi_emit_ppc() stores into this buffer */
+      goto fail;
+   vs->machine = &draw->vs.machine;
+   ppc_init_func( &vs->ppc_program, 1000 ); /* XXX fix limit */
+
+   if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
+			&vs->ppc_program, 
+                        (float (*)[4])vs->base.immediates, 
+                        TRUE )) 
+      goto fail;
+      
+   vs->func = (codegen_function) ppc_get_func( &vs->ppc_program );
+   if (!vs->func) {
+      goto fail;
+   }
+   return &vs->base;
+
+fail:
+   debug_error("tgsi_emit_ppc() failed, falling back to interpreter\n");
+   ppc_release_func( &vs->ppc_program );
+   if (vs->base.immediates)   /* don't leak the immediates buffer on failure */
+      align_free( (void *) vs->base.immediates );
+   FREE( (void *) vs->base.state.tokens );   /* from tgsi_dup_tokens() above */
+   FREE(vs);
+   return NULL;
+}
+
+
+
+#else /* PIPE_ARCH_PPC */
+
+
+struct draw_vertex_shader *
+draw_create_vs_ppc( struct draw_context *draw,
+		    const struct pipe_shader_state *templ )
+{
+   return (void *) 0;
+}
+
+
+#endif /* PIPE_ARCH_PPC */
-- 
cgit v1.2.3


From ba4faef7c07c47ad4f71f3e6ba94cb54217c56ed Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:13:31 -0600
Subject: gallium: temporarily disable PPC vertex shader until more things run

---
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index a096ad49b8..990a659f27 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -203,6 +203,9 @@ draw_create_vs_ppc(struct draw_context *draw,
 {
    struct draw_ppc_vertex_shader *vs;
 
+   /* XXX temporary short-circuit */
+   return NULL;
+
    vs = CALLOC_STRUCT( draw_ppc_vertex_shader );
    if (vs == NULL) 
       return NULL;
-- 
cgit v1.2.3


From ebdc399d83d6bd2f4e3594874483dbca5f9f5c0e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 13:57:56 -0600
Subject: gallium: fix-up confusing register allocation masks in rtasm_ppc.c

Plus, add ppc_reserve_register() func.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 56 ++++++++++++++++++++-------------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  1 +
 2 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index aaec2d2191..2d9f4e079e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -49,13 +49,15 @@ ppc_init_func(struct ppc_function *p, unsigned max_inst)
    p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
    p->num_inst = 0;
    p->max_inst = max_inst;
-   p->fp_used = ~0x0;
-   p->vec_used = ~0x0;
-
-   /* only allow using gp registers 7..12 for now */
    p->reg_used = 0x0;
-   for (i = 7; i < 13; i++)
-      p->reg_used |= (1 << i);
+   p->fp_used = 0x0;
+   p->vec_used = 0x0;
+
+   /* only allow using gp registers 3..11 for now (r12 and up are reserved below) */
+   for (i = 0; i < 3; i++)
+      ppc_reserve_register(p, i);
+   for (i = 12; i < PPC_NUM_REGS; i++)
+      ppc_reserve_register(p, i);
 }
 
 
@@ -95,6 +97,18 @@ ppc_dump_func(const struct ppc_function *p)
 }
 
 
+/**
+ * Mark GP register 'reg' as unavailable; returns 'reg' for use in assignments.
+ */
+int
+ppc_reserve_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_REGS);
+   p->reg_used |= (1u << reg);   /* unsigned shift: 1 << 31 overflows a signed int */
+   return reg;
+}
+
+
 /**
  * Allocate a general purpose register.
  * \return register index or -1 if none left.
@@ -105,8 +119,8 @@ ppc_allocate_register(struct ppc_function *p)
    unsigned i;
    for (i = 0; i < PPC_NUM_REGS; i++) {
       const uint64_t mask = 1 << i;
-      if ((p->reg_used & mask) != 0) {
-         p->reg_used &= ~mask;
+      if ((p->reg_used & mask) == 0) {
+         p->reg_used |= mask;
          return i;
       }
    }
@@ -121,8 +135,8 @@ void
 ppc_release_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_REGS);
-   assert((p->reg_used & (1 << reg)) == 0);
-   p->reg_used |= (1 << reg);
+   assert(p->reg_used & (1 << reg));
+   p->reg_used &= ~(1 << reg);
 }
 
 
@@ -136,8 +150,8 @@ ppc_allocate_fp_register(struct ppc_function *p)
    unsigned i;
    for (i = 0; i < PPC_NUM_FP_REGS; i++) {
       const uint64_t mask = 1 << i;
-      if ((p->fp_used & mask) != 0) {
-         p->fp_used &= ~mask;
+      if ((p->fp_used & mask) == 0) {
+         p->fp_used |= mask;
          return i;
       }
    }
@@ -152,8 +166,8 @@ void
 ppc_release_fp_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_FP_REGS);
-   assert((p->fp_used & (1 << reg)) == 0);
-   p->fp_used |= (1 << reg);
+   assert(p->fp_used & (1 << reg));
+   p->fp_used &= ~(1 << reg);
 }
 
 
@@ -167,8 +181,8 @@ ppc_allocate_vec_register(struct ppc_function *p)
    unsigned i;
    for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
       const uint64_t mask = 1 << i;
-      if ((p->vec_used & mask) != 0) {
-         p->vec_used &= ~mask;
+      if ((p->vec_used & mask) == 0) {
+         p->vec_used |= mask;
          return i;
       }
    }
@@ -183,8 +197,8 @@ void
 ppc_release_vec_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_VEC_REGS);
-   assert((p->vec_used & (1 << reg)) == 0);
-   p->vec_used |= (1 << reg);
+   assert(p->vec_used & (1 << reg));
+   p->vec_used &= ~(1 << reg);
 }
 
 
@@ -582,11 +596,11 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
    emit_x(p, 31, vR, vA, vB, 103);
 }
 
-/** load vector element word: vR = mem_word[vA+vB] */
+/** load vector element word: vR = mem_word[ra+rb] */
 void
-ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB)
+ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
 {
-   emit_x(p, 31, vR, vA, vB, 71);
+   emit_x(p, 31, vr, ra, rb, 71);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 53d5746dc8..85679b4886 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -67,6 +67,7 @@ extern void ppc_release_func(struct ppc_function *p);
 extern void (*ppc_get_func( struct ppc_function *p ))( void );
 extern void ppc_dump_func(const struct ppc_function *p);
 
+extern int ppc_reserve_register(struct ppc_function *p, int reg);
 extern int ppc_allocate_register(struct ppc_function *p);
 extern void ppc_release_register(struct ppc_function *p, int reg);
 extern int ppc_allocate_fp_register(struct ppc_function *p);
-- 
cgit v1.2.3


From da63edd720fc154820fcbf699e1056ac9357a03f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 13:59:11 -0600
Subject: gallium: fix broken TGSI_FILE_CONSTANT case, use
 ppc_reserve_register()

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 112e736523..dbf215c0d5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1108,10 +1108,15 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
             ppc_li(gen->f, offset_reg, offset);
-            /* load vector word */
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our constants start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
             ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
-            /* splat word[0] across vector */
-            ppc_vspltw(gen->f, vec_reg, vec_reg, 0);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -2635,11 +2640,11 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    tgsi_parse_init( &parse, tokens );
 
    gen.f = func;
-   gen.inputs_reg = 3;   /* first function param */
-   gen.outputs_reg = 4;  /* second function param */
-   gen.temps_reg = 5;    /* ... */
-   gen.immed_reg = 6;
-   gen.const_reg = 7;
+   gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
+   gen.outputs_reg = ppc_reserve_register(func, 4);  /* second function param */
+   gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
+   gen.immed_reg = ppc_reserve_register(func, 6);
+   gen.const_reg = ppc_reserve_register(func, 7);
 
    emit_prologue(func);
 
-- 
cgit v1.2.3


From b06d0720194dfecaf45dc97cbd178411aed5205f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 14:48:33 -0600
Subject: gallium: added ppc_vload_float(), for limited cases

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 18 ++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  4 ++++
 2 files changed, 22 insertions(+)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 2d9f4e079e..65df676eae 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -603,6 +603,24 @@ ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
    emit_x(p, 31, vr, ra, rb, 71);
 }
 
+/** vector load float: vr = splats(imm); only imm == 0.0 and imm == 1.0 are handled */
+void
+ppc_vload_float(struct ppc_function *p, uint vr, float imm)
+{
+   if (imm == 0.0f) {
+      ppc_vxor(p, vr, vr, vr);
+   }
+   else if (imm == 1.0f) {
+      /* use 2^0=1 to get 1.0 */
+      ppc_vxor(p, vr, vr, vr);  /* vr = {0,0,0,0} */
+      ppc_vexptefp(p, vr, vr);  /* vr = 2^0 = 1.0 in each element */
+   }
+   else {
+      assert(0);
+   }
+}
+
+
 
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 85679b4886..9f1e3fcd84 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -158,6 +158,10 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
+/** vector load float: vr = splats(imm) */
+extern void
+ppc_vload_float(struct ppc_function *p, uint vr, float imm);
+
 
 
 /**
-- 
cgit v1.2.3


From 51840065607337210fbba5ba1c01874293fbb42e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 14:48:58 -0600
Subject: gallium: TGSI->PPC inequality operators

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 70 +++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index dbf215c0d5..9bf364b8c4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1495,6 +1495,68 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
+/**
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
+ */
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v_one = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   boolean complement = FALSE;
+
+   /* v_one = splat(1.0) */
+   ppc_vload_float(gen->f, v_one, 1.0f);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SNE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SEQ:
+         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SGE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SLT:
+         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SLE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SGT:
+         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+         break;
+      default:
+         assert(0);
+      }
+
+      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
+
+      if (complement)
+         ppc_vandc(gen->f, v2, v_one, v2);    /* v2 = v_one & ~v2 */
+      else
+         ppc_vand(gen->f, v2, v_one, v2);     /* v2 = v_one & v2 */
+
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v_one);
+}
+
+
 static void
 emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
@@ -1588,6 +1650,14 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_MAX:
       emit_binop(gen, inst);
       break;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SGE:
+      emit_inequality(gen, inst);
+      break;
    case TGSI_OPCODE_MAD:
    case TGSI_OPCODE_LRP:
       emit_triop(gen, inst);
-- 
cgit v1.2.3


From c6ff870836e7c970f1030e9e0fbdd0cb5df40d29 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:21:22 -0600
Subject: cell: TGSI->PPC for RSQ, RCP and src register sign modes

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 162 ++++++++++++++++++++++++----------
 1 file changed, 116 insertions(+), 46 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 9bf364b8c4..3637772102 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -84,11 +84,14 @@
 struct gen_context
 {
    struct ppc_function *f;
-   int inputs_reg;    /**< register pointing to input params */
-   int outputs_reg;   /**< register pointing to output params */
-   int temps_reg;     /**< register pointing to temporary "registers" */
-   int immed_reg;     /**< register pointing to immediates buffer */
-   int const_reg;     /**< register pointing to constants buffer */
+   int inputs_reg;    /**< GP register pointing to input params */
+   int outputs_reg;   /**< GP register pointing to output params */
+   int temps_reg;     /**< GP register pointing to temporary "registers" */
+   int immed_reg;     /**< GP register pointing to immediates buffer */
+   int const_reg;     /**< GP register pointing to constants buffer */
+
+   int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
+   int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
 };
 
 
@@ -1059,6 +1062,35 @@ emit_sub(
 #endif
 
 
+/**
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ */
+static int
+gen_one_vec(struct gen_context *gen)
+{
+   if (gen->one_vec < 0) {
+      gen->one_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
+   }
+   return gen->one_vec;
+}
+
+/**
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
+ */
+static int
+gen_get_bit31_vec(struct gen_context *gen)
+{
+   if (gen->bit31_vec < 0) {
+      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
+      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
+   }
+   return gen->bit31_vec;
+}
+
+
+
 /**
  * Register fetch.
  */
@@ -1124,49 +1156,42 @@ emit_fetch(struct gen_context *gen,
          assert( 0 );
       }
       break;
-
    case TGSI_EXTSWIZZLE_ZERO:
-#if 0
-      emit_tempf(
-         func,
-         xmm,
-         TGSI_EXEC_TEMP_00000000_I,
-         TGSI_EXEC_TEMP_00000000_C );
-#endif
+      ppc_vload_float(gen->f, vec_reg, 0.0f);
       break;
-
    case TGSI_EXTSWIZZLE_ONE:
-#if 0
-      emit_tempf(
-         func,
-         xmm,
-         TEMP_ONE_I,
-         TEMP_ONE_C );
-#endif
+      {
+         int one_vec = gen_one_vec(gen);
+         ppc_vecmove(gen->f, vec_reg, one_vec);
+      }
       break;
-
    default:
       assert( 0 );
    }
 
-#if 0
-   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      emit_abs( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      emit_setsign( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      emit_neg( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
+   {
+      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+         int bit31_vec = gen_get_bit31_vec(gen);
+
+         switch (sign_op) {
+         case TGSI_UTIL_SIGN_CLEAR:
+            /* vec = vec & ~bit31 */
+            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_SET:
+            /* vec = vec | bit31 */
+            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_TOGGLE:
+            /* vec = vec ^ bit31 */
+            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         default:
+            assert(0);
+         }
+      }
    }
-#endif
 }
 
 #define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
@@ -1409,6 +1434,36 @@ emit_cmp(
 #endif
 
 
+static void
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+
+   FETCH(gen, *inst, v0, 0, CHAN_X);
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_RSQ:
+      /* v1 = 1.0 / sqrt(v0) */
+      ppc_vrsqrtefp(gen->f, v1, v0);
+      break;
+   case TGSI_OPCODE_RCP:
+      /* v1 = 1.0 / v0 */
+      ppc_vrefp(gen->f, v1, v0);
+      break;
+   default:
+      assert(0);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE(gen, *inst, v1, 0, chan_index);
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+}
+
+
 static void
 emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
@@ -1504,12 +1559,9 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
    int v0 = ppc_allocate_vec_register(gen->f);
    int v1 = ppc_allocate_vec_register(gen->f);
    int v2 = ppc_allocate_vec_register(gen->f);
-   int v_one = ppc_allocate_vec_register(gen->f);
    uint chan_index;
    boolean complement = FALSE;
-
-   /* v_one = splat(1.0) */
-   ppc_vload_float(gen->f, v_one, 1.0f);
+   int one_vec = gen_one_vec(gen);
 
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
       FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
@@ -1543,9 +1595,9 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
       /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
 
       if (complement)
-         ppc_vandc(gen->f, v2, v_one, v2);    /* v2 = v_one & ~v2 */
+         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
       else
-         ppc_vand(gen->f, v2, v_one, v2);     /* v2 = v_one & v2 */
+         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
 
       STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
    }
@@ -1553,7 +1605,6 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
    ppc_release_vec_register(gen->f, v0);
    ppc_release_vec_register(gen->f, v1);
    ppc_release_vec_register(gen->f, v2);
-   ppc_release_vec_register(gen->f, v_one);
 }
 
 
@@ -1630,6 +1681,14 @@ emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
+/*
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+}
+*/
+
+
 static int
 emit_instruction(struct gen_context *gen,
                  struct tgsi_full_instruction *inst)
@@ -1643,6 +1702,10 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_LOGBASE2:
       emit_unaryop(gen, inst);
       break;
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_RCP:
+      emit_scalar_unaryop(gen, inst);
+      break;
    case TGSI_OPCODE_ADD:
    case TGSI_OPCODE_SUB:
    case TGSI_OPCODE_MUL:
@@ -1667,6 +1730,11 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_DPH:
       emit_dotprod(gen, inst);
       break;
+      /*
+   case TGSI_OPCODE_LIT:
+      emit_lit(gen, inst);
+      break;
+      */
    case TGSI_OPCODE_END:
       /* normal end */
       return 1;
@@ -2715,6 +2783,8 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
    gen.immed_reg = ppc_reserve_register(func, 6);
    gen.const_reg = ppc_reserve_register(func, 7);
+   gen.one_vec = -1;
+   gen.bit31_vec = -1;
 
    emit_prologue(func);
 
-- 
cgit v1.2.3


From 7b1d08738f30d0fec2f07568b16e08c4fdddeeac Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:25:04 -0600
Subject: cell: turn on PPC assembly vertex transform

gears runs with it now (3x faster FPS than before).
---
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index 990a659f27..fcc9cbfec5 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -203,9 +203,6 @@ draw_create_vs_ppc(struct draw_context *draw,
 {
    struct draw_ppc_vertex_shader *vs;
 
-   /* XXX temporary short-circuit */
-   return NULL;
-
    vs = CALLOC_STRUCT( draw_ppc_vertex_shader );
    if (vs == NULL) 
       return NULL;
@@ -233,7 +230,7 @@ draw_create_vs_ppc(struct draw_context *draw,
 
    vs->machine = &draw->vs.machine;
    
-   ppc_init_func( &vs->ppc_program, 1000 ); /* XXX fix limit */
+   ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */
 
    if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
 			&vs->ppc_program, 
-- 
cgit v1.2.3


From 519c2dbed57b3c5e1717a62df5d5f8b908a1acd6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:30:00 -0600
Subject: gallium: remove SSE remnants from tgsi_ppc.c

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 2987 +++++----------------------------
 1 file changed, 417 insertions(+), 2570 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 3637772102..432ec7459b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -44,14 +44,6 @@
 #include "rtasm/rtasm_ppc.h"
 
 
-/* for 1/sqrt()
- *
- * This costs about 100fps (close to 10%) in gears:
- */
-#define HIGH_PRECISION 1
-
-#define FAST_MATH 1
-
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
@@ -95,2452 +87,515 @@ struct gen_context
 };
 
 
-
-#if 0000
-
 /**
- * X86 utility functions.
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
  */
-
-static struct x86_reg
-make_xmm(
-   unsigned xmm )
+static int
+gen_one_vec(struct gen_context *gen)
 {
-   return x86_make_reg(
-      file_XMM,
-      (enum x86_reg_name) xmm );
+   if (gen->one_vec < 0) {
+      gen->one_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
+   }
+   return gen->one_vec;
 }
 
 /**
- * X86 register mapping helpers.
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
  */
-
-static struct x86_reg
-get_const_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_CX );
-}
-
-static struct x86_reg
-get_input_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_AX );
-}
-
-static struct x86_reg
-get_output_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DX );
-}
-
-static struct x86_reg
-get_temp_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_BX );
-}
-
-static struct x86_reg
-get_coef_base( void )
+static int
+gen_get_bit31_vec(struct gen_context *gen)
 {
-   return get_output_base();
+   if (gen->bit31_vec < 0) {
+      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
+      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
+   }
+   return gen->bit31_vec;
 }
 
-static struct x86_reg
-get_immediate_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DI );
-}
 
 
 /**
- * Data access helpers.
+ * Register fetch.
  */
-
-
-static struct x86_reg
-get_immediate(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_immediate_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_const(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_const_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_input(
-   unsigned vec,
-   unsigned chan )
+static void
+emit_fetch(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_src_register *reg,
+           const unsigned chan_index)
 {
-   return x86_make_disp(
-      get_input_base(),
-      (vec * 4 + chan) * 16 );
-}
+   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
 
-static struct x86_reg
-get_output(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_output_base(),
-      (vec * 4 + chan) * 16 );
-}
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+            ppc_li(gen->f, offset_reg, offset);
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our constants start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
+            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      default:
+         assert( 0 );
+      }
+      break;
+   case TGSI_EXTSWIZZLE_ZERO:
+      ppc_vload_float(gen->f, vec_reg, 0.0f);
+      break;
+   case TGSI_EXTSWIZZLE_ONE:
+      {
+         int one_vec = gen_one_vec(gen);
+         ppc_vecmove(gen->f, vec_reg, one_vec);
+      }
+      break;
+   default:
+      assert( 0 );
+   }
 
-static struct x86_reg
-get_temp(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_temp_base(),
-      (vec * 4 + chan) * 16 );
-}
+   {
+      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+         int bit31_vec = gen_get_bit31_vec(gen);
 
-static struct x86_reg
-get_coef(
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   return x86_make_disp(
-      get_coef_base(),
-      ((vec * 3 + member) * 4 + chan) * 4 );
+         switch (sign_op) {
+         case TGSI_UTIL_SIGN_CLEAR:
+            /* vec = vec & ~bit31 */
+            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_SET:
+            /* vec = vec | bit31 */
+            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_TOGGLE:
+            /* vec = vec ^ bit31 */
+            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         default:
+            assert(0);
+         }
+      }
+   }
 }
 
+#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
+   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
 
-static void
-emit_ret(
-   struct x86_function  *func )
-{
-   x86_ret( func );
-}
-
-#endif
 
-/**
- * Data fetch helpers.
- */
 
-#if 00
 /**
- * Copy a shader constant to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src const buffer index
- * \param chan  src channel to fetch (X, Y, Z or W)
+ * Register store.
  */
 static void
-emit_const(
-   struct x86_function *func,
-   uint xmm,
-   int vec,
-   uint chan,
-   uint indirect,
-   uint indirectFile,
-   int indirectIndex )
+emit_store(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           unsigned chan_index)
 {
-   if (indirect) {
-      struct x86_reg r0 = get_input_base();
-      struct x86_reg r1 = get_output_base();
-      uint i;
-
-      assert( indirectFile == TGSI_FILE_ADDRESS );
-      assert( indirectIndex == 0 );
-
-      x86_push( func, r0 );
-      x86_push( func, r1 );
-
-      for (i = 0; i < QUAD_SIZE; i++) {
-         x86_lea( func, r0, get_const( vec, chan ) );
-         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
-
-         /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
-          */
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-
-         x86_add( func, r0, r1 );
-         x86_mov( func, r1, x86_deref( r0 ) );
-         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
       }
-
-      x86_pop( func, r1 );
-      x86_pop( func, r0 );
-
-      sse_movaps(
+      break;
+   case TGSI_FILE_TEMPORARY:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+#if 0
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
          func,
-         make_xmm( xmm ),
-         get_temp( TEMP_R0, CHAN_X ) );
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+#endif
+   default:
+      assert( 0 );
    }
-   else {
-      assert( vec >= 0 );
 
-      sse_movss(
-         func,
-         make_xmm( xmm ),
-         get_const( vec, chan ) );
-      sse_shufps(
-         func,
-         make_xmm( xmm ),
-         make_xmm( xmm ),
-         SHUF( 0, 0, 0, 0 ) );
+#if 0
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
    }
+#endif
 }
 
-static void
-emit_immediate(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_immediate( vec, chan ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
-}
 
+#define STORE( GEN, INST, XMM, INDEX, CHAN )\
+   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
 
-/**
- * Copy a shader input to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src input attrib
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_inputf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm ),
-      get_input( vec, chan ) );
-}
 
-/**
- * Store an xmm register to a shader output
- * \param xmm  the source xmm register
- * \param vec  the dest output attrib
- * \param chan  src dest channel to store (X, Y, Z or W)
- */
-static void
-emit_output(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_output( vec, chan ),
-      make_xmm( xmm ) );
-}
 
-/**
- * Copy a shader temporary to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src temp register
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
 static void
-emit_tempf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   sse_movaps(
-      func,
-      make_xmm( xmm ),
-      get_temp( vec, chan ) );
-}
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
 
-/**
- * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
- * \param xmm  the destination xmm register
- * \param vec  the src input/attribute coefficient index
- * \param chan  src channel to fetch (X, Y, Z or W)
- * \param member  0=a0, 1=dadx, 2=dady
- */
-static void
-emit_coef(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_coef( vec, chan, member ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
+   FETCH(gen, *inst, v0, 0, CHAN_X);
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_RSQ:
+      /* v1 = 1.0 / sqrt(v0) */
+      ppc_vrsqrtefp(gen->f, v1, v0);
+      break;
+   case TGSI_OPCODE_RCP:
+      /* v1 = 1.0 / v0 */
+      ppc_vrefp(gen->f, v1, v0);
+      break;
+   default:
+      assert(0);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE(gen, *inst, v1, 0, chan_index);
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
 }
 
-/**
- * Data store helpers.
- */
 
 static void
-emit_inputs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_input( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_temps(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movaps(
-      func,
-      get_temp( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_addrs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   assert( vec == 0 );
-
-   emit_temps(
-      func,
-      xmm,
-      vec + TGSI_EXEC_TEMP_ADDR,
-      chan );
-}
-
-/**
- * Coefficent fetch helpers.
- */
-
-static void
-emit_coef_a0(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      0 );
-}
-
-static void
-emit_coef_dadx(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      1 );
-}
-
-static void
-emit_coef_dady(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      2 );
-}
-#endif
-
-
-/**
- * Function call helpers.
- */
-
-#if 00
-/**
- * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be 
- * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
- * that the stack pointer is 16 byte aligned, as expected.
- */
-static void
-emit_func_call_dst(
-   struct x86_function *func,
-   unsigned xmm_save,
-   unsigned xmm_dst,
-   void (PIPE_CDECL *code)() )
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-   unsigned i, n, xmm;
-   unsigned xmm_mask;
-   
-   /* Bitmask of the xmm registers to save */
-   xmm_mask = (1 << xmm_save) - 1;
-   xmm_mask &= ~(1 << xmm_dst);
-
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
-
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-   
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i))
-         ++n;
-   
-   x86_sub_imm(
-      func, 
-      x86_make_reg( file_REG32, reg_SP ),
-      n*16);
-
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i)) {
-         sse_movups(
-            func,
-            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
-            make_xmm( xmm ) );
-         ++n;
-      }
-   
-   x86_lea(
-      func,
-      ecx,
-      get_temp( TEMP_R0, 0 ) );
-   
-   x86_push( func, ecx );
-   x86_mov_reg_imm( func, ecx, (unsigned long) code );
-   x86_call( func, ecx );
-   x86_pop(func, ecx );
-   
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i)) {
-         sse_movups(
-            func,
-            make_xmm( xmm ),
-            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
-         ++n;
+   int v0 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, 0, 0, chan_index);   /* v0 = srcreg[0] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ABS:
+         /* turn off the most significant bit of each vector float word */
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
+            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
+            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_FLOOR:
+         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
+         break;
+      case TGSI_OPCODE_FRAC:
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
+            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_EXPBASE2:
+         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
+         break;
+      case TGSI_OPCODE_LOGBASE2:
+         /* XXX this may be broken! */
+         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
+         break;
+      case TGSI_OPCODE_MOV:
+         /* nothing */
+         break;
+      default:
+         assert(0);
       }
-   
-   x86_add_imm(
-      func, 
-      x86_make_reg( file_REG32, reg_SP ),
-      n*16);
-
-   /* Restore GP registers in a reverse order.
-    */
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-
-   sse_movaps(
-      func,
-      make_xmm( xmm_dst ),
-      get_temp( TEMP_R0, 0 ) );
-}
-
-static void
-emit_func_call_dst_src(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst,
-   unsigned xmm_src,
-   void (PIPE_CDECL *code)() )
-{
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 1 ),
-      make_xmm( xmm_src ) );
-
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      code );
-}
-
-/*
- * Fast SSE2 implementation of special math functions.
- */
-
-#define POLY0(x, c0) _mm_set1_ps(c0)
-#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
-#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-
-#define EXP_POLY_DEGREE 3
-#define LOG_POLY_DEGREE 5
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128 
-exp2f4(__m128 x)
-{
-   __m128i ipart;
-   __m128 fpart, expipart, expfpart;
-
-   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
-   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
-
-   /* ipart = int(x - 0.5) */
-   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
-
-   /* fpart = x - ipart */
-   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
-
-   /* expipart = (float) (1 << ipart) */
-   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
-
-   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
-#if EXP_POLY_DEGREE == 5
-   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
-#elif EXP_POLY_DEGREE == 4
-   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
-#elif EXP_POLY_DEGREE == 3
-   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
-#elif EXP_POLY_DEGREE == 2
-   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
-#else
-#error
-#endif
-
-   return _mm_mul_ps(expipart, expfpart);
-}
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128 
-log2f4(__m128 x)
-{
-   __m128i expmask = _mm_set1_epi32(0x7f800000);
-   __m128i mantmask = _mm_set1_epi32(0x007fffff);
-   __m128 one = _mm_set1_ps(1.0f);
-
-   __m128i i = _mm_castps_si128(x);
-
-   /* exp = (float) exponent(x) */
-   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
-
-   /* mant = (float) mantissa(x) */
-   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
-
-   __m128 logmant;
-
-   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
-    * These coefficients can be generate with 
-    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
-    */
-#if LOG_POLY_DEGREE == 6
-   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
-#elif LOG_POLY_DEGREE == 5
-   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
-#elif LOG_POLY_DEGREE == 4
-   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
-#elif LOG_POLY_DEGREE == 3
-   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
-#else
-#error
-#endif
-
-   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
-
-   return _mm_add_ps(logmant, exp);
-}
-
-static INLINE __m128
-powf4(__m128 x, __m128 y)
-{
-   return exp2f4(_mm_mul_ps(log2f4(x), y));
-}
-
-
-/**
- * Low-level instruction translators.
- */
-
-static void
-emit_abs(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_andps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_7FFFFFFF_I,
-         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
-}
-
-static void
-emit_add(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_addps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void PIPE_CDECL
-cos4f(
-   float *store )
-{
-   store[0] = cosf( store[0] );
-   store[1] = cosf( store[1] );
-   store[2] = cosf( store[2] );
-   store[3] = cosf( store[3] );
-}
-
-static void
-emit_cos(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save, 
-      xmm_dst,
-      cos4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
-__attribute__((force_align_arg_pointer))
-#endif
-ex24f(
-   float *store )
-{
-   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
-}
-
-static void
-emit_ex2(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      ex24f );
-}
-
-static void
-emit_f2it(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse2_cvttps2dq(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ) );
-}
-
-static void PIPE_CDECL
-flr4f(
-   float *store )
-{
-   store[0] = floorf( store[0] );
-   store[1] = floorf( store[1] );
-   store[2] = floorf( store[2] );
-   store[3] = floorf( store[3] );
-}
-
-static void
-emit_flr(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      flr4f );
-}
-
-static void PIPE_CDECL
-frc4f(
-   float *store )
-{
-   store[0] -= floorf( store[0] );
-   store[1] -= floorf( store[1] );
-   store[2] -= floorf( store[2] );
-   store[3] -= floorf( store[3] );
-}
-
-static void
-emit_frc(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      frc4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
-__attribute__((force_align_arg_pointer))
-#endif
-lg24f(
-   float *store )
-{
-   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
-}
-
-static void
-emit_lg2(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      lg24f );
-}
-
-static void
-emit_MOV(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_mul (struct x86_function *func,
-          unsigned xmm_dst,
-          unsigned xmm_src)
-{
-   sse_mulps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_neg(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_xorps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
-__attribute__((force_align_arg_pointer))
-#endif
-pow4f(
-   float *store )
-{
-#if 1
-   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
-#else
-   store[0] = powf( store[0], store[4] );
-   store[1] = powf( store[1], store[5] );
-   store[2] = powf( store[2], store[6] );
-   store[3] = powf( store[3], store[7] );
-#endif
-}
-
-static void
-emit_pow(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   emit_func_call_dst_src(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_src,
-      pow4f );
-}
-
-static void
-emit_rcp (
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.  Need to either emit a proper divide or use the
-    * iterative technique described below in emit_rsqrt().
-    */
-   sse2_rcpps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_rsqrt(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-#if HIGH_PRECISION
-   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
-    * implementations, it is possible to improve its precision at
-    * fairly low cost, using a newton/raphson step, as below:
-    * 
-    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
-    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
-    *
-    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
-    */
-   {
-      struct x86_reg dst = make_xmm( xmm_dst );
-      struct x86_reg src = make_xmm( xmm_src );
-      struct x86_reg tmp0 = make_xmm( 2 );
-      struct x86_reg tmp1 = make_xmm( 3 );
-
-      assert( xmm_dst != xmm_src );
-      assert( xmm_dst != 2 && xmm_dst != 3 );
-      assert( xmm_src != 2 && xmm_src != 3 );
-
-      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
-      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
-      sse_rsqrtps( func, tmp1, src  );
-      sse_mulps(   func, src,  tmp1 );
-      sse_mulps(   func, dst,  tmp1 );
-      sse_mulps(   func, src,  tmp1 );
-      sse_subps(   func, tmp0, src  );
-      sse_mulps(   func, dst,  tmp0 );
+      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
    }
-#else
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.
-    */
-   sse_rsqrtps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-#endif
-}
-
-static void
-emit_setsign(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_orps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-sin4f(
-   float *store )
-{
-   store[0] = sinf( store[0] );
-   store[1] = sinf( store[1] );
-   store[2] = sinf( store[2] );
-   store[3] = sinf( store[3] );
+   ppc_release_vec_register(gen->f, v0);
 }
 
-static void
-emit_sin (struct x86_function *func,
-          unsigned xmm_save, 
-          unsigned xmm_dst)
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      sin4f );
-}
 
 static void
-emit_sub(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   sse_subps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         ppc_vaddfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_SUB:
+         ppc_vsubfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MUL:
+         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
+         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_MIN:
+         ppc_vminfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MAX:
+         ppc_vmaxfp(gen->f, v2, v0, v1);
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
 }
-#endif
 
 
 /**
- * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
  */
-static int
-gen_one_vec(struct gen_context *gen)
-{
-   if (gen->one_vec < 0) {
-      gen->one_vec = ppc_allocate_vec_register(gen->f);
-      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
-   }
-   return gen->one_vec;
-}
-
-/**
- * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
- */
-static int
-gen_get_bit31_vec(struct gen_context *gen)
-{
-   if (gen->bit31_vec < 0) {
-      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
-      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
-      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
-   }
-   return gen->bit31_vec;
-}
-
-
-
-/**
- * Register fetch.
- */
-static void
-emit_fetch(struct gen_context *gen,
-           unsigned vec_reg,
-           const struct tgsi_full_src_register *reg,
-           const unsigned chan_index)
-{
-   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
-
-   switch (swizzle) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
-      switch (reg->SrcRegister.File) {
-      case TGSI_FILE_INPUT:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      case TGSI_FILE_TEMPORARY:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      case TGSI_FILE_IMMEDIATE:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      case TGSI_FILE_CONSTANT:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
-            ppc_li(gen->f, offset_reg, offset);
-            /* Load 4-byte word into vector register.
-             * The vector slot depends on the effective address we load from.
-             * We know that our constants start at a 16-byte boundary so we
-             * know that 'swizzle' tells us which vector slot will have the
-             * loaded word.  The other vector slots will be undefined.
-             */
-            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
-            /* splat word[swizzle] across the vector reg */
-            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      default:
-         assert( 0 );
-      }
-      break;
-   case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vload_float(gen->f, vec_reg, 0.0f);
-      break;
-   case TGSI_EXTSWIZZLE_ONE:
-      {
-         int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, vec_reg, one_vec);
-      }
-      break;
-   default:
-      assert( 0 );
-   }
-
-   {
-      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
-      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
-         int bit31_vec = gen_get_bit31_vec(gen);
-
-         switch (sign_op) {
-         case TGSI_UTIL_SIGN_CLEAR:
-            /* vec = vec & ~bit31 */
-            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
-            break;
-         case TGSI_UTIL_SIGN_SET:
-            /* vec = vec | bit31 */
-            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
-            break;
-         case TGSI_UTIL_SIGN_TOGGLE:
-            /* vec = vec ^ bit31 */
-            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
-            break;
-         default:
-            assert(0);
-         }
-      }
-   }
-}
-
-#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
-   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
-
-
-
-/**
- * Register store.
- */
-static void
-emit_store(struct gen_context *gen,
-           unsigned vec_reg,
-           const struct tgsi_full_dst_register *reg,
-           const struct tgsi_full_instruction *inst,
-           unsigned chan_index)
-{
-   switch (reg->DstRegister.File) {
-   case TGSI_FILE_OUTPUT:
-      {
-         int offset_reg = ppc_allocate_register(gen->f);
-         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
-         ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
-         ppc_release_register(gen->f, offset_reg);
-      }
-      break;
-   case TGSI_FILE_TEMPORARY:
-      {
-         int offset_reg = ppc_allocate_register(gen->f);
-         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
-         ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
-         ppc_release_register(gen->f, offset_reg);
-      }
-      break;
-#if 0
-   case TGSI_FILE_ADDRESS:
-      emit_addrs(
-         func,
-         xmm,
-         reg->DstRegister.Index,
-         chan_index );
-      break;
-#endif
-   default:
-      assert( 0 );
-   }
-
-#if 0
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      /* assert( 0 ); */
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
-      break;
-   }
-#endif
-}
-
-
-#define STORE( GEN, INST, XMM, INDEX, CHAN )\
-   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
-
-
-
-#if 000
-/**
- * High-level instruction translators.
- */
-
-static void
-emit_kil(
-   struct x86_function *func,
-   const struct tgsi_full_src_register *reg )
-{
-   unsigned uniquemask;
-   unsigned registers[4];
-   unsigned nextregister = 0;
-   unsigned firstchan = ~0;
-   unsigned chan_index;
-
-   /* This mask stores component bits that were already tested. Note that
-    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      unsigned swizzle;
-
-      /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle(
-         reg,
-         chan_index );
-
-      /* check if the component has not been already tested */
-      if( !(uniquemask & (1 << swizzle)) ) {
-         uniquemask |= 1 << swizzle;
-
-         /* allocate register */
-         registers[chan_index] = nextregister;
-         emit_fetch(
-            func,
-            nextregister,
-            reg,
-            chan_index );
-         nextregister++;
-
-         /* mark the first channel used */
-         if( firstchan == ~0 ) {
-            firstchan = chan_index;
-         }
-      }
-   }
-
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      if( uniquemask & (1 << chan_index) ) {
-         sse_cmpps(
-            func,
-            make_xmm( registers[chan_index] ),
-            get_temp(
-               TGSI_EXEC_TEMP_00000000_I,
-               TGSI_EXEC_TEMP_00000000_C ),
-            cc_LessThan );
-
-         if( chan_index == firstchan ) {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               make_xmm( registers[chan_index] ) );
-         }
-         else {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_DX ),
-               make_xmm( registers[chan_index] ) );
-            x86_or(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               x86_make_reg( file_REG32, reg_DX ) );
-         }
-      }
-   }
-
-   x86_or(
-      func,
-      get_temp(
-         TGSI_EXEC_TEMP_KILMASK_I,
-         TGSI_EXEC_TEMP_KILMASK_C ),
-      x86_make_reg( file_REG32, reg_AX ) );
-
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-}
-
-
-static void
-emit_kilp(
-   struct x86_function *func )
-{
-   /* XXX todo / fix me */
-}
-
-
-static void
-emit_setcc(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst,
-   enum sse_cc cc )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ),
-         cc );
-      sse_andps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TEMP_ONE_I,
-            TEMP_ONE_C ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-
-static void
-emit_cmp(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      FETCH( func, *inst, 2, 2, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TGSI_EXEC_TEMP_00000000_I,
-            TGSI_EXEC_TEMP_00000000_C ),
-         cc_LessThan );
-      sse_andps(
-         func,
-         make_xmm( 1 ),
-         make_xmm( 0 ) );
-      sse_andnps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 2 ) );
-      sse_orps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-#endif
-
-
-static void
-emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
    int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-
-   FETCH(gen, *inst, v0, 0, CHAN_X);
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_RSQ:
-      /* v1 = 1.0 / sqrt(v0) */
-      ppc_vrsqrtefp(gen->f, v1, v0);
-      break;
-   case TGSI_OPCODE_RCP:
-      /* v1 = 1.0 / v0 */
-      ppc_vrefp(gen->f, v1, v0);
-      break;
-   default:
-      assert(0);
-   }
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      STORE(gen, *inst, v1, 0, chan_index);
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-}
-
-
-static void
-emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, 0, 0, chan_index);   /* v0 = srcreg[0] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_ABS:
-         /* turn off the most significant bit of each vector float word */
-         {
-            int v1 = ppc_allocate_vec_register(gen->f);
-            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
-            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
-            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
-            ppc_release_vec_register(gen->f, v1);
-         }
-         break;
-      case TGSI_OPCODE_FLOOR:
-         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
-         break;
-      case TGSI_OPCODE_FRAC:
-         {
-            int v1 = ppc_allocate_vec_register(gen->f);
-            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
-            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
-            ppc_release_vec_register(gen->f, v1);
-         }
-         break;
-      case TGSI_OPCODE_EXPBASE2:
-         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
-         break;
-      case TGSI_OPCODE_LOGBASE2:
-         /* XXX this may be broken! */
-         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
-         break;
-      case TGSI_OPCODE_MOV:
-         /* nothing */
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-}
-
-
-static void
-emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_ADD:
-         ppc_vaddfp(gen->f, v2, v0, v1);
-         break;
-      case TGSI_OPCODE_SUB:
-         ppc_vsubfp(gen->f, v2, v0, v1);
-         break;
-      case TGSI_OPCODE_MUL:
-         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
-         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */
-         break;
-      case TGSI_OPCODE_MIN:
-         ppc_vminfp(gen->f, v2, v0, v1);
-         break;
-      case TGSI_OPCODE_MAX:
-         ppc_vmaxfp(gen->f, v2, v0, v1);
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
-
-
-/**
- * Vector comparisons, resulting in 1.0 or 0.0 values.
- */
-static void
-emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   boolean complement = FALSE;
-   int one_vec = gen_one_vec(gen);
-
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_SNE:
-         complement = TRUE;
-         /* fall-through */
-      case TGSI_OPCODE_SEQ:
-         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
-         break;
-
-      case TGSI_OPCODE_SGE:
-         complement = TRUE;
-         /* fall-through */
-      case TGSI_OPCODE_SLT:
-         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
-         break;
-
-      case TGSI_OPCODE_SLE:
-         complement = TRUE;
-         /* fall-through */
-      case TGSI_OPCODE_SGT:
-         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
-         break;
-      default:
-         assert(0);
-      }
-
-      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
-
-      if (complement)
-         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
-      else
-         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
-
-      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
-   }
-
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
-
-
-static void
-emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-
-   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
-
-   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
-   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
-   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
-
-   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
-   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
-   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
-
-   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
-   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
-   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
-
-   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
-      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
-      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
-      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
-   }
-   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
-      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
-      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
-   }
-
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
-
-
-static void
-emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   int v3 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_MAD:
-         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
-         break;
-      case TGSI_OPCODE_LRP:
-         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
-         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-   ppc_release_vec_register(gen->f, v3);
-}
-
-
-/*
-static void
-emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-}
-*/
-
-
-static int
-emit_instruction(struct gen_context *gen,
-                 struct tgsi_full_instruction *inst)
-{
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_ABS:
-   case TGSI_OPCODE_FLOOR:
-   case TGSI_OPCODE_FRAC:
-   case TGSI_OPCODE_EXPBASE2:
-   case TGSI_OPCODE_LOGBASE2:
-      emit_unaryop(gen, inst);
-      break;
-   case TGSI_OPCODE_RSQ:
-   case TGSI_OPCODE_RCP:
-      emit_scalar_unaryop(gen, inst);
-      break;
-   case TGSI_OPCODE_ADD:
-   case TGSI_OPCODE_SUB:
-   case TGSI_OPCODE_MUL:
-   case TGSI_OPCODE_MIN:
-   case TGSI_OPCODE_MAX:
-      emit_binop(gen, inst);
-      break;
-   case TGSI_OPCODE_SEQ:
-   case TGSI_OPCODE_SNE:
-   case TGSI_OPCODE_SLT:
-   case TGSI_OPCODE_SGT:
-   case TGSI_OPCODE_SLE:
-   case TGSI_OPCODE_SGE:
-      emit_inequality(gen, inst);
-      break;
-   case TGSI_OPCODE_MAD:
-   case TGSI_OPCODE_LRP:
-      emit_triop(gen, inst);
-      break;
-   case TGSI_OPCODE_DP3:
-   case TGSI_OPCODE_DP4:
-   case TGSI_OPCODE_DPH:
-      emit_dotprod(gen, inst);
-      break;
-      /*
-   case TGSI_OPCODE_LIT:
-      emit_lit(gen, inst);
-      break;
-      */
-   case TGSI_OPCODE_END:
-      /* normal end */
-      return 1;
-   default:
-      return 0;
-   }
-
-#if 0
-   unsigned chan_index;
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ARL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_f2it( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LIT:
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-         emit_tempf(
-            func,
-            0,
-            TEMP_ONE_I,
-            TEMP_ONE_C);
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-            STORE( func, *inst, 0, 0, CHAN_X );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-            STORE( func, *inst, 0, 0, CHAN_W );
-         }
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_maxps(
-               func,
-               make_xmm( 0 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            STORE( func, *inst, 0, 0, CHAN_Y );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-            /* XMM[1] = SrcReg[0].yyyy */
-            FETCH( func, *inst, 1, 0, CHAN_Y );
-            /* XMM[1] = max(XMM[1], 0) */
-            sse_maxps(
-               func,
-               make_xmm( 1 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            /* XMM[2] = SrcReg[0].wwww */
-            FETCH( func, *inst, 2, 0, CHAN_W );
-            /* XMM[2] = min(XMM[2], 128.0) */
-            sse_minps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_128_I,
-                  TGSI_EXEC_TEMP_128_C ) );
-            /* XMM[2] = max(XMM[2], -128.0) */
-            sse_maxps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_MINUS_128_I,
-                  TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 3, 1, 2 );
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_xorps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 2 ) );
-            sse_cmpps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 0 ),
-               cc_LessThanEqual );
-            sse_andps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 1 ) );
-            STORE( func, *inst, 2, 0, CHAN_Z );
-         }
-      }
-      break;
-
-   case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rcp( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rsqrt( func, 1, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 1, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EXP:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_MOV( func, 1, 0 );
-            emit_flr( func, 2, 1 );
-            /* dst.x = ex2(floor(src.x)) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               emit_MOV( func, 2, 1 );
-               emit_ex2( func, 3, 2 );
-               STORE( func, *inst, 2, 0, CHAN_X );
-            }
-            /* dst.y = src.x - floor(src.x) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_MOV( func, 2, 0 );
-               emit_sub( func, 2, 1 );
-               STORE( func, *inst, 2, 0, CHAN_Y );
-            }
-         }
-         /* dst.z = ex2(src.x) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 3, 0 );
-            STORE( func, *inst, 0, 0, CHAN_Z );
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_LOG:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_abs( func, 0 );
-         emit_MOV( func, 1, 0 );
-         emit_lg2( func, 2, 1 );
-         /* dst.z = lg2(abs(src.x)) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            STORE( func, *inst, 1, 0, CHAN_Z );
-         }
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 2, 1 );
-            /* dst.x = floor(lg2(abs(src.x))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               STORE( func, *inst, 1, 0, CHAN_X );
-            }
-            /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 2, 1 );
-               emit_rcp( func, 1, 1 );
-               emit_mul( func, 0, 1 );
-               STORE( func, *inst, 0, 0, CHAN_Y );
-            }
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MUL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ADD:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_add( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP4:
-   /* TGSI_OPCODE_DOT4 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul(func, 1, 2 );
-      emit_add(func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_W );
-      FETCH( func, *inst, 2, 1, CHAN_W );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DST:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_tempf(
-            func,
-            0,
-            TEMP_ONE_I,
-            TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 1, 1, CHAN_Y );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         FETCH( func, *inst, 0, 0, CHAN_Z );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-         FETCH( func, *inst, 0, 1, CHAN_W );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MIN:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_minps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MAX:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_maxps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
-      emit_setcc( func, inst, cc_LessThan );
-      break;
-
-   case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
-      emit_setcc( func, inst, cc_NotLessThan );
-      break;
-
-   case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SUB:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_sub( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LERP:
-   /* TGSI_OPCODE_LRP */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_sub( func, 1, 2 );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CND0:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DOT2ADD:
-   /* TGSI_OPCODE_DP2A */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_INDEX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NEGATE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_FRAC:
-   /* TGSI_OPCODE_FRC */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CLAMP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_FLOOR:
-   /* TGSI_OPCODE_FLR */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ROUND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_EXPBASE2:
-   /* TGSI_OPCODE_EX2 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LOGBASE2:
-   /* TGSI_OPCODE_LG2 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_POWER:
-   /* TGSI_OPCODE_POW */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CROSSPRODUCT:
-   /* TGSI_OPCODE_XPD */
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( func, *inst, 1, 1, CHAN_Z );
-         FETCH( func, *inst, 3, 0, CHAN_Z );
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 4, 1, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_MOV( func, 2, 0 );
-         emit_mul( func, 2, 1 );
-         emit_MOV( func, 5, 3 );
-         emit_mul( func, 5, 4 );
-         emit_sub( func, 2, 5 );
-         STORE( func, *inst, 2, 0, CHAN_X );
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 2, 1, CHAN_X );
-         FETCH( func, *inst, 5, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         emit_mul( func, 3, 2 );
-         emit_mul( func, 1, 5 );
-         emit_sub( func, 3, 1 );
-         STORE( func, *inst, 3, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_mul( func, 5, 4 );
-         emit_mul( func, 0, 2 );
-         emit_sub( func, 5, 0 );
-         STORE( func, *inst, 5, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MULTIPLYMATRIX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ABS:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_abs( func, 0) ;
-
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RCC:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DPH:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 1, CHAN_W );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_COS:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DDX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DDY:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_KILP:
-      /* predicated kill */
-      emit_kilp( func );
-      return 0; /* XXX fix me */
-      break;
-
-   case TGSI_OPCODE_KIL:
-      /* conditional kill */
-      emit_kil( func, &inst->FullSrcRegisters[0] );
-      break;
-
-   case TGSI_OPCODE_PK2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SEQ:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SGT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SIN:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SNE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_STR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TEX:
-      if (0) {
-	 /* Disable dummy texture code: 
-	  */
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-	    STORE( func, *inst, 0, 0, chan_index );
-	 }
-      }
-      else {
-	 return 0;
-      }
-      break;
-
-   case TGSI_OPCODE_TXD:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_X2D:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_BRA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CAL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RET:
-      emit_ret( func );
-      break;
-
-   case TGSI_OPCODE_END:
-      break;
-
-   case TGSI_OPCODE_SSG:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CMP:
-      emit_cmp (func, inst);
-      break;
-
-   case TGSI_OPCODE_SCS:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TGSI_EXEC_TEMP_00000000_I,
-	    TGSI_EXEC_TEMP_00000000_C );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_TXB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NRM:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DIV:
-      return 0;
-      break;
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   boolean complement = FALSE;
+   int one_vec = gen_one_vec(gen);
 
-   case TGSI_OPCODE_DP2:
-      return 0;
-      break;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
 
-   case TGSI_OPCODE_TXL:
-      return 0;
-      break;
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SNE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SEQ:
+         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+         break;
 
-   case TGSI_OPCODE_BRK:
-      return 0;
-      break;
+      case TGSI_OPCODE_SGE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SLT:
+         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+         break;
 
-   case TGSI_OPCODE_IF:
-      return 0;
-      break;
+      case TGSI_OPCODE_SLE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SGT:
+         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+         break;
+      default:
+         assert(0);
+      }
 
-   case TGSI_OPCODE_LOOP:
-      return 0;
-      break;
+      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
 
-   case TGSI_OPCODE_REP:
-      return 0;
-      break;
+      if (complement)
+         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
+      else
+         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
 
-   case TGSI_OPCODE_ELSE:
-      return 0;
-      break;
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
 
-   case TGSI_OPCODE_ENDIF:
-      return 0;
-      break;
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
 
-   case TGSI_OPCODE_ENDLOOP:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_ENDREP:
-      return 0;
-      break;
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
 
-   case TGSI_OPCODE_PUSHA:
-      return 0;
-      break;
+   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
 
-   case TGSI_OPCODE_POPA:
-      return 0;
-      break;
+   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
+   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   case TGSI_OPCODE_CEIL:
-      return 0;
-      break;
+   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
+   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   case TGSI_OPCODE_I2F:
-      return 0;
-      break;
+   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
+   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   case TGSI_OPCODE_NOT:
-      return 0;
-      break;
+   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
+      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+   }
+   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
+   }
 
-   case TGSI_OPCODE_TRUNC:
-      return 0;
-      break;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
 
-   case TGSI_OPCODE_SHL:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_SHR:
-      return 0;
-      break;
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v3 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:
+         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_LRP:
+         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
+         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v3);
+}
 
-   case TGSI_OPCODE_AND:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_OR:
-      return 0;
-      break;
+/*
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+}
+*/
 
-   case TGSI_OPCODE_MOD:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_XOR:
-      return 0;
+static int
+emit_instruction(struct gen_context *gen,
+                 struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_FLOOR:
+   case TGSI_OPCODE_FRAC:
+   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_LOGBASE2:
+      emit_unaryop(gen, inst);
       break;
-
-   case TGSI_OPCODE_SAD:
-      return 0;
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_RCP:
+      emit_scalar_unaryop(gen, inst);
       break;
-
-   case TGSI_OPCODE_TXF:
-      return 0;
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      emit_binop(gen, inst);
       break;
-
-   case TGSI_OPCODE_TXQ:
-      return 0;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SGE:
+      emit_inequality(gen, inst);
       break;
-
-   case TGSI_OPCODE_CONT:
-      return 0;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_LRP:
+      emit_triop(gen, inst);
       break;
-
-   case TGSI_OPCODE_EMIT:
-      return 0;
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+      emit_dotprod(gen, inst);
       break;
-
-   case TGSI_OPCODE_ENDPRIM:
-      return 0;
+      /*
+   case TGSI_OPCODE_LIT:
+      emit_lit(gen, inst);
       break;
-
+      */
+   case TGSI_OPCODE_END:
+      /* normal end */
+      return 1;
    default:
       return 0;
    }
-#endif
+
    
    return 1;
 }
@@ -2608,133 +663,6 @@ emit_declaration(
    }
 }
 
-#if 0
-static void aos_to_soa( struct x86_function *func, 
-                        uint arg_aos,
-                        uint arg_soa, 
-                        uint arg_num, 
-                        uint arg_stride )
-{
-   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
-   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
-   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
-   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
-   int inner_loop;
-
-
-   /* Save EBX */
-   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
-
-   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
-   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
-   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
-
-   /* do */
-   inner_loop = x86_get_label( func );
-   {
-      x86_push( func, aos_input );
-      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_pop( func, aos_input );
-
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
-      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
-      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
-      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
-
-      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
-      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
-      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
-      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
-
-      /* Advance to next input */
-      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
-      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
-   }
-   /* while --num_inputs */
-   x86_dec( func, num_inputs );
-   x86_jcc( func, cc_NE, inner_loop );
-
-   /* Restore EBX */
-   x86_pop( func, aos_input );
-}
-#endif
-
-#if 0
-static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
-{
-   struct x86_reg soa_output;
-   struct x86_reg aos_output;
-   struct x86_reg num_outputs;
-   struct x86_reg temp;
-   int inner_loop;
-
-   soa_output = x86_make_reg( file_REG32, reg_AX );
-   aos_output = x86_make_reg( file_REG32, reg_BX );
-   num_outputs = x86_make_reg( file_REG32, reg_CX );
-   temp = x86_make_reg( file_REG32, reg_DX );
-
-   /* Save EBX */
-   x86_push( func, aos_output );
-
-   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
-   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
-   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
-
-   /* do */
-   inner_loop = x86_get_label( func );
-   {
-      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
-      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
-      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
-      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
-
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
-      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
-      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
-      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
-
-      x86_mov( func, temp, x86_fn_arg( func, stride ) );
-      x86_push( func, aos_output );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_pop( func, aos_output );
-
-      /* Advance to next output */
-      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
-      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
-   }
-   /* while --num_outputs */
-   x86_dec( func, num_outputs );
-   x86_jcc( func, cc_NE, inner_loop );
-
-   /* Restore EBX */
-   x86_pop( func, aos_output );
-}
-#endif
 
 
 static void
@@ -2788,67 +716,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    emit_prologue(func);
 
-   /*
-    * Different function args for vertex/fragment shaders:
-    */
-#if 0
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-      /* DECLARATION phase, do not load output argument. */
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      /* skipping outputs argument here */
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_coef_base(),
-         x86_fn_arg( func, 5 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 6 ) );
-   }
-   else {
-      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
-
-      if (do_swizzles)
-         aos_to_soa( func, 
-                     6,         /* aos_input */
-                     1,         /* machine->input */
-                     7,         /* num_inputs */
-                     8 );       /* input_stride */
-
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      x86_mov(
-         func,
-         get_output_base(),
-         x86_fn_arg( func, 2 ) );
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 5 ) );
-   }
-#endif
-
    while (!tgsi_parse_end_of_tokens(&parse) && ok) {
       tgsi_parse_token(&parse);
 
@@ -2860,19 +727,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-#if 0
-            if( !instruction_phase ) {
-               /* INSTRUCTION phase, overwrite coeff with output. */
-               instruction_phase = TRUE;
-               x86_mov(
-                  func,
-                  get_output_base(),
-                  x86_fn_arg( func, 2 ) );
-            }
-#endif
-         }
-
          ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
 
 	 if (!ok) {
@@ -2909,13 +763,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
       }
    }
 
-#if 0
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
-      if (do_swizzles)
-         soa_to_aos( func, 9, 2, 10, 11 );
-   }
-#endif
-
    emit_epilogue(func);
 
    tgsi_parse_free( &parse );
-- 
cgit v1.2.3


From 77160cd97b7f2181b7953bcc8d13e86055b819e3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:34:16 -0600
Subject: gallium: var renaming in tgsi_ppc.c

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 36 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 432ec7459b..c1e707657b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -117,11 +117,11 @@ gen_get_bit31_vec(struct gen_context *gen)
 
 
 /**
- * Register fetch.
+ * Register fetch, put result in 'dst_vec'.
  */
 static void
 emit_fetch(struct gen_context *gen,
-           unsigned vec_reg,
+           unsigned dst_vec,
            const struct tgsi_full_src_register *reg,
            const unsigned chan_index)
 {
@@ -138,7 +138,7 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
+            ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -147,7 +147,7 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+            ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -156,7 +156,7 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
+            ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -171,9 +171,9 @@ emit_fetch(struct gen_context *gen,
              * know that 'swizzle' tells us which vector slot will have the
              * loaded word.  The other vector slots will be undefined.
              */
-            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
+            ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
             /* splat word[swizzle] across the vector reg */
-            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
+            ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -182,12 +182,12 @@ emit_fetch(struct gen_context *gen,
       }
       break;
    case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vload_float(gen->f, vec_reg, 0.0f);
+      ppc_vload_float(gen->f, dst_vec, 0.0f);
       break;
    case TGSI_EXTSWIZZLE_ONE:
       {
          int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, vec_reg, one_vec);
+         ppc_vecmove(gen->f, dst_vec, one_vec);
       }
       break;
    default:
@@ -202,15 +202,15 @@ emit_fetch(struct gen_context *gen,
          switch (sign_op) {
          case TGSI_UTIL_SIGN_CLEAR:
             /* vec = vec & ~bit31 */
-            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
+            ppc_vandc(gen->f, dst_vec, dst_vec, bit31_vec);
             break;
          case TGSI_UTIL_SIGN_SET:
             /* vec = vec | bit31 */
-            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
+            ppc_vor(gen->f, dst_vec, dst_vec, bit31_vec);
             break;
          case TGSI_UTIL_SIGN_TOGGLE:
             /* vec = vec ^ bit31 */
-            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
+            ppc_vxor(gen->f, dst_vec, dst_vec, bit31_vec);
             break;
          default:
             assert(0);
@@ -219,17 +219,17 @@ emit_fetch(struct gen_context *gen,
    }
 }
 
-#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
-   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \
+   emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN )
 
 
 /**
- * Register store.
+ * Register store.  Store 'src_vec' at location indicated by 'reg'.
  */
 static void
 emit_store(struct gen_context *gen,
-           unsigned vec_reg,
+           unsigned src_vec,
            const struct tgsi_full_dst_register *reg,
            const struct tgsi_full_instruction *inst,
            unsigned chan_index)
@@ -240,7 +240,7 @@ emit_store(struct gen_context *gen,
          int offset_reg = ppc_allocate_register(gen->f);
          int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
          ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
+         ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
          ppc_release_register(gen->f, offset_reg);
       }
       break;
@@ -249,7 +249,7 @@ emit_store(struct gen_context *gen,
          int offset_reg = ppc_allocate_register(gen->f);
          int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
          ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+         ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
          ppc_release_register(gen->f, offset_reg);
       }
       break;
-- 
cgit v1.2.3


From 9e3ee82305b4602feca0253dc0e0c27f9bc9b05e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 16:57:22 -0600
Subject: gallium: PPC LIT instruction (not quite complete yet)

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 89 +++++++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 4 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index c1e707657b..edd535a884 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -535,12 +535,95 @@ emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
-/*
+
+/** Approximation for vr = pow(va, vb) */
+static void
+ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
+{
+   /* pow(a,b) ~= exp2(log2(a) * b) */
+   int t_vec = ppc_allocate_vec_register(f);
+   int zero_vec = ppc_allocate_vec_register(f);
+
+   ppc_vload_float(f, zero_vec, 0.0f);
+
+   ppc_vlogefp(f, t_vec, va);                   /* t = log2(va) */
+   ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec);  /* t = t * vb */
+   ppc_vexptefp(f, vr, t_vec);                  /* vr = 2^t */
+
+   ppc_release_vec_register(f, t_vec);
+   ppc_release_vec_register(f, zero_vec);
+}
+
+
 static void
 emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
+   int one_vec = gen_one_vec(gen);
+
+   /* Compute X */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_X);
+   }
+
+   /* Compute Y, Z */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int x_vec = ppc_allocate_vec_register(gen->f);
+      int zero_vec = ppc_allocate_vec_register(gen->f);
+
+      FETCH(gen, *inst, x_vec, 0, CHAN_X);        /* x_vec = src[0].x */
+
+      ppc_vload_float(gen->f, zero_vec, 0.0f);    /* zero = {0,0,0,0} */
+      ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+         STORE(gen, *inst, x_vec, 0, CHAN_Y);        /* store Y */
+      }
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+         int y_vec = ppc_allocate_vec_register(gen->f);
+         int z_vec = ppc_allocate_vec_register(gen->f);
+         int w_vec = ppc_allocate_vec_register(gen->f);
+         int pow_vec = ppc_allocate_vec_register(gen->f);
+         int pos_vec = ppc_allocate_vec_register(gen->f);
+         int c128_vec = ppc_allocate_vec_register(gen->f);
+
+         FETCH(gen, *inst, y_vec, 0, CHAN_Y);        /* y_vec = src[0].y */
+         ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
+
+         FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
+
+         /* XXX clamp Y to [-128, 128] */
+         ppc_vload_float(gen->f, c128_vec, 128.0f);
+
+         /* if temp.x > 0
+          *    pow(tmp.y, tmp.w)
+          * else
+          *   0.0
+          */
+
+         ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec);      /* pow = pow(y, w) */
+         ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
+         ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
+
+         STORE(gen, *inst, z_vec, 0, CHAN_Z);             /* store Z */
+
+         ppc_release_vec_register(gen->f, y_vec);
+         ppc_release_vec_register(gen->f, z_vec);
+         ppc_release_vec_register(gen->f, w_vec);
+         ppc_release_vec_register(gen->f, pow_vec);
+         ppc_release_vec_register(gen->f, pos_vec);
+      }
+
+      ppc_release_vec_register(gen->f, x_vec);
+      ppc_release_vec_register(gen->f, zero_vec);
+   }
+
+   /* Compute W */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_W);
+   }
 }
-*/
 
 
 static int
@@ -584,11 +667,9 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_DPH:
       emit_dotprod(gen, inst);
       break;
-      /*
    case TGSI_OPCODE_LIT:
       emit_lit(gen, inst);
       break;
-      */
    case TGSI_OPCODE_END:
       /* normal end */
       return 1;
-- 
cgit v1.2.3


From ae81aeb12868db219cbdc02437c481714cfed3f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 16:58:05 -0600
Subject: gallium: GALLIUM_NOPPC debug var to disable PPC codegen

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index edd535a884..9d7de41fe7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -776,15 +776,21 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
               float (*immediates)[4],
               boolean do_swizzles )
 {
+   static int use_ppc_asm = -1;
    struct tgsi_parse_context parse;
    /*boolean instruction_phase = FALSE;*/
    unsigned ok = 1;
    uint num_immediates = 0;
    struct gen_context gen;
 
-   util_init_math();
+   if (use_ppc_asm < 0) {
+      /* If GALLIUM_NOPPC is set, don't use PPC codegen */
+      use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE);
+   }
+   if (!use_ppc_asm)
+      return FALSE;
 
-   tgsi_parse_init( &parse, tokens );
+   util_init_math();
 
    gen.f = func;
    gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
@@ -797,6 +803,8 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    emit_prologue(func);
 
+   tgsi_parse_init( &parse, tokens );
+
    while (!tgsi_parse_end_of_tokens(&parse) && ok) {
       tgsi_parse_token(&parse);
 
-- 
cgit v1.2.3


From 3026616c48487a7561d8545c08950539f0ad51d1 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:17:11 -0600
Subject: gallium: added ppc_vzero()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 8 ++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 5 +++++
 2 files changed, 13 insertions(+)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 65df676eae..51d9b53657 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -669,6 +669,14 @@ ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
    ppc_vor(p, vD, vA, vA);
 }
 
+/** Set vector register to {0,0,0,0} */
+void
+ppc_vzero(struct ppc_function *p, uint vr)
+{
+   ppc_vxor(p, vr, vr, vr);
+}
+
+
 
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 9f1e3fcd84..f194d3be13 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -193,6 +193,11 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 extern void
 ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
 
+/** Set vector register to {0,0,0,0} */
+extern void
+ppc_vzero(struct ppc_function *p, uint vr);
+
+
 
 /**
  ** Vector shuffle / select / splat / etc
-- 
cgit v1.2.3


From abbbe876ac98596b143da295abf6887e0a4e50d2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:19:12 -0600
Subject: gallium: new PPC built-in constants array

It's hard to form PPC vector immediates so load them from an array.
---
 src/gallium/auxiliary/draw/draw_vs_ppc.c |  8 +++--
 src/gallium/auxiliary/tgsi/tgsi_ppc.c    | 61 ++++++++++++++++++++++++++++----
 src/gallium/auxiliary/tgsi/tgsi_ppc.h    |  3 ++
 3 files changed, 63 insertions(+), 9 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index fcc9cbfec5..8eff6d4fda 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -55,7 +55,8 @@ typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
                                              float (*outputs)[4][4],
                                              float (*temps)[4][4],
                                              float (*immeds)[4][4],
-                                             float (*consts)[4]);
+                                             float (*consts)[4],
+                                             const float *builtins);
 
 #if 0
    const struct tgsi_exec_vector *input,
@@ -151,7 +152,8 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
 #else
       shader->func(inputs_soa, outputs_soa, temps_soa,
 		   (float (*)[4][4]) shader->base.immediates,
-		   (float (*)[4]) constants);
+		   (float (*)[4]) constants,
+                   ppc_builtin_constants);
 
       /*output[0][0] = input[0][0] * 0.5;*/
 #endif
@@ -246,7 +248,9 @@ draw_create_vs_ppc(struct draw_context *draw,
    return &vs->base;
 
 fail:
+   /*
    debug_error("tgsi_emit_ppc() failed, falling back to interpreter\n");
+   */
 
    ppc_release_func( &vs->ppc_program );
    
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 9d7de41fe7..6b05fd16cf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -36,6 +36,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_sse.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
@@ -44,6 +45,14 @@
 #include "rtasm/rtasm_ppc.h"
 
 
+/**
+ * Since it's pretty much impossible to form PPC vector immediates, load
+ * them from memory here:
+ */
+const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+   1.0f, -128.0f, 128.0, 0.0
+};
+
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
@@ -81,12 +90,46 @@ struct gen_context
    int temps_reg;     /**< GP register pointing to temporary "registers" */
    int immed_reg;     /**< GP register pointing to immediates buffer */
    int const_reg;     /**< GP register pointing to constants buffer */
+   int builtins_reg;  /**< GP register pointing to built-in constants */
 
    int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
    int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
 };
 
 
+/**
+ * Load the given vector register with {value, value, value, value}.
+ * The value must be in the ppc_builtin_constants[] array.
+ * We wouldn't need this if there was a simple way to load PPC vector
+ * registers with immediate values!
+ */
+static void
+load_constant_vec(struct gen_context *gen, int dst_vec, float value)
+{
+   uint pos;
+   for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
+      if (ppc_builtin_constants[pos] == value) {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = pos * 4;
+
+         ppc_li(gen->f, offset_reg, offset);
+         /* Load 4-byte word into vector register.
+          * The vector slot depends on the effective address we load from.
+          * We know that our builtins start at a 16-byte boundary so we
+          * know that 'pos % 4' tells us which vector slot will have the
+          * loaded word.  The other vector slots will be undefined.
+          */
+         ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
+         /* splat word[pos % 4] across the vector reg */
+         ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
+         ppc_release_register(gen->f, offset_reg);
+         return;
+      }
+   }
+   assert(0 && "Need to add new constant to ppc_builtin_constants array");
+}
+
+
 /**
  * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
  */
@@ -95,7 +138,7 @@ gen_one_vec(struct gen_context *gen)
 {
    if (gen->one_vec < 0) {
       gen->one_vec = ppc_allocate_vec_register(gen->f);
-      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
+      load_constant_vec(gen, gen->one_vec, 1.0f);
    }
    return gen->one_vec;
 }
@@ -115,7 +158,6 @@ gen_get_bit31_vec(struct gen_context *gen)
 }
 
 
-
 /**
  * Register fetch, put result in 'dst_vec'.
  */
@@ -182,7 +224,7 @@ emit_fetch(struct gen_context *gen,
       }
       break;
    case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vload_float(gen->f, dst_vec, 0.0f);
+      ppc_vzero(gen->f, dst_vec);
       break;
    case TGSI_EXTSWIZZLE_ONE:
       {
@@ -544,7 +586,7 @@ ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
    int t_vec = ppc_allocate_vec_register(f);
    int zero_vec = ppc_allocate_vec_register(f);
 
-   ppc_vload_float(f, zero_vec, 0.0f);
+   ppc_vzero(f, zero_vec);
 
    ppc_vlogefp(f, t_vec, va);                   /* t = log2(va) */
    ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec);  /* t = t * vb */
@@ -573,7 +615,7 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
       FETCH(gen, *inst, x_vec, 0, CHAN_X);        /* x_vec = src[0].x */
 
-      ppc_vload_float(gen->f, zero_vec, 0.0f);    /* zero = {0,0,0,0} */
+      ppc_vzero(gen->f, zero_vec);                /* zero = {0,0,0,0} */
       ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
 
       if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
@@ -586,7 +628,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          int w_vec = ppc_allocate_vec_register(gen->f);
          int pow_vec = ppc_allocate_vec_register(gen->f);
          int pos_vec = ppc_allocate_vec_register(gen->f);
-         int c128_vec = ppc_allocate_vec_register(gen->f);
+         int p128_vec = ppc_allocate_vec_register(gen->f);
+         int n128_vec = ppc_allocate_vec_register(gen->f);
 
          FETCH(gen, *inst, y_vec, 0, CHAN_Y);        /* y_vec = src[0].y */
          ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
@@ -594,7 +637,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
 
          /* XXX clamp Y to [-128, 128] */
-         ppc_vload_float(gen->f, c128_vec, 128.0f);
+         load_constant_vec(gen, p128_vec, 128.0f);
+         load_constant_vec(gen, n128_vec, -128.0f);
 
          /* if temp.x > 0
           *    pow(tmp.y, tmp.w)
@@ -613,6 +657,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          ppc_release_vec_register(gen->f, w_vec);
          ppc_release_vec_register(gen->f, pow_vec);
          ppc_release_vec_register(gen->f, pos_vec);
+         ppc_release_vec_register(gen->f, p128_vec);
+         ppc_release_vec_register(gen->f, n128_vec);
       }
 
       ppc_release_vec_register(gen->f, x_vec);
@@ -798,6 +844,7 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
    gen.immed_reg = ppc_reserve_register(func, 6);
    gen.const_reg = ppc_reserve_register(func, 7);
+   gen.builtins_reg = ppc_reserve_register(func, 8);
    gen.one_vec = -1;
    gen.bit31_vec = -1;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
index 7cd2bf9aff..829ec075e7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
@@ -35,6 +35,9 @@ extern "C" {
 struct tgsi_token;
 struct ppc_function;
 
+extern const float ppc_builtin_constants[];
+
+
 boolean
 tgsi_emit_ppc(const struct tgsi_token *tokens,
               struct ppc_function *function,
-- 
cgit v1.2.3


From f8ab4feb75f4a592e23859813c093dcdbd4b8988 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:21:43 -0600
Subject: gallium: remove ppc_vload_float(), rename ppc_vecmove() ->
 ppc_vmove().

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 19 +------------------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  6 +-----
 src/gallium/auxiliary/tgsi/tgsi_ppc.c   |  2 +-
 3 files changed, 3 insertions(+), 24 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 51d9b53657..7dd8263749 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -603,23 +603,6 @@ ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
    emit_x(p, 31, vr, ra, rb, 71);
 }
 
-/** vector load float: vr = splats(imm) */
-void
-ppc_vload_float(struct ppc_function *p, uint vr, float imm)
-{
-   if (imm == 0.0f) {
-      ppc_vxor(p, vr, vr, vr);
-   }
-   else if (imm == 1.0f) {
-      /* use 2^0=1 to get 1.0 */
-      ppc_vxor(p, vr, vr, vr);  /* vr = {0,0,0,0} */
-      ppc_vexptefp(p, vr, vr);  /* vr = 0^0 */
-   }
-   else {
-      assert(0);
-   }
-}
-
 
 
@@ -664,7 +647,7 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
 
 /** Pseudo-instruction: vector move */
 void
-ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
+ppc_vmove(struct ppc_function *p, uint vD, uint vA)
 {
    ppc_vor(p, vD, vA, vA);
 }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index f194d3be13..f938d8d759 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -158,10 +158,6 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
-/** vector load float: vr = splats(imm) */
-extern void
-ppc_vload_float(struct ppc_function *p, uint vr, float imm);
-
 
 
 /**
@@ -191,7 +187,7 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 
 /** Pseudo-instruction: vector move */
 extern void
-ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
+ppc_vmove(struct ppc_function *p, uint vD, uint vA);
 
 /** Set vector register to {0,0,0,0} */
 extern void
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 6b05fd16cf..96beec0cc6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -229,7 +229,7 @@ emit_fetch(struct gen_context *gen,
    case TGSI_EXTSWIZZLE_ONE:
       {
          int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, dst_vec, one_vec);
+         ppc_vmove(gen->f, dst_vec, one_vec);
       }
       break;
    default:
-- 
cgit v1.2.3


From 0ac99457811eb766e9bdd3903857b5c0fdef7694 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:29:37 -0600
Subject: gallium: PPC: clamp y to [-128,128] for LIT

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 96beec0cc6..9ad7ecd7cf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -636,16 +636,17 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
          FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
 
-         /* XXX clamp Y to [-128, 128] */
+         /* clamp Y to [-128, 128] */
          load_constant_vec(gen, p128_vec, 128.0f);
          load_constant_vec(gen, n128_vec, -128.0f);
+         ppc_vmaxfp(gen->f, y_vec, y_vec, n128_vec); /* y = max(y, -128) */
+         ppc_vminfp(gen->f, y_vec, y_vec, p128_vec); /* y = min(y, 128) */
 
          /* if temp.x > 0
-          *    pow(tmp.y, tmp.w)
+          *    z = pow(tmp.y, tmp.w)
           * else
-          *   0.0
+          *    z = 0.0
           */
-
          ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec);      /* pow = pow(y, w) */
          ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
          ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
-- 
cgit v1.2.3


From 6b69e3c71741d99a54c6f4dcb605a3c241239aeb Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Thu, 23 Oct 2008 10:28:48 +0200
Subject: scons: ppc support.

---
 SConstruct                             | 2 ++
 common.py                              | 3 ++-
 scons/gallium.py                       | 1 +
 src/gallium/auxiliary/draw/SConscript  | 1 +
 src/gallium/auxiliary/rtasm/SConscript | 1 +
 src/gallium/auxiliary/tgsi/SConscript  | 1 +
 src/mesa/SConscript                    | 4 ++++
 7 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary')

diff --git a/SConstruct b/SConstruct
index c1dc624651..8c96817dae 100644
--- a/SConstruct
+++ b/SConstruct
@@ -70,12 +70,14 @@ platform = env['platform']
 
 # derived options
 x86 = machine == 'x86'
+ppc = machine == 'ppc'
 gcc = platform in ('linux', 'freebsd', 'darwin')
 msvc = platform in ('windows', 'winddk')
 
 Export([
 	'debug', 
 	'x86', 
+	'ppc', 
 	'dri', 
 	'llvm',
 	'platform',
diff --git a/common.py b/common.py
index dd64e0f434..cc2582f1a4 100644
--- a/common.py
+++ b/common.py
@@ -24,6 +24,7 @@ _machine_map = {
 	'i486': 'x86',
 	'i586': 'x86',
 	'i686': 'x86',
+	'ppc' : 'ppc',
 	'x86_64': 'x86_64',
 }
 if 'PROCESSOR_ARCHITECTURE' in os.environ:
@@ -56,7 +57,7 @@ def AddOptions(opts):
 	opts.Add(BoolOption('profile', 'profile build', 'no'))
 	#opts.Add(BoolOption('quiet', 'quiet command lines', 'no'))
 	opts.Add(EnumOption('machine', 'use machine-specific assembly code', default_machine,
-											 allowed_values=('generic', 'x86', 'x86_64')))
+											 allowed_values=('generic', 'ppc', 'x86', 'x86_64')))
 	opts.Add(EnumOption('platform', 'target platform', default_platform,
 											 allowed_values=('linux', 'cell', 'windows', 'winddk', 'wince')))
 	opts.Add(BoolOption('llvm', 'use LLVM', 'no'))
diff --git a/scons/gallium.py b/scons/gallium.py
index 3631607e66..2a42bdf2bb 100644
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -175,6 +175,7 @@ def generate(env):
     machine = env['machine']
     platform = env['platform']
     x86 = env['machine'] == 'x86'
+    ppc = env['machine'] == 'ppc'
     gcc = env['platform'] in ('linux', 'freebsd', 'darwin')
     msvc = env['platform'] in ('windows', 'winddk', 'wince')
 
diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript
index 544a04918b..5f05aa324a 100644
--- a/src/gallium/auxiliary/draw/SConscript
+++ b/src/gallium/auxiliary/draw/SConscript
@@ -38,6 +38,7 @@ draw = env.ConvenienceLibrary(
 		'draw_vs_aos_machine.c',
 		'draw_vs_exec.c',
 		'draw_vs_llvm.c',
+		'draw_vs_ppc.c',
 		'draw_vs_sse.c',
 		'draw_vs_varient.c'
 	])
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index 8ea25922aa..eb48368acc 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -6,6 +6,7 @@ rtasm = env.ConvenienceLibrary(
 		'rtasm_cpu.c',
 		'rtasm_execmem.c',
 		'rtasm_x86sse.c',
+		'rtasm_ppc.c',
 		'rtasm_ppc_spe.c',
 	])
 
diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript
index 45bf3f6d57..8200cce42f 100644
--- a/src/gallium/auxiliary/tgsi/SConscript
+++ b/src/gallium/auxiliary/tgsi/SConscript
@@ -12,6 +12,7 @@ tgsi = env.ConvenienceLibrary(
 		'tgsi_parse.c',
 		'tgsi_sanity.c',
 		'tgsi_scan.c',
+		'tgsi_ppc.c',
 		'tgsi_sse2.c',
 		'tgsi_text.c',
 		'tgsi_transform.c',
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index af8dfcb493..89b98b37ab 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -283,6 +283,10 @@ if env['platform'] != 'winddk':
 			'x86-64/glapi_x86-64.S'
 		]
 	elif gcc and env['machine'] == 'ppc':
+		env.Append(CPPDEFINES = [
+			'USE_PPC_ASM', 
+			'USE_VMX_ASM', 
+		])
 		mesa_sources += [
 			'ppc/common_ppc.c',
 		]
-- 
cgit v1.2.3