From 13f46fa1b9c3009395a0d7f30ebef127f5937451 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:44:24 +0100
Subject: draw: don't assume output buffer pointer is aligned (cherry picked
 from commit 23cc303994eb630c56b1224dfdac51dcea41ed03)

---
 src/gallium/auxiliary/draw/draw_vs_aos_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index dd79bc799a..39f75b50b7 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -338,7 +338,7 @@ static void emit_store_R32G32B32A32( struct aos_compilation *cp,
 				     struct x86_reg dst_ptr,
 				     struct x86_reg dataXMM )
 {
-   sse_movaps(cp->func, dst_ptr, dataXMM);
+   sse_movups(cp->func, dst_ptr, dataXMM);
 }
 
 static void emit_store_R32G32B32( struct aos_compilation *cp, 
-- 
cgit v1.2.3


From abcd28b0b3fb77d3f99da957faa94e21ed54cae6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 10:43:53 -0600
Subject: cell: need to flush draw module when constants change

---
 src/gallium/drivers/cell/ppu/cell_state_shader.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 54a17eaf2b..cda39f8d59 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -191,6 +191,8 @@ cell_set_constant_buffer(struct pipe_context *pipe,
    assert(shader < PIPE_SHADER_TYPES);
    assert(index == 0);
 
+   draw_flush(cell->draw);
+
    /* note: reference counting */
    winsys_buffer_reference(ws,
                         &cell->constants[shader].buffer,
-- 
cgit v1.2.3


From 4e506f422a13b20fcc95edb6c7048a9de6e32efa Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 10:53:48 -0600
Subject: cell: fix/add some fallback blend cases

---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 49 ++++++++++++++++++++--
 1 file changed, 46 insertions(+), 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index d252fa6dc1..9404704abf 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -40,6 +40,24 @@
 #define LINEAR_QUAD_LAYOUT 1
 
 
+static INLINE vector float
+spu_min(vector float a, vector float b)
+{
+   /* a_gt_b is ~0 in lanes where a > b; spu_sel takes b there, else a */
+   const vector unsigned int a_gt_b = spu_cmpgt(a, b);
+   return spu_sel(a, b, a_gt_b);
+}
+
+
+static INLINE vector float
+spu_max(vector float a, vector float b)
+{
+   /* a_gt_b is ~0 in lanes where a > b; spu_sel takes a there, else b */
+   const vector unsigned int a_gt_b = spu_cmpgt(a, b);
+   return spu_sel(b, a, a_gt_b);
+}
+
+
 /**
  * Called by rasterizer for each quad after the shader has run.  Do
  * all the per-fragment operations including alpha test, z test,
@@ -293,9 +311,9 @@ spu_fallback_fragment_ops(uint x, uint y,
        */
       switch (spu.blend.rgb_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
-         term2r = fragR;
-         term2g = fragG;
-         term2b = fragB;
+         term2r = fbRGBA[0];
+         term2g = fbRGBA[1];
+         term2b = fbRGBA[2];
          break;
       case PIPE_BLENDFACTOR_ZERO:
          term2r =
@@ -361,8 +379,24 @@ spu_fallback_fragment_ops(uint x, uint y,
          fragG = spu_sub(term1g, term2g);
          fragB = spu_sub(term1b, term2b);
          break;
+      case PIPE_BLEND_REVERSE_SUBTRACT:
+         fragR = spu_sub(term2r, term1r);
+         fragG = spu_sub(term2g, term1g);
+         fragB = spu_sub(term2b, term1b);
+         break;
+      case PIPE_BLEND_MIN:
+         fragR = spu_min(term1r, term2r);
+         fragG = spu_min(term1g, term2g);
+         fragB = spu_min(term1b, term2b);
+         break;
+      case PIPE_BLEND_MAX:
+         fragR = spu_max(term1r, term2r);
+         fragG = spu_max(term1g, term2g);
+         fragB = spu_max(term1b, term2b);
+         break;
       /* XXX more cases */
       default:
+         printf("unsup 0x%x\n", spu.blend.rgb_func);
          ASSERT(0);
       }
 
@@ -376,6 +410,15 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLEND_SUBTRACT:
          fragA = spu_sub(term1a, term2a);
          break;
+      case PIPE_BLEND_REVERSE_SUBTRACT:
+         fragA = spu_sub(term2a, term1a);
+         break;
+      case PIPE_BLEND_MIN:
+         fragA = spu_min(term1a, term2a);
+         break;
+      case PIPE_BLEND_MAX:
+         fragA = spu_max(term1a, term2a);
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
-- 
cgit v1.2.3


From f60c756ed14f25731ff2a52d6b695ceb5b7a6f6b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 10:54:06 -0600
Subject: cell: additional debug

---
 src/gallium/drivers/cell/spu/spu_command.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c28677ebf8..a07b312111 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -250,6 +250,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
 
    /* Expand each float to float[4] for SOA execution */
    for (i = 0; i < num_const; i++) {
+      DEBUG_PRINTF("  const[%u] = %f\n", i, constants[i]);
       spu.constants[i] = spu_splats(constants[i]);
    }
 
-- 
cgit v1.2.3


From 9382a7100fd6de6e615dc661ed813bf43e24ec15 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 10:54:36 -0600
Subject: cell: updated vertex dump/debug code

---
 src/gallium/drivers/cell/spu/spu_tri.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 03f094373d..2417db8960 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -404,11 +404,14 @@ flush_spans(void)
 static void
 print_vertex(const struct vertex_header *v)
 {
-   int i;
-   fprintf(stderr, "Vertex: (%p)\n", v);
-   for (i = 0; i < setup.quad.nr_attrs; i++) {
-      fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
-              v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
+   uint i;
+   fprintf(stderr, "  Vertex: (%p)\n", (const void *) v);
+   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+      fprintf(stderr, "    %u: %f %f %f %f\n",  i,
+              spu_extract(v->data[i], 0),
+              spu_extract(v->data[i], 1),
+              spu_extract(v->data[i], 2),
+              spu_extract(v->data[i], 3));
    }
 }
 #endif
@@ -420,10 +423,12 @@ setup_sort_vertices(const struct vertex_header *v0,
                     const struct vertex_header *v2)
 {
 #if DEBUG_VERTS
-   fprintf(stderr, "Triangle:\n");
-   print_vertex(v0);
-   print_vertex(v1);
-   print_vertex(v2);
+   if (spu.init.id==0) {
+      fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
+      print_vertex(v0);
+      print_vertex(v1);
+      print_vertex(v2);
+   }
 #endif
 
    setup.vprovoke = v2;
-- 
cgit v1.2.3


From a7f06dae20c173a0edbb1d310b5f6b06068a61b0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 10:37:49 -0600
Subject: gallium: temporarily revert softpipe shader optimization

---
 src/gallium/drivers/softpipe/sp_fs_exec.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 4fea71c314..89429c100e 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -102,7 +102,8 @@ exec_prepare( const struct sp_fragment_shader *base,
     * Bind tokens/shader to the interpreter's machine state.
     * Avoid redundant binding.
     */
-   if (spefs->machine_tokens != base->shader.tokens) {
+   /* XXX revisit this */
+   if (1 /* spefs->machine_tokens != base->shader.tokens*/) {
       tgsi_exec_machine_bind_shader( machine,
                                      base->shader.tokens,
                                      PIPE_MAX_SAMPLERS,
-- 
cgit v1.2.3


From 05a8f203cdea768466e5faf1dec4155e1e945c78 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 11:56:57 -0600
Subject: gallium: fix the test in vs_exec_prepare() to avoid redundant
 bindings

Fixes regressions seen in progs/samples/prim.c, progs/demos/ray.c
---
 src/gallium/auxiliary/draw/draw_vs_exec.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 79a19d6be2..13d4fcfdbf 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -46,7 +46,6 @@
 struct exec_vertex_shader {
    struct draw_vertex_shader base;
    struct tgsi_exec_machine *machine;
-   const struct tgsi_token *machine_tokens;
 };
 
 static struct exec_vertex_shader *exec_vertex_shader( struct draw_vertex_shader *vs )
@@ -66,12 +65,11 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
    /* Specify the vertex program to interpret/execute.
     * Avoid rebinding when possible.
     */
-   if (evs->machine_tokens != shader->state.tokens) {
+   if (evs->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(evs->machine,
                                     shader->state.tokens,
                                     PIPE_MAX_SAMPLERS,
                                     NULL /*samplers*/ );
-      evs->machine_tokens = shader->state.tokens;
    }
 }
 
-- 
cgit v1.2.3


From 5f76a77b319b4b66001dea4bcfccd0484aed82f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 11:59:34 -0600
Subject: gallium: fix the shader-rebind test in softpipe, as was done for the
 draw module.

---
 src/gallium/drivers/softpipe/sp_fs_exec.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 89429c100e..6280f0701d 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -42,7 +42,6 @@
 struct sp_exec_fragment_shader
 {
    struct sp_fragment_shader base;
-   const struct tgsi_token *machine_tokens;
 };
 
 
@@ -102,13 +101,11 @@ exec_prepare( const struct sp_fragment_shader *base,
     * Bind tokens/shader to the interpreter's machine state.
     * Avoid redundant binding.
     */
-   /* XXX revisit this */
-   if (1 /* spefs->machine_tokens != base->shader.tokens*/) {
+   if (machine->Tokens != base->shader.tokens) {
       tgsi_exec_machine_bind_shader( machine,
                                      base->shader.tokens,
                                      PIPE_MAX_SAMPLERS,
                                      samplers );
-      spefs->machine_tokens = base->shader.tokens;
    }
 }
 
-- 
cgit v1.2.3


From 53951531ae7bfd64afae1ae55aac7f6ebd3fe4f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 12:35:51 -0600
Subject: cell: propagate blend color to SPUs for the fallback fragment ops
 code

---
 src/gallium/drivers/cell/common.h                  |  4 ++
 src/gallium/drivers/cell/ppu/cell_context.h        |  1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  1 +
 src/gallium/drivers/cell/spu/spu_command.c         |  1 +
 src/gallium/drivers/cell/spu/spu_main.h            |  1 +
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 75 +++++++++++++++++++---
 6 files changed, 74 insertions(+), 9 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index b0169b8e32..3b5a25e165 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -118,12 +118,16 @@
 
 /**
  * Command to specify per-fragment operations state and generated code.
+ * Note that the dsa, blend, blend_color fields are really only needed
+ * for the fallback/C per-pixel code.  They're not used when we generate
+ * dynamic SPU fragment code (which is the normal case).
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 80a9b3d7e1..1fcf03c2b8 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -74,6 +74,7 @@ struct cell_fragment_shader_state
 struct cell_fragment_ops_key
 {
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    struct pipe_depth_stencil_alpha_state dsa;
    enum pipe_format color_format;
    enum pipe_format zs_format;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index bb694aa107..d2427584ba 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -52,6 +52,7 @@ lookup_fragment_ops(struct cell_context *cell)
     */
    memset(&key, 0, sizeof(key));
    key.blend = *cell->blend;
+   key.blend_color = cell->blend_color;
    key.dsa = *cell->depth_stencil;
 
    if (cell->framebuffer.cbufs[0])
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a07b312111..b521c3aecf 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -195,6 +195,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+   memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
 
    /* Parity twist!  For now, always use the fallback code by default,
     * only switching to codegen when specifically requested.  This
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index eff43b870c..ca72baea8b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -145,6 +145,7 @@ struct spu_global
    struct spu_framebuffer fb;
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
    struct vertex_info vertex_info;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 9404704abf..f8ffc70492 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -260,7 +260,7 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
 
       /*
-       * Compute Src RGB terms
+       * Compute Src RGB terms (fragment color * factor)
        */
       switch (spu.blend.rgb_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -283,13 +283,33 @@ spu_fallback_fragment_ops(uint x, uint y,
          term1g = spu_mul(fragG, fragA);
          term1b = spu_mul(fragB, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term1r = spu_mul(fragR, fbRGBA[0]);
+         term1g = spu_mul(fragG, fbRGBA[1]);
+         term1b = spu_mul(fragB, fbRGBA[2]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1r = spu_mul(fragR, fbRGBA[3]);
+         term1g = spu_mul(fragG, fbRGBA[3]);
+         term1b = spu_mul(fragB, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Src Alpha term
+       * Compute Src Alpha term (fragment alpha * factor)
        */
       switch (spu.blend.alpha_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -301,13 +321,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLENDFACTOR_SRC_ALPHA:
          term1a = spu_mul(fragA, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1a = spu_mul(fragA, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* alpha term: CONST_COLOR's alpha component == CONST_ALPHA; fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1a = spu_mul(fragA, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest RGB terms
+       * Compute Dest RGB terms (framebuffer color * factor)
        */
       switch (spu.blend.rgb_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -337,17 +367,37 @@ spu_fallback_fragment_ops(uint x, uint y,
          term2g = spu_mul(fbRGBA[1], tmp);
          term2b = spu_mul(fbRGBA[2], tmp);
          break;
-      /* XXX more cases */
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[0]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[1]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[2]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[3]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[3]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3]));
+         break;
+       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest Alpha term
+       * Compute Dest Alpha term (framebuffer alpha * factor)
        */
       switch (spu.blend.alpha_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
-         term2a = fragA;
+         term2a = fbRGBA[3];
          break;
       case PIPE_BLENDFACTOR_SRC_COLOR:
          term2a = spu_splats(0.0f);
@@ -360,6 +410,16 @@ spu_fallback_fragment_ops(uint x, uint y,
          tmp = spu_sub(one, fragA);
          term2a = spu_mul(fbRGBA[3], tmp);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
@@ -394,9 +454,7 @@ spu_fallback_fragment_ops(uint x, uint y,
          fragG = spu_max(term1g, term2g);
          fragB = spu_max(term1b, term2b);
          break;
-      /* XXX more cases */
       default:
-         printf("unsup 0x%x\n", spu.blend.rgb_func);
          ASSERT(0);
       }
 
@@ -419,7 +477,6 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLEND_MAX:
          fragA = spu_max(term1a, term2a);
          break;
-      /* XXX more cases */
       default:
          ASSERT(0);
       }
-- 
cgit v1.2.3


From ddeec1ed10d6c12403fe8d30c072ea68f044db99 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 13:55:18 -0600
Subject: cell: simplify spu debug code

---
 src/gallium/drivers/cell/common.h           |  1 +
 src/gallium/drivers/cell/ppu/cell_context.c |  1 +
 src/gallium/drivers/cell/spu/spu_command.c  | 47 +++++++++++++----------------
 src/gallium/drivers/cell/spu/spu_debug.h    |  9 ------
 src/gallium/drivers/cell/spu/spu_main.c     |  9 +-----
 src/gallium/drivers/cell/spu/spu_main.h     | 15 +++++++--
 src/gallium/drivers/cell/spu/spu_render.c   |  7 +++--
 7 files changed, 41 insertions(+), 48 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 3b5a25e165..8ae78265f2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -111,6 +111,7 @@
 #define CELL_DEBUG_SYNC                 (1 << 2)
 #define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
 #define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
+#define CELL_DEBUG_CMD                  (1 << 5)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index b66aa9c9d9..f8d5eef3ac 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -93,6 +93,7 @@ static const struct debug_named_value cell_debug_flags[] = {
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
    {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
    {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
+   {"cmd", CELL_DEBUG_CMD},       /**< SPUs dump command buffer info */
    {NULL, 0}
 };
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index b521c3aecf..ebbed3d1dc 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -44,7 +44,6 @@
 #include "spu_tile.h"
 #include "spu_vertex_shader.h"
 #include "spu_dcache.h"
-#include "spu_debug.h"
 #include "cell/common.h"
 
 
@@ -97,7 +96,7 @@ release_buffer(uint buffer)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
@@ -165,14 +164,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
 #endif /* CLEAR_OPT */
 
-   DEBUG_PRINTF("CLEAR SURF done\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n");
 }
 
 
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
-   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+   D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf);
    ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
@@ -189,7 +188,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
    static int warned = 0;
 
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
@@ -229,7 +228,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 {
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_program_code, fp->code,
           SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
@@ -247,11 +246,11 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
    const float *constants = (const float *) &buffer[pos + 2];
    uint i;
 
-   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
 
    /* Expand each float to float[4] for SOA execution */
    for (i = 0; i < num_const; i++) {
-      DEBUG_PRINTF("  const[%u] = %f\n", i, constants[i]);
+      D_PRINTF(CELL_DEBUG_CMD, "  const[%u] = %f\n", i, constants[i]);
       spu.constants[i] = spu_splats(constants[i]);
    }
 
@@ -263,7 +262,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
-   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+   D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
              cmd->width,
              cmd->height,
              cmd->color_start,
@@ -352,7 +351,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
    uint unit = sampler->unit;
 
-   DEBUG_PRINTF("SAMPLER [%u]\n", unit);
+   D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit);
 
    spu.sampler[unit] = sampler->state;
 
@@ -404,9 +403,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint unit = texture->unit;
    uint i;
 
-   //if (spu.init.id==0) Debug=1;
-
-   DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
+   D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit);
 
    spu.texture[unit].max_level = 0;
    spu.texture[unit].target = texture->target;
@@ -416,7 +413,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
       uint height = texture->height[i];
       uint depth = texture->depth[i];
 
-      DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
+      D_PRINTF(CELL_DEBUG_CMD, "  LEVEL %u: at %p  size[0] %u x %u\n", i,
              texture->start[i], texture->width[i], texture->height[i]);
 
       spu.texture[unit].level[i].start = texture->start[i];
@@ -438,15 +435,13 @@ cmd_state_texture(const struct cell_command_texture *texture)
    }
 
    update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
-
-   //Debug=0;
 }
 
 
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
    ASSERT(vinfo->num_attribs >= 1);
    ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
@@ -485,7 +480,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
 static void
 cmd_finish(void)
 {
-   DEBUG_PRINTF("FINISH\n");
+   D_PRINTF(CELL_DEBUG_CMD, "FINISH\n");
    really_clear_tiles(0);
    /* wait for all outstanding DMAs to finish */
    mfc_write_tag_mask(~0);
@@ -510,7 +505,7 @@ cmd_batch(uint opcode)
    const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
-   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
              buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
@@ -530,7 +525,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   DEBUG_PRINTF("release batch buf %u\n", buf);
+   D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf);
    release_buffer(buf);
 
    /*
@@ -663,7 +658,7 @@ cmd_batch(uint opcode)
       }
    }
 
-   DEBUG_PRINTF("BATCH complete\n");
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n");
 }
 
 
@@ -677,7 +672,7 @@ command_loop(void)
    struct cell_command cmd;
    int exitFlag = 0;
 
-   DEBUG_PRINTF("Enter command loop\n");
+   D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
    ASSERT((sizeof(struct cell_command) & 0xf) == 0);
    ASSERT_ALIGN16(&cmd);
@@ -686,12 +681,12 @@ command_loop(void)
       unsigned opcode;
       int tag = 0;
 
-      DEBUG_PRINTF("Wait for cmd...\n");
+      D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
 
-      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
+      D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
       /* command payload */
       mfc_get(&cmd,  /* dest */
@@ -708,7 +703,7 @@ command_loop(void)
 
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
-         DEBUG_PRINTF("EXIT\n");
+         D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
@@ -725,7 +720,7 @@ command_loop(void)
 
    }
 
-   DEBUG_PRINTF("Exit command loop\n");
+   D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
 
    spu_dcache_report();
 }
diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
index eeec052655..25653dcdcd 100644
--- a/src/gallium/drivers/cell/spu/spu_debug.h
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -30,28 +30,19 @@
 #define SPU_DEBUG_H
 
 
-/* Set to 0 to disable all extraneous debugging code */
-#define DEBUG 1
-
 #if DEBUG
-extern boolean Debug;
-extern boolean force_fragment_ops_fallback;
 
 /* These debug macros use the unusual construction ", ##__VA_ARGS__"
  * which expands to the expected comma + args if variadic arguments
  * are supplied, but swallows the comma if there are no variadic
  * arguments (which avoids syntax errors that would otherwise occur).
  */
-#define DEBUG_PRINTF(format,...) \
-   if (Debug) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
 #define D_PRINTF(flag, format,...) \
    if (spu.init.debug_flags & (flag)) \
       printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
 
 #else
 
-#define DEBUG_PRINTF(...)
 #define D_PRINTF(...)
 
 #endif
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 4becd0f92a..c8bb251905 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -40,7 +40,6 @@
 #include "spu_per_fragment_op.h"
 #include "spu_texture.h"
 //#include "spu_test.h"
-#include "spu_debug.h"
 #include "cell/common.h"
 
 
@@ -53,12 +52,6 @@ helpful headers:
 struct spu_global spu;
 
 
-#if DEBUG
-boolean Debug = FALSE;
-boolean force_fragment_ops_fallback = TRUE;
-#endif
-
-
 static void
 one_time_init(void)
 {
@@ -102,7 +95,7 @@ main(main_param_t speid, main_param_t argp)
 
    one_time_init();
 
-   DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
    /* get initialization data */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index ca72baea8b..569b9e45d4 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -36,6 +36,19 @@
 #include "pipe/p_state.h"
 
 
+#if DEBUG
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#else
+#define D_PRINTF(...)
+#endif
+
 
 #define MAX_WIDTH 1024
 #define MAX_HEIGHT 1024
@@ -187,8 +200,6 @@ struct spu_global
 
 
 extern struct spu_global spu;
-extern boolean Debug;
-
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 82dbeb26b7..cfff19b6c0 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -177,7 +177,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    uint i, j;
 
 
-   if (Debug) {
+#if 0
       printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
              "inline_vert=%u\n",
              spu.init.id,
@@ -190,7 +190,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
-   }
+#endif
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
@@ -293,7 +293,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-   if (Debug)
+#if 0
       printf("SPU %u: RENDER done\n",
              spu.init.id);
+#endif
 }
-- 
cgit v1.2.3


From 708f046c215d070e82f40eee895a8d312b1a64c7 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 13:56:00 -0600
Subject: cell: remove obsolete spu_debug.h file

---
 src/gallium/drivers/cell/spu/spu_debug.h | 51 --------------------------------
 1 file changed, 51 deletions(-)
 delete mode 100644 src/gallium/drivers/cell/spu/spu_debug.h

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
deleted file mode 100644
index 25653dcdcd..0000000000
--- a/src/gallium/drivers/cell/spu/spu_debug.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef SPU_DEBUG_H
-#define SPU_DEBUG_H
-
-
-#if DEBUG
-
-/* These debug macros use the unusual construction ", ##__VA_ARGS__"
- * which expands to the expected comma + args if variadic arguments
- * are supplied, but swallows the comma if there are no variadic
- * arguments (which avoids syntax errors that would otherwise occur).
- */
-#define D_PRINTF(flag, format,...) \
-   if (spu.init.debug_flags & (flag)) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
-
-#else
-
-#define D_PRINTF(...)
-
-#endif
-
-
-#endif /* SPU_DEBUG_H */
-- 
cgit v1.2.3


From 79e96b3a77f7d5c7136b380abcc675c7242d0ffe Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 13:58:58 -0600
Subject: cell: move some CELL_MAX constants

---
 src/gallium/drivers/cell/common.h       |  6 +++++-
 src/gallium/drivers/cell/spu/spu_main.h | 11 ++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 8ae78265f2..d716a26175 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -68,6 +68,9 @@
 
 #define CELL_MAX_SAMPLERS 4
 #define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+#define CELL_MAX_WIDTH 1024    /**< max framebuffer width */
+#define CELL_MAX_HEIGHT 1024   /**< max framebuffer height */
 
 #define TILE_SIZE 32
 
@@ -99,13 +102,14 @@
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
-
+/** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
 #define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */
 
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
 
+/** Debug flags */
 #define CELL_DEBUG_CHECKER              (1 << 0)
 #define CELL_DEBUG_ASM                  (1 << 1)
 #define CELL_DEBUG_SYNC                 (1 << 2)
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 569b9e45d4..f87495b72d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -50,13 +50,6 @@
 #endif
 
 
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
-
-
-#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
-
-
 /**
  * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
  * The data may be addressed through several different types.
@@ -175,8 +168,8 @@ struct spu_global
    ubyte cur_ctile_status, cur_ztile_status;
 
    /** Status of all tiles in framebuffer */
-   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
    uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-- 
cgit v1.2.3


From 0eb0b0a816764a323af7a8d2b5cb6792f886ce04 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:12:55 -0600
Subject: cell: remove some old, pre-batchbuffer stuff

---
 src/gallium/drivers/cell/common.h          | 14 --------------
 src/gallium/drivers/cell/ppu/cell_spu.c    |  5 +----
 src/gallium/drivers/cell/ppu/cell_spu.h    |  3 +--
 src/gallium/drivers/cell/spu/spu_command.c | 19 -------------------
 4 files changed, 2 insertions(+), 39 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d716a26175..600f1b37a2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -269,19 +269,6 @@ struct cell_command_texture
 };
 
 
-/** XXX unions don't seem to work */
-/* XXX this should go away; all commands should be placed in batch buffers */
-struct cell_command
-{
-#if 0
-   struct cell_command_framebuffer fb;
-   struct cell_command_clear_surface clear;
-   struct cell_command_render render;
-#endif
-   struct cell_command_vs vs;
-} ALIGN16_ATTRIB;
-
-
 #define MAX_SPU_FUNCTIONS 12
 /**
  * Used to tell the PPU about the address of particular functions in the
@@ -302,7 +289,6 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
-   struct cell_command *cmd;
 
    /** Buffers for command batches, vertex/index data */
    ubyte *buffers[CELL_NUM_BUFFERS];
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index df020c4146..90745da3d2 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -126,9 +126,6 @@ cell_start_spus(struct cell_context *cell)
 
    assert(cell->num_spus <= MAX_SPUS);
 
-   ASSERT_ALIGN16(&cell_global.command[0]);
-   ASSERT_ALIGN16(&cell_global.command[1]);
-
    ASSERT_ALIGN16(&cell_global.inits[0]);
    ASSERT_ALIGN16(&cell_global.inits[1]);
 
@@ -141,7 +138,7 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].debug_flags = cell->debug_flags;
-      cell_global.inits[i].cmd = &cell_global.command[i];
+
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
       }
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 137f26612e..3443331b01 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -50,10 +50,9 @@ struct cell_global_info
    pthread_t spe_threads[MAX_SPUS];
 
    /**
-    * Data sent to SPUs
+    * Data sent to SPUs at start-up
     */
    struct cell_init_info inits[MAX_SPUS];
-   struct cell_command command[MAX_SPUS];
 };
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ebbed3d1dc..4febd5385b 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -669,38 +669,19 @@ cmd_batch(uint opcode)
 void
 command_loop(void)
 {
-   struct cell_command cmd;
    int exitFlag = 0;
 
    D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
-   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
-   ASSERT_ALIGN16(&cmd);
-
    while (!exitFlag) {
       unsigned opcode;
-      int tag = 0;
 
       D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
-
       D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
-      /* command payload */
-      mfc_get(&cmd,  /* dest */
-              (unsigned int) spu.init.cmd, /* src */
-              sizeof(struct cell_command), /* bytes */
-              tag,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask( 1 << tag );
-
-      /*
-       * NOTE: most commands should be contained in a batch buffer
-       */
-
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
-- 
cgit v1.2.3


From 67f615681c569264eab1bc901473c86cfc54e480 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:18:51 -0600
Subject: cell: use CELL_MAX_SPUS consistently.

---
 src/gallium/drivers/cell/common.h       | 2 +-
 src/gallium/drivers/cell/ppu/cell_spu.c | 2 +-
 src/gallium/drivers/cell/ppu/cell_spu.h | 8 +++-----
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 600f1b37a2..1f6f2d494b 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -64,7 +64,7 @@
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
 
-#define CELL_MAX_SPUS 6
+#define CELL_MAX_SPUS 8
 
 #define CELL_MAX_SAMPLERS 4
 #define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 90745da3d2..a6e268b362 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -124,7 +124,7 @@ cell_start_spus(struct cell_context *cell)
 
    one_time_init = TRUE;
 
-   assert(cell->num_spus <= MAX_SPUS);
+   assert(cell->num_spus <= CELL_MAX_SPUS);
 
    ASSERT_ALIGN16(&cell_global.inits[0]);
    ASSERT_ALIGN16(&cell_global.inits[1]);
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 3443331b01..2e965c6301 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -36,8 +36,6 @@
 #include "cell_context.h"
 
 
-#define MAX_SPUS 8
-
 /**
  * Global vars, for now anyway.
  */
@@ -46,13 +44,13 @@ struct cell_global_info
    /**
     * SPU/SPE handles, etc
     */
-   spe_context_ptr_t spe_contexts[MAX_SPUS];
-   pthread_t spe_threads[MAX_SPUS];
+   spe_context_ptr_t spe_contexts[CELL_MAX_SPUS];
+   pthread_t spe_threads[CELL_MAX_SPUS];
 
    /**
     * Data sent to SPUs at start-up
     */
-   struct cell_init_info inits[MAX_SPUS];
+   struct cell_init_info inits[CELL_MAX_SPUS];
 };
 
 
-- 
cgit v1.2.3


From 8bcbefb370ef8d0a6751636a28cd12b3e9cde7dc Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:20:10 -0600
Subject: cell: query number of SPUs with spe_cpu_info_get()

---
 src/gallium/drivers/cell/ppu/cell_context.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index f8d5eef3ac..358aa338fe 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -153,10 +153,11 @@ cell_create_context(struct pipe_screen *screen,
    /*
     * SPU stuff
     */
-   cell->num_spus = 6;
-   /* XXX is this in SDK 3.0 only?
+   /* This call only works with SDK 3.0.  Anyone still using 2.1??? */
    cell->num_spus = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);
-   */
+   if (cell->debug_flags) {
+      printf("PPU: found %u SPUs\n", cell->num_spus);
+   }
 
    cell_start_spus(cell);
 
-- 
cgit v1.2.3


From 8bf105997748ba268eb65b39461e379fe6642c5a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:26:08 -0600
Subject: cell: query # cells too

---
 src/gallium/drivers/cell/ppu/cell_context.c | 6 ++++--
 src/gallium/drivers/cell/ppu/cell_context.h | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 358aa338fe..097dbcfdc8 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -154,9 +154,11 @@ cell_create_context(struct pipe_screen *screen,
     * SPU stuff
     */
    /* This call only works with SDK 3.0.  Anyone still using 2.1??? */
-   cell->num_spus = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);
+   cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1);
+   cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0);
    if (cell->debug_flags) {
-      printf("PPU: found %u SPUs\n", cell->num_spus);
+      printf("Cell: found %d Cell(s) with %u SPUs\n",
+             cell->num_cells, cell->num_spus);
    }
 
    cell_start_spus(cell);
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 1fcf03c2b8..a592e728c8 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -140,7 +140,7 @@ struct cell_context
 
    struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
 
-   uint num_spus;
+   uint num_cells, num_spus;
 
    /** Buffers for command batches, vertex/index data */
    uint buffer_size[CELL_NUM_BUFFERS];
-- 
cgit v1.2.3


From ec7d6c656178babdf143faa242f7a3df9d0bc22c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:39:16 -0600
Subject: cell: send rasterizer state to SPUs in proper way, remove
 front_winding hack

---
 src/gallium/drivers/cell/common.h              | 18 ++++++++++++++----
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  7 +++++++
 src/gallium/drivers/cell/ppu/cell_vbuf.c       |  1 -
 src/gallium/drivers/cell/spu/spu_command.c     |  8 ++++++++
 src/gallium/drivers/cell/spu/spu_main.h        |  1 +
 src/gallium/drivers/cell/spu/spu_render.c      |  2 +-
 src/gallium/drivers/cell/spu/spu_tri.c         |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.h         |  2 +-
 8 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 1f6f2d494b..0ff2c491fb 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -99,8 +99,9 @@
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
 #define CELL_CMD_STATE_FS_CONSTANTS  21
-#define CELL_CMD_VS_EXECUTE          22
-#define CELL_CMD_FLUSH_BUFFER_RANGE  23
+#define CELL_CMD_STATE_RASTERIZER    22
+#define CELL_CMD_VS_EXECUTE          23
+#define CELL_CMD_FLUSH_BUFFER_RANGE  24
 
 /** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
@@ -156,13 +157,23 @@ struct cell_command_fragment_program
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
+   uint64_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
 };
 
 
+/**
+ * Tell SPUs about rasterizer state.
+ */
+struct cell_command_rasterizer
+{
+   uint64_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
+   struct pipe_rasterizer_state rasterizer;
+};
+
+
 /**
  * Clear framebuffer to the given value/color.
  */
@@ -238,7 +249,6 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
-   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index d2427584ba..e6387382f2 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -147,6 +147,13 @@ cell_emit_state(struct cell_context *cell)
 #endif
    }
 
+   if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+      struct cell_command_rasterizer *rast =
+         cell_batch_alloc(cell, sizeof(*rast));
+      rast->opcode = CELL_CMD_STATE_RASTERIZER;
+      rast->rasterizer = *cell->rasterizer;
+   }
+
    if (cell->dirty & (CELL_NEW_FS)) {
       /* Send new fragment program to SPUs */
       struct cell_command_fragment_program *fp
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index 578ddf62dc..aa63435b93 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,7 +214,6 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
-      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 4febd5385b..d2c282a022 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -583,6 +583,14 @@ cmd_batch(uint opcode)
       case CELL_CMD_STATE_FS_CONSTANTS:
          pos = cmd_state_fs_constants(buffer, pos);
          break;
+      case CELL_CMD_STATE_RASTERIZER:
+         {
+            struct cell_command_rasterizer *rast =
+               (struct cell_command_rasterizer *) &buffer[pos];
+            spu.rasterizer = rast->rasterizer;
+            pos += sizeof(*rast) / 8;
+         }
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index f87495b72d..4099e52699 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -153,6 +153,7 @@ struct spu_global
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_rasterizer_state rasterizer;
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
    struct vertex_info vertex_info;
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index cfff19b6c0..75a7f75abc 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
+         drawn += tri_draw(v0, v1, v2, tx, ty);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 2417db8960..1519b8cd7e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -775,7 +775,7 @@ determinant(const float *v0, const float *v1, const float *v2)
  */
 boolean
 tri_draw(const float *v0, const float *v1, const float *v2,
-         uint tx, uint ty, uint front_winding)
+         uint tx, uint ty)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -790,7 +790,7 @@ tri_draw(const float *v0, const float *v1, const float *v2,
     * which will be needed for front/back-face stencil application
     */
    float det = determinant(v0, v1, v2);
-   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+   setup.facing = (det > 0.0) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index abc3d35160..aa694dd7c9 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
 
 
 #endif /* SPU_TRI_H */
-- 
cgit v1.2.3


From 30d3b581124a9fa5fbc7aa8404f717c5c2a6ab15 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 15:20:09 -0600
Subject: cell: simplify triangle front/back face determination

---
 src/gallium/drivers/cell/spu/spu_tri.c | 69 ++++++++++++----------------------
 1 file changed, 23 insertions(+), 46 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 1519b8cd7e..bd7547353d 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -116,7 +116,7 @@ struct setup_stage {
    struct edge etop;
    struct edge emaj;
 
-   float oneOverArea;
+   float oneOverArea;  /* XXX maybe make into vector? */
 
    uint facing;
 
@@ -417,11 +417,19 @@ print_vertex(const struct vertex_header *v)
 #endif
 
 
+/**
+ * Sort vertices from top to bottom.
+ * Compute area and determine front vs. back facing.
+ * Do coarse clip test against tile bounds
+ * \return  FALSE if tri is totally outside tile, TRUE otherwise
+ */
 static boolean
 setup_sort_vertices(const struct vertex_header *v0,
                     const struct vertex_header *v1,
                     const struct vertex_header *v2)
 {
+   float area, sign;
+
 #if DEBUG_VERTS
    if (spu.init.id==0) {
       fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
@@ -431,8 +439,6 @@ setup_sort_vertices(const struct vertex_header *v0,
    }
 #endif
 
-   setup.vprovoke = v2;
-
    /* determine bottom to top order of vertices */
    {
       float y0 = spu_extract(v0->data[0], 1);
@@ -444,18 +450,21 @@ setup_sort_vertices(const struct vertex_header *v0,
 	    setup.vmin = v0;   
 	    setup.vmid = v1;   
 	    setup.vmax = v2;
+            sign = -1.0f;
 	 }
 	 else if (y2 <= y0) {
 	    /* y2<=y0<=y1 */
 	    setup.vmin = v2;   
 	    setup.vmid = v0;   
 	    setup.vmax = v1;   
+            sign = -1.0f;
 	 }
 	 else {
 	    /* y0<=y2<=y1 */
 	    setup.vmin = v0;   
 	    setup.vmid = v2;   
 	    setup.vmax = v1;  
+            sign = 1.0f;
 	 }
       }
       else {
@@ -464,18 +473,21 @@ setup_sort_vertices(const struct vertex_header *v0,
 	    setup.vmin = v1;   
 	    setup.vmid = v0;   
 	    setup.vmax = v2;  
+            sign = 1.0f;
 	 }
 	 else if (y2 <= y1) {
 	    /* y2<=y1<=y0 */
 	    setup.vmin = v2;   
 	    setup.vmid = v1;   
 	    setup.vmax = v0;  
+            sign = 1.0f;
 	 }
 	 else {
 	    /* y1<=y2<=y0 */
 	    setup.vmin = v1;   
 	    setup.vmid = v2;   
 	    setup.vmax = v0;
+            sign = -1.0f;
 	 }
       }
    }
@@ -504,31 +516,16 @@ setup_sort_vertices(const struct vertex_header *v0,
    /*
     * Compute triangle's area.  Use 1/area to compute partial
     * derivatives of attributes later.
-    *
-    * The area will be the same as prim->det, but the sign may be
-    * different depending on how the vertices get sorted above.
-    *
-    * To determine whether the primitive is front or back facing we
-    * use the prim->det value because its sign is correct.
     */
-   {
-      const float area = (setup.emaj.dx * setup.ebot.dy -
-                          setup.ebot.dx * setup.emaj.dy);
-
-      setup.oneOverArea = 1.0f / area;
-      /*
-      _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup.oneOverArea, area, prim->det );
-      */
-   }
+   area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
 
-#if 0
-   /* We need to know if this is a front or back-facing triangle for:
-    *  - the GLSL gl_FrontFacing fragment attribute (bool)
-    *  - two-sided stencil test
-    */
-   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
-#endif
+   setup.oneOverArea = 1.0f / area;
+
+   /* The product of area * sign indicates front/back orientation (0/1) */
+   setup.facing = (area * sign > 0.0f)
+      ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
+
+   setup.vprovoke = v2;
 
    return TRUE;
 }
@@ -755,20 +752,6 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 }
 
 
-static float
-determinant(const float *v0, const float *v1, const float *v2)
-{
-   /* edge vectors e = v0 - v2, f = v1 - v2 */
-   const float ex = v0[0] - v2[0];
-   const float ey = v0[1] - v2[1];
-   const float fx = v1[0] - v2[0];
-   const float fy = v1[1] - v2[1];
-
-   /* det = cross(e,f).z */
-   return ex * fy - ey * fx;
-}
-
-
 /**
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
@@ -786,12 +769,6 @@ tri_draw(const float *v0, const float *v1, const float *v2,
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
-   /* Before we sort vertices, determine the facing of the triangle,
-    * which will be needed for front/back-face stencil application
-    */
-   float det = determinant(v0, v1, v2);
-   setup.facing = (det > 0.0) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
-
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
-- 
cgit v1.2.3


From 224c19a758466cdfb821e1a40db4928311278e90 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 15:34:02 -0600
Subject: cell: get rid of last usage of float4 union/typedef

Results in slightly tighter code.
---
 src/gallium/drivers/cell/spu/spu_tri.c | 63 ++++++++++++++++------------------
 1 file changed, 29 insertions(+), 34 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index bd7547353d..d83085d0f9 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -43,11 +43,6 @@
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
 
-typedef union
-{
-   vector float v;
-   float f[4];
-} float4;
 
 
 /**
@@ -91,9 +86,9 @@ struct edge {
 
 struct interp_coef
 {
-   float4 a0;
-   float4 dadx;
-   float4 dady;
+   vector float a0;
+   vector float dadx;
+   vector float dady;
 };
 
 
@@ -152,14 +147,14 @@ eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
       result[QUAD_TOP_LEFT] =
       result[QUAD_TOP_RIGHT] =
       result[QUAD_BOTTOM_LEFT] =
-      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
       break;
    case INTERP_LINEAR:
       {
-         vector float dadx = setup.coef[slot].dadx.v;
-         vector float dady = setup.coef[slot].dady.v;
+         vector float dadx = setup.coef[slot].dadx;
+         vector float dady = setup.coef[slot].dady;
          vector float topLeft =
-            spu_add(setup.coef[slot].a0.v,
+            spu_add(setup.coef[slot].a0,
                     spu_add(spu_mul(spu_splats(x), dadx),
                             spu_mul(spu_splats(y), dady)));
 
@@ -171,10 +166,10 @@ eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
       break;
    case INTERP_PERSPECTIVE:
       {
-         vector float dadx = setup.coef[slot].dadx.v;
-         vector float dady = setup.coef[slot].dady.v;
+         vector float dadx = setup.coef[slot].dadx;
+         vector float dady = setup.coef[slot].dady;
          vector float topLeft =
-            spu_add(setup.coef[slot].a0.v,
+            spu_add(setup.coef[slot].a0,
                     spu_add(spu_mul(spu_splats(x), dadx),
                             spu_mul(spu_splats(y), dady)));
 
@@ -212,9 +207,9 @@ static INLINE vector float
 eval_z(float x, float y)
 {
    const uint slot = 0;
-   const float dzdx = setup.coef[slot].dadx.f[2];
-   const float dzdy = setup.coef[slot].dady.f[2];
-   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+   const float dzdx = spu_extract(setup.coef[slot].dadx, 2);
+   const float dzdy = spu_extract(setup.coef[slot].dady, 2);
+   const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy;
    const vector float topLeftv = spu_splats(topLeft);
    const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
    return spu_add(topLeftv, derivs);
@@ -226,9 +221,9 @@ static INLINE vector float
 eval_w(float x, float y)
 {
    const uint slot = 0;
-   const float dwdx = setup.coef[slot].dadx.f[3];
-   const float dwdy = setup.coef[slot].dady.f[3];
-   const float topLeft = setup.coef[slot].a0.f[3] + x * dwdx + y * dwdy;
+   const float dwdx = spu_extract(setup.coef[slot].dadx, 3);
+   const float dwdy = spu_extract(setup.coef[slot].dady, 3);
+   const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy;
    const vector float topLeftv = spu_splats(topLeft);
    const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
    return spu_add(topLeftv, derivs);
@@ -540,9 +535,9 @@ setup_sort_vertices(const struct vertex_header *v0,
 static INLINE void
 const_coeff4(uint slot)
 {
-   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
+   setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0 = setup.vprovoke->data[slot];
 }
 
 
@@ -566,13 +561,13 @@ tri_linear_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 
-   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
-   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
                          
-   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 }
 
 
@@ -610,13 +605,13 @@ tri_persp_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 
-   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
-   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
                          
-   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 }
 
 
-- 
cgit v1.2.3


From 1c915b14a545ffb10cc1c98cc69f997b6471617f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 19:40:51 -0600
Subject: cell: updated debug code

---
 src/gallium/drivers/cell/spu/spu_render.c | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 75a7f75abc..802455bf79 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -176,21 +176,12 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    const ushort *indexes;
    uint i, j;
 
-
-#if 0
-      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u\n",
-             spu.init.id,
-             render->prim_type,
-             render->num_verts,
-             render->num_indexes,
-             render->inline_verts);
-
-      /*
-      printf("       bound: %g, %g .. %g, %g\n",
-             render->xmin, render->ymin, render->xmax, render->ymax);
-      */
-#endif
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
+            render->prim_type,
+            render->num_verts,
+            render->num_indexes,
+            render->inline_verts);
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
@@ -293,8 +284,5 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-#if 0
-      printf("SPU %u: RENDER done\n",
-             spu.init.id);
-#endif
+   D_PRINTF(CELL_DEBUG_CMD, "RENDER done\n");
 }
-- 
cgit v1.2.3


From 0116ee1d1c341726b6ed23c2dddc4515e8a34385 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 20:46:43 -0600
Subject: cell: start some performance measurements

Use the spu_write_decrementer() and spu_read_decrementer() functions to
measure time.  Convert to milliseconds according to the system timebase value.
---
 src/gallium/drivers/cell/common.h          |  1 +
 src/gallium/drivers/cell/ppu/cell_spu.c    | 31 ++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_command.c | 15 +++++++++++++++
 src/gallium/drivers/cell/spu/spu_render.c  |  9 ++++++++-
 4 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 0ff2c491fb..469d56cda8 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -299,6 +299,7 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
+   float inv_timebase;    /**< 1000.0/timebase (ticks to ms), for perf measurement */
 
    /** Buffers for command batches, vertex/index data */
    ubyte *buffers[CELL_NUM_BUFFERS];
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index a6e268b362..28e5e6d706 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -52,6 +52,35 @@ helpful headers:
 struct cell_global_info cell_global;
 
 
+/**
+ * Scan /proc/cpuinfo to determine the timebase for the system.
+ * The SPUs use this to convert 'decrementer' ticks to milliseconds.
+ * Returns 0 if the file can't be opened or has no "timebase" entry.
+ */
+static unsigned
+get_timebase(void)
+{
+   FILE *f = fopen("/proc/cpuinfo", "r");
+   unsigned timebase = 0;
+   char line[80];
+
+   if (!f)
+      return 0;
+   /* Test fgets() itself: feof() is only true after a read has failed. */
+   while (fgets(line, sizeof(line), f)) {
+      if (strncmp(line, "timebase", 8) == 0) {
+         const char *colon = strchr(line, ':');
+         if (colon) {
+            timebase = (unsigned) atoi(colon + 1);  /* atoi skips blanks */
+            break;
+         }
+      }
+   }
+   fclose(f);
+   return timebase;
+}
+
+
 /**
  * Write a 1-word message to the given SPE mailbox.
  */
@@ -115,6 +144,7 @@ cell_start_spus(struct cell_context *cell)
 {
    static boolean one_time_init = FALSE;
    uint i, j;
+   uint timebase = get_timebase();
 
    if (one_time_init) {
       fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
@@ -138,6 +168,7 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].debug_flags = cell->debug_flags;
+      cell_global.inits[i].inv_timebase = 1000.0f / timebase;
 
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d2c282a022..57d265fef7 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -670,6 +670,8 @@ cmd_batch(uint opcode)
 }
 
 
+#define PERF 0
+
 
 /**
  * Main loop for SPEs: Get a command, execute it, repeat.
@@ -678,6 +680,7 @@ void
 command_loop(void)
 {
    int exitFlag = 0;
+   uint t0, t1;
 
    D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
@@ -686,10 +689,16 @@ command_loop(void)
 
       D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
+      if (PERF)
+         spu_write_decrementer(~0);
+
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
       D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
+      if (PERF)
+         t0 = spu_read_decrementer();
+
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
@@ -707,6 +716,12 @@ command_loop(void)
          printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
       }
 
+      if (PERF) {
+         t1 = spu_read_decrementer();
+         printf("wait mbox time: %gms   batch time: %gms\n",
+                (~0u - t0) * spu.init.inv_timebase,
+                (t0 - t1) * spu.init.inv_timebase);
+      }
    }
 
    D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 802455bf79..5515bb55c9 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -175,6 +175,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
+   uint num_tiles;
 
    D_PRINTF(CELL_DEBUG_CMD,
             "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
@@ -242,6 +243,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
 
 
+   num_tiles = 0;
+
    /**
     ** loop over tiles, rendering tris
     **/
@@ -255,6 +258,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
+      num_tiles++;
+
       spu.cur_ctile_status = spu.ctile_status[ty][tx];
       spu.cur_ztile_status = spu.ztile_status[ty][tx];
 
@@ -284,5 +289,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-   D_PRINTF(CELL_DEBUG_CMD, "RENDER done\n");
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER done (%u tiles hit)\n",
+            num_tiles);
 }
-- 
cgit v1.2.3


From 926b8dbb3e86360e5968882df94785ae84d0ad43 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 09:00:05 -0600
Subject: cell: clean up various texture-related things

Distinguish among texture targets in codegen.
progs/demos/cubemap.c runs correctly now too.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 29 ++++++++++++++---
 src/gallium/drivers/cell/spu/spu_command.c | 24 ++++++--------
 src/gallium/drivers/cell/spu/spu_funcs.c   | 34 +++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_main.h    | 16 +++++-----
 src/gallium/drivers/cell/spu/spu_texture.c | 50 ++++++++++++++----------------
 src/gallium/drivers/cell/spu/spu_texture.h | 34 +++++++++-----------
 6 files changed, 107 insertions(+), 80 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3dfd5f673d..2b34cf1e23 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1337,16 +1337,33 @@ emit_function_call(struct codegen *gen,
 
 
 static boolean
-emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   const uint addr = lookup_function(gen->cell, "spu_txp");
+   const uint target = inst->InstructionExtTexture.Texture;
    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   uint addr;
    int ch;
    int coord_regs[4], d_regs[4];
 
+   switch (target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_2D:
+      addr = lookup_function(gen->cell, "spu_tex_2d");
+      break;
+   case TGSI_TEXTURE_3D:
+      addr = lookup_function(gen->cell, "spu_tex_3d");
+      break;
+   case TGSI_TEXTURE_CUBE:
+      addr = lookup_function(gen->cell, "spu_tex_cube");
+      break;
+   default:
+      ASSERT(0 && "unsupported texture target");
+      return FALSE;
+   }
+
    assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
 
-   spe_comment(gen->f, -4, "CALL txp:");
+   spe_comment(gen->f, -4, "CALL tex:");
 
    /* get src/dst reg info */
    for (ch = 0; ch < 4; ch++) {
@@ -1368,7 +1385,7 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
          spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
       }
 
-      /* setup function arguments */
+      /* setup function arguments (XXX depends on target) */
       for (i = 0; i < 4; i++) {
          spe_move(gen->f, 3 + i, coord_regs[i]);
       }
@@ -1674,8 +1691,10 @@ emit_instruction(struct codegen *gen,
       /* fall-through for now */
    case TGSI_OPCODE_TXB:
       /* fall-through for now */
+   case TGSI_OPCODE_TXL:
+      /* fall-through for now */
    case TGSI_OPCODE_TXP:
-      return emit_TXP(gen, inst);
+      return emit_TEX(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 57d265fef7..ff4a52d79a 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -310,8 +310,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
  */
 static void
 update_tex_masks(struct spu_texture *texture,
-                 const struct pipe_sampler_state *sampler,
-                 uint unit)
+                 const struct pipe_sampler_state *sampler)
 {
    uint i;
 
@@ -338,11 +337,6 @@ update_tex_masks(struct spu_texture *texture,
          texture->level[i].scale_t = spu_splats(1.0f);
       }
    }
-
-   /* XXX temporary hack */
-   if (texture->target == PIPE_TEXTURE_CUBE) {
-      spu.sample_texture4[unit] = sample_texture4_cube;
-   }
 }
 
 
@@ -357,12 +351,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    switch (spu.sampler[unit].min_img_filter) {
    case PIPE_TEX_FILTER_LINEAR:
-      spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
    case PIPE_TEX_FILTER_ANISO:
       /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
-      spu.min_sample_texture4[unit] = sample_texture4_nearest;
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
    default:
       ASSERT(0);
@@ -370,12 +364,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    switch (spu.sampler[sampler->unit].mag_img_filter) {
    case PIPE_TEX_FILTER_LINEAR:
-      spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
    case PIPE_TEX_FILTER_ANISO:
       /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
-      spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
    default:
       ASSERT(0);
@@ -384,16 +378,16 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    switch (spu.sampler[sampler->unit].min_mip_filter) {
    case PIPE_TEX_MIPFILTER_NEAREST:
    case PIPE_TEX_MIPFILTER_LINEAR:
-      spu.sample_texture4[unit] = sample_texture4_lod;
+      spu.sample_texture_2d[unit] = sample_texture_2d_lod;
       break;
    case PIPE_TEX_MIPFILTER_NONE:
-      spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+      spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit];
       break;
    default:
       ASSERT(0);
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
@@ -434,7 +428,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
          spu.texture[unit].max_level = i;
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 5c3ee305d4..3534b35000 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -43,6 +43,7 @@
 #include "cell/common.h"
 #include "spu_main.h"
 #include "spu_funcs.h"
+#include "spu_texture.h"
 
 
 /** For "return"-ing four vectors */
@@ -102,11 +103,34 @@ spu_log2(vector float x)
 
 
 static struct vec_4x4
-spu_txp(vector float s, vector float t, vector float r, vector float q,
-        unsigned unit)
+spu_tex_2d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
+   (void) r;
+   (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_3d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) r;   /* r is dropped: this path samples with the 2D function */
+   (void) q;   /* q is unused */
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);   /* level 0, face 0 */
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_cube(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) q;
+   sample_texture_cube(s, t, r, unit, colors.v);
    return colors;
 }
 
@@ -147,7 +171,9 @@ return_function_info(void)
    export_func(&funcs, "spu_pow", &spu_pow);
    export_func(&funcs, "spu_exp2", &spu_exp2);
    export_func(&funcs, "spu_log2", &spu_log2);
-   export_func(&funcs, "spu_txp", &spu_txp);
+   export_func(&funcs, "spu_tex_2d", &spu_tex_2d);
+   export_func(&funcs, "spu_tex_3d", &spu_tex_3d);
+   export_func(&funcs, "spu_tex_cube", &spu_tex_cube);
 
    /* Send the function info back to the PPU / main memory */
    mfc_put((void *) &funcs,  /* src in local store */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 4099e52699..80e9c696f8 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -70,12 +70,10 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef void (*spu_sample_texture4_func)(vector float s,
-                                         vector float t,
-                                         vector float r,
-                                         vector float q,
-                                         uint unit, uint level, uint face,
-                                         vector float colors[4]);
+typedef void (*spu_sample_texture_2d_func)(vector float s,
+                                           vector float t,
+                                           uint unit, uint level, uint face,
+                                           vector float colors[4]);
 
 
 /** Function for performing per-fragment ops */
@@ -183,9 +181,9 @@ struct spu_global
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
-   spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
-   spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 42eb06a362..04202a7657 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -126,10 +126,9 @@ spu_clamp(vector signed int vec, vector signed int max)
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
-sample_texture4_nearest(vector float s, vector float t,
-                        vector float r, vector float q,
-                        uint unit, uint level, uint face,
-                        vector float colors[4])
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    vector float ss = spu_mul(s, tlevel->scale_s);
@@ -158,10 +157,9 @@ sample_texture4_nearest(vector float s, vector float t,
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
-sample_texture4_bilinear(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, uint level, uint face,
-                         vector float colors[4])
+sample_texture_2d_bilinear(vector float s, vector float t,
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -308,10 +306,9 @@ transpose(vector unsigned int *mOut0,
  * Bilinear filtering, using int intead of float arithmetic
  */
 void
-sample_texture4_bilinear_2(vector float s, vector float t,
-                           vector float r, vector float q,
-                           uint unit, uint level, uint face,
-                           vector float colors[4])
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -444,10 +441,9 @@ compute_lambda(uint unit, vector float s, vector float t)
  * Texture sampling with level of detail selection.
  */
 void
-sample_texture4_lod(vector float s, vector float t,
-                    vector float r, vector float q,
-                    uint unit, uint level_ignored, uint face,
-                    vector float colors[4])
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level_ignored, uint face,
+                      vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -455,6 +451,9 @@ sample_texture4_lod(vector float s, vector float t,
     */
    float lambda = compute_lambda(unit, s, t);
 
+   (void) face;
+   (void) level_ignored;
+
    /* apply lod bias */
    lambda += spu.sampler[unit].lod_bias;
 
@@ -466,14 +465,14 @@ sample_texture4_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
+      spu.mag_sample_texture_2d[unit](s, t, unit, 0, 0, colors);
    }
    else {
       /* minify */
       int level = (int) (lambda + 0.5f);
       if (level > (int) spu.texture[unit].max_level)
          level = spu.texture[unit].max_level;
-      spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
+      spu.min_sample_texture_2d[unit](s, t, unit, level, 0, colors);
       /* XXX to do: mipmap level interpolation */
    }
 }
@@ -552,13 +551,10 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
 
 
 void
-sample_texture4_cube(vector float s, vector float t,
-                     vector float r, vector float q,
-                     uint unit, uint level, uint face_ignored,
-                     vector float colors[4])
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4])
 {
-   static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
-   uint p, faces[4];
+   uint p, faces[4], level = 0;
    float newS[4], newT[4];
 
    /* Compute cube face referenced by the four sets of texcoords.
@@ -577,15 +573,15 @@ sample_texture4_cube(vector float s, vector float t,
       /* GOOD!  All four texcoords refer to the same cube face */
       s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
       t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
-      sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+      sample_texture_2d_nearest(s, t, unit, level, faces[0], colors);
    }
    else {
       /* BAD!  The four texcoords refer to different faces */
       for (p = 0; p < 4; p++) {      
          vector float c[4];
 
-         sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
-                                 zero, zero, unit, level, faces[p], c);
+         sample_texture_2d_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+                                   unit, level, faces[p], c);
 
          float red = spu_extract(c[0], p);
          float green = spu_extract(c[1], p);
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 387484c3ad..7b75b007b5 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -37,37 +37,31 @@ invalidate_tex_cache(void);
 
 
 extern void
-sample_texture4_nearest(vector float s, vector float t,
-                        vector float r, vector float q,
-                        uint unit, uint level, uint face,
-                        vector float colors[4]);
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4]);
 
 
 extern void
-sample_texture4_bilinear(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, uint level, uint face,
-                         vector float colors[4]);
-
-extern void
-sample_texture4_bilinear_2(vector float s, vector float t,
-                           vector float r, vector float q,
+sample_texture_2d_bilinear(vector float s, vector float t,
                            uint unit, uint level, uint face,
                            vector float colors[4]);
 
+extern void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4]);
+
 
 extern void
-sample_texture4_lod(vector float s, vector float t,
-                    vector float r, vector float q,
-                    uint unit, uint level, uint face,
-                    vector float colors[4]);
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level, uint face,
+                      vector float colors[4]);
 
 
 extern void
-sample_texture4_cube(vector float s, vector float t,
-                     vector float r, vector float q,
-                     uint unit, uint level_ignored, uint face_ignored,
-                     vector float colors[4]);
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 1da8f9b005a197214532e124c764a4e04e835519 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 09:33:45 -0600
Subject: cell: call proper sampler function in sample_texture_cube()

---
 src/gallium/drivers/cell/spu/spu_texture.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 04202a7657..b2d5d4aef8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -557,7 +557,7 @@ sample_texture_cube(vector float s, vector float t, vector float r,
    uint p, faces[4], level = 0;
    float newS[4], newT[4];
 
-   /* Compute cube face referenced by the four sets of texcoords.
+   /* Compute cube faces referenced by the four sets of texcoords.
     * XXX we should SIMD-ize this.
     */
    for (p = 0; p < 4; p++) {      
@@ -573,15 +573,15 @@ sample_texture_cube(vector float s, vector float t, vector float r,
       /* GOOD!  All four texcoords refer to the same cube face */
       s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
       t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
-      sample_texture_2d_nearest(s, t, unit, level, faces[0], colors);
+      spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors);
    }
    else {
       /* BAD!  The four texcoords refer to different faces */
       for (p = 0; p < 4; p++) {      
          vector float c[4];
 
-         sample_texture_2d_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
-                                   unit, level, faces[p], c);
+         spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]),
+                                     unit, level, faces[p], c);
 
          float red = spu_extract(c[0], p);
          float green = spu_extract(c[1], p);
-- 
cgit v1.2.3


From f0c70f9aabcb8e7c57c71eac2bd4dc86a2f86a0e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 09:52:02 -0600
Subject: cell: update comments

---
 src/gallium/drivers/cell/spu/spu_texture.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index b2d5d4aef8..19c17c9118 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -193,10 +193,6 @@ sample_texture_2d_bilinear(vector float s, vector float t,
    get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
    get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
-   /* XXX possibly rework following code to compute the weighted sample
-    * colors with integer arithmetic for fewer int->float conversions.
-    */
-
    /* convert packed int texels to float colors */
    vector float ftexels[16];
    spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
@@ -303,7 +299,8 @@ transpose(vector unsigned int *mOut0,
 
 
 /**
- * Bilinear filtering, using int intead of float arithmetic
+ * Bilinear filtering, using int instead of float arithmetic for computing
+ * sample weights.
  */
 void
 sample_texture_2d_bilinear_int(vector float s, vector float t,
-- 
cgit v1.2.3


From 5191429b15a3e7a7ef7cda499de8074c2c0df94f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 11:19:22 -0600
Subject: cell: trilinear mipmap interpolation

---
 src/gallium/drivers/cell/spu/spu_texture.c | 55 +++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 9 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 19c17c9118..e3d9a49dc4 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -415,7 +415,7 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
  * Compute level of detail factor from texcoords.
  */
 static float
-compute_lambda(uint unit, vector float s, vector float t)
+compute_lambda_2d(uint unit, vector float s, vector float t)
 {
    uint baseLevel = 0;
    float width = spu.texture[unit].level[baseLevel].width;
@@ -433,9 +433,27 @@ compute_lambda(uint unit, vector float s, vector float t)
 }
 
 
+/**
+ * Interpolate, in place, from c0 toward c1 by 'weight':
+ * c0[i] = (c1[i] - c0[i]) * weight + c0[i] for each of the four vectors.
+ */
+static void
+blend_colors(vector float c0[4], const vector float c1[4], float weight)
+{
+   const vector float w = spu_splats(weight);
+   uint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      /* delta * w + c0, done as a single multiply-add */
+      const vector float delta = spu_sub(c1[chan], c0[chan]);
+      c0[chan] = spu_madd(delta, w, c0[chan]);
+   }
+}
+
 
 /**
- * Texture sampling with level of detail selection.
+ * Texture sampling with level of detail selection and possibly mipmap
+ * interpolation.
  */
 void
 sample_texture_2d_lod(vector float s, vector float t,
@@ -446,7 +464,7 @@ sample_texture_2d_lod(vector float s, vector float t,
     * Note that we're computing a lambda/lod here that's used for all
     * four pixels in the quad.
     */
-   float lambda = compute_lambda(unit, s, t);
+   float lambda = compute_lambda_2d(unit, s, t);
 
    (void) face;
    (void) level_ignored;
@@ -462,15 +480,34 @@ sample_texture_2d_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture_2d[unit](s, t, unit, 0, 0, colors);
+      spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors);
    }
    else {
       /* minify */
-      int level = (int) (lambda + 0.5f);
-      if (level > (int) spu.texture[unit].max_level)
-         level = spu.texture[unit].max_level;
-      spu.min_sample_texture_2d[unit](s, t, unit, level, 0, colors);
-      /* XXX to do: mipmap level interpolation */
+      if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+         /* sample two mipmap levels and interpolate */
+         int level = (int) lambda;
+         if (level > (int) spu.texture[unit].max_level)
+            level = spu.texture[unit].max_level;
+         spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+         if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+            /* sample second mipmap level */
+            float weight = lambda - (float) level;
+            level++;
+            if (level <= (int) spu.texture[unit].max_level) {
+               vector float colors2[4];
+               spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2);
+               blend_colors(colors, colors2, weight);
+            }
+         }
+      }
+      else {
+         /* sample one mipmap level */
+         int level = (int) (lambda + 0.5f);
+         if (level > (int) spu.texture[unit].max_level)
+            level = spu.texture[unit].max_level;
+         spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+      }
    }
 }
 
-- 
cgit v1.2.3


From 8bff2fccc9774e3f3af3c0f8ea345037051cf40e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 11:48:05 -0600
Subject: cell: CELL_NUM_SPUS env var

---
 src/gallium/drivers/cell/ppu/cell_context.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 097dbcfdc8..4dad490ce1 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -160,6 +160,10 @@ cell_create_context(struct pipe_screen *screen,
       printf("Cell: found %d Cell(s) with %u SPUs\n",
              cell->num_cells, cell->num_spus);
    }
+   if (getenv("CELL_NUM_SPUS")) {
+      cell->num_spus = atoi(getenv("CELL_NUM_SPUS"));
+      assert(cell->num_spus > 0);
+   }
 
    cell_start_spus(cell);
 
-- 
cgit v1.2.3


From 033c90f4c16c1da517d676282508208319bd5ec5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 13:49:42 -0600
Subject: cell: implement KIL instruction

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 80 ++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_main.h    |  6 +--
 src/gallium/drivers/cell/spu/spu_tri.c     |  5 +-
 3 files changed, 87 insertions(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 2b34cf1e23..493ee1a0c9 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -84,6 +84,9 @@ struct codegen
    /** Index of execution mask register */
    int exec_mask_reg;
 
+   /** KIL mask: indicates which fragments have been killed */
+   int kill_mask_reg;
+
    int frame_size;  /**< Stack frame size, in words */
 
    struct spe_function *f;
@@ -431,8 +434,21 @@ emit_prologue(struct codegen *gen)
 static void
 emit_epilogue(struct codegen *gen)
 {
+   const int return_reg = 3;
+
    spe_comment(gen->f, -4, "Function epilogue:");
 
+   spe_comment(gen->f, 0, "return the killed mask");
+   if (gen->kill_mask_reg > 0) {
+      /* shader called KIL, return the "alive" mask */
+      spe_move(gen->f, return_reg, gen->kill_mask_reg);
+   }
+   else {
+      /* return {0,0,0,0} */
+      spe_load_uint(gen->f, return_reg, 0);
+   }
+
+   spe_comment(gen->f, 0, "restore stack and return");
    if (gen->frame_size >= 512) {
       /* offset is too large for ai instruction */
       int offset_reg = spe_allocate_available_register(gen->f);
@@ -1423,6 +1439,68 @@ emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+/**
+ * KILL if any of src reg values are less than zero.
+ */
+static boolean
+emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int s_regs[4], kil_reg = -1, cmp_reg, zero_reg;
+
+   spe_comment(gen->f, -4, "CALL kil:");
+
+   /* zero = {0,0,0,0} */
+   zero_reg = get_itemp(gen);
+   spe_load_uint(gen->f, zero_reg, 0);
+
+   cmp_reg = get_itemp(gen);
+
+   /* get src regs */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      }
+   }
+
+   /* test if any src regs are < 0 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         if (kil_reg >= 0) {
+            /* cmp = 0 > src ? : ~0 : 0 */
+            spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]);
+            /* kil = kil | cmp */
+            spe_or(gen->f, kil_reg, kil_reg, cmp_reg);
+         }
+         else {
+            kil_reg = get_itemp(gen);
+            /* kil = 0 > src ? : ~0 : 0 */
+            spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]);
+         }
+      }
+   }
+
+   if (gen->if_nesting) {
+      /* may have been a conditional kil */
+      spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg);
+   }
+
+   /* allocate the kill mask reg if needed */
+   if (gen->kill_mask_reg <= 0) {
+      gen->kill_mask_reg = spe_allocate_available_register(gen->f);
+      spe_move(gen->f, gen->kill_mask_reg, kil_reg);
+   }
+   else {
+      spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg);
+   }
+
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
@@ -1695,6 +1773,8 @@ emit_instruction(struct codegen *gen,
       /* fall-through for now */
    case TGSI_OPCODE_TXP:
       return emit_TEX(gen, inst);
+   case TGSI_OPCODE_KIL:
+      return emit_KIL(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 80e9c696f8..95ef4c9244 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -89,9 +89,9 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       uint facing);
 
 /** Function for running fragment program */
-typedef void (*spu_fragment_program_func)(vector float *inputs,
-                                          vector float *outputs,
-                                          vector float *constants);
+typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
+                                                         vector float *outputs,
+                                                         vector float *constants);
 
 
 struct spu_framebuffer
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index d83085d0f9..4caf7d6b61 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -254,6 +254,7 @@ emit_quad( int x, int y, mask_t mask)
          vector float inputs[4*4], outputs[2*4];
          vector float fragZ = eval_z((float) x, (float) y);
          vector float fragW = eval_w((float) x, (float) y);
+         vector unsigned int kill_mask;
 
          /* setup inputs */
 #if 0
@@ -268,7 +269,9 @@ emit_quad( int x, int y, mask_t mask)
          ASSERT(spu.fragment_ops);
 
          /* Execute the current fragment program */
-         spu.fragment_program(inputs, outputs, spu.constants);
+         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
+
+         mask = spu_andc(mask, kill_mask);
 
          /* Execute per-fragment/quad operations, including:
           * alpha test, z test, stencil test, blend and framebuffer writing.
-- 
cgit v1.2.3


From 51ffab362b27997f9c6c60bf9bace1b1854817db Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 13:54:17 -0600
Subject: cell: pass spu_texture_level ptr to get_four_texels()

---
 src/gallium/drivers/cell/spu/spu_texture.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index e3d9a49dc4..c0af05e46e 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -72,10 +72,10 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
+get_four_texels(const struct spu_texture_level *tlevel, uint face,
+                vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
-   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    unsigned texture_ea = (uintptr_t) tlevel->start;
    const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
@@ -145,7 +145,7 @@ sample_texture_2d_nearest(vector float s, vector float t,
    is = spu_clamp(is, tlevel->max_s);
    it = spu_clamp(it, tlevel->max_t);
 
-   get_four_texels(unit, level, face, is, it, texels);
+   get_four_texels(tlevel, face, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -188,10 +188,10 @@ sample_texture_2d_bilinear(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
 
    /* convert packed int texels to float colors */
    vector float ftexels[16];
@@ -346,10 +346,10 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
-- 
cgit v1.2.3


From fa7b8388066651c5cfafd4ce6461fc43c982d8c7 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 15:48:04 -0600
Subject: cell: use 7-bit weights in sample_texture_2d_bilinear_int()

This allows us to use 16-bit signed mul/add instructions.  Had to
use unsigned mul before, and there's no unsigned mul/add instruction.
---
 src/gallium/drivers/cell/spu/spu_texture.c | 62 +++++++++++++++---------------
 1 file changed, 31 insertions(+), 31 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index c0af05e46e..4e12a116cd 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -314,19 +314,19 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
    vector float ss = spu_madd(s, tlevel->scale_s, half);
    vector float tt = spu_madd(t, tlevel->scale_t, half);
 
-   /* convert float coords to fixed-pt coords with 8 fraction bits */
-   vector signed int is = spu_convts(ss, 8);
-   vector signed int it = spu_convts(tt, 8);
+   /* convert float coords to fixed-pt coords with 7 fraction bits */
+   vector signed int is = spu_convts(ss, 7);  /* XXX really need floor() here */
+   vector signed int it = spu_convts(tt, 7);  /* XXX really need floor() here */
 
-   /* compute integer texel weights in [0, 255] */
-   vector signed int sWeights0 = spu_and(is, 255);
-   vector signed int tWeights0 = spu_and(it, 255);
-   vector signed int sWeights1 = spu_sub(255, sWeights0);
-   vector signed int tWeights1 = spu_sub(255, tWeights0);
+   /* compute integer texel weights in [0, 127] */
+   vector signed int sWeights0 = spu_and(is, 127);
+   vector signed int tWeights0 = spu_and(it, 127);
+   vector signed int sWeights1 = spu_sub(127, sWeights0);
+   vector signed int tWeights1 = spu_sub(127, tWeights0);
 
-   /* texel coords: is0 = is / 256, it0 = is / 256 */
-   vector signed int is0 = spu_rlmask(is, -8);
-   vector signed int it0 = spu_rlmask(it, -8);
+   /* texel coords: is0 = is / 128, it0 = is / 128 */
+   vector signed int is0 = spu_rlmask(is, -7);
+   vector signed int it0 = spu_rlmask(it, -7);
 
    /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
    vector signed int is1 = spu_add(is0, 1);
@@ -377,36 +377,36 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
    vector unsigned int c0, c1, c2, c3, cSum;
 
    /* red */
-   c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
-   colors[0] = spu_convtf(cSum, 24);
+   colors[0] = spu_convtf(cSum, 22);
 
    /* green */
-   c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
-   colors[1] = spu_convtf(cSum, 24);
+   colors[1] = spu_convtf(cSum, 22);
 
    /* blue */
-   c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
-   colors[2] = spu_convtf(cSum, 24);
+   colors[2] = spu_convtf(cSum, 22);
 
    /* alpha */
-   c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
-   colors[3] = spu_convtf(cSum, 24);
+   colors[3] = spu_convtf(cSum, 22);
 }
 
 
-- 
cgit v1.2.3


From cb8ebc912430201683463822897f06d7d42795f2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 16:51:23 -0600
Subject: cell: more efficient state emit for textures/samplers

---
 src/gallium/drivers/cell/ppu/cell_context.h    |  2 +
 src/gallium/drivers/cell/ppu/cell_pipe_state.c | 41 ++++++++++--------
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 60 ++++++++++++++------------
 3 files changed, 58 insertions(+), 45 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index a592e728c8..ad1f4829a4 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -121,6 +121,8 @@ struct cell_context
    uint *tex_map;
 
    uint dirty;
+   uint dirty_textures;  /* bitmask of texture units */
+   uint dirty_samplers;  /* bitmask of sampler units */
 
    /** Cache of code generated for per-fragment ops */
    struct keymap *fragment_ops_cache;
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 2e3086c4fa..1615e0b356 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -212,17 +212,24 @@ cell_bind_sampler_states(struct pipe_context *pipe,
                          unsigned num, void **samplers)
 {
    struct cell_context *cell = cell_context(pipe);
+   uint i, changed = 0x0;
 
    assert(num <= CELL_MAX_SAMPLERS);
 
    draw_flush(cell->draw);
 
-   memcpy(cell->sampler, samplers, num * sizeof(void *));
-   memset(&cell->sampler[num], 0, (CELL_MAX_SAMPLERS - num) *
-          sizeof(void *));
-   cell->num_samplers = num;
+   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+      struct pipe_sampler_state *new_samp = i < num ? samplers[i] : NULL;
+      if (cell->sampler[i] != new_samp) {
+         cell->sampler[i] = new_samp;
+         changed |= (1 << i);
+      }
+   }
 
-   cell->dirty |= CELL_NEW_SAMPLER;
+   if (changed) {
+      cell->dirty |= CELL_NEW_SAMPLER;
+      cell->dirty_samplers |= changed;
+   }
 }
 
 
@@ -240,25 +247,23 @@ cell_set_sampler_textures(struct pipe_context *pipe,
                           unsigned num, struct pipe_texture **texture)
 {
    struct cell_context *cell = cell_context(pipe);
-   uint i;
+   uint i, changed = 0x0;
 
    assert(num <= CELL_MAX_SAMPLERS);
 
-   /* Check for no-op */
-   if (num == cell->num_textures &&
-       !memcmp(cell->texture, texture, num * sizeof(struct pipe_texture *)))
-      return;
-
-   draw_flush(cell->draw);
-
    for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-      struct pipe_texture *tex = i < num ? texture[i] : NULL;
-
-      pipe_texture_reference((struct pipe_texture **) &cell->texture[i], tex);
+      struct pipe_texture *new_tex = i < num ? texture[i] : NULL;
+      if ((struct pipe_texture *) cell->texture[i] != new_tex) {
+         pipe_texture_reference((struct pipe_texture **) &cell->texture[i],
+                                new_tex);
+         changed |= (1 << i);
+      }
    }
-   cell->num_textures = num;
 
-   cell->dirty |= CELL_NEW_TEXTURE;
+   if (changed) {
+      cell->dirty |= CELL_NEW_TEXTURE;
+      cell->dirty_textures |= changed;
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index e6387382f2..effcd2a1e1 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -201,44 +201,50 @@ cell_emit_state(struct cell_context *cell)
    if (cell->dirty & CELL_NEW_SAMPLER) {
       uint i;
       for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-         if (cell->sampler[i]) {
-            struct cell_command_sampler *sampler
-               = cell_batch_alloc(cell, sizeof(*sampler));
-            sampler->opcode = CELL_CMD_STATE_SAMPLER;
-            sampler->unit = i;
-            sampler->state = *cell->sampler[i];
+         if (cell->dirty_samplers & (1 << i)) {
+            if (cell->sampler[i]) {
+               struct cell_command_sampler *sampler
+                  = cell_batch_alloc(cell, sizeof(*sampler));
+               sampler->opcode = CELL_CMD_STATE_SAMPLER;
+               sampler->unit = i;
+               sampler->state = *cell->sampler[i];
+            }
          }
       }
+      cell->dirty_samplers = 0x0;
    }
 
    if (cell->dirty & CELL_NEW_TEXTURE) {
       uint i;
       for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
-         struct cell_command_texture *texture
-            =  cell_batch_alloc(cell, sizeof(*texture));
-         texture->opcode = CELL_CMD_STATE_TEXTURE;
-         texture->unit = i;
-         if (cell->texture[i]) {
-            uint level;
-            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
-               texture->start[level] = cell->texture[i]->tiled_data[level];
-               texture->width[level] = cell->texture[i]->base.width[level];
-               texture->height[level] = cell->texture[i]->base.height[level];
-               texture->depth[level] = cell->texture[i]->base.depth[level];
+         if (cell->dirty_textures & (1 << i)) {
+            struct cell_command_texture *texture
+               =  cell_batch_alloc(cell, sizeof(*texture));
+            texture->opcode = CELL_CMD_STATE_TEXTURE;
+            texture->unit = i;
+            if (cell->texture[i]) {
+               uint level;
+               for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+                  texture->start[level] = cell->texture[i]->tiled_data[level];
+                  texture->width[level] = cell->texture[i]->base.width[level];
+                  texture->height[level] = cell->texture[i]->base.height[level];
+                  texture->depth[level] = cell->texture[i]->base.depth[level];
+               }
+               texture->target = cell->texture[i]->base.target;
             }
-            texture->target = cell->texture[i]->base.target;
-         }
-         else {
-            uint level;
-            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
-               texture->start[level] = NULL;
-               texture->width[level] = 0;
-               texture->height[level] = 0;
-               texture->depth[level] = 0;
+            else {
+               uint level;
+               for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+                  texture->start[level] = NULL;
+                  texture->width[level] = 0;
+                  texture->height[level] = 0;
+                  texture->depth[level] = 0;
+               }
+               texture->target = 0;
             }
-            texture->target = 0;
          }
       }
+      cell->dirty_textures = 0x0;
    }
 
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
-- 
cgit v1.2.3


From 9fa8671c73fa44a95e2ea7fed6047bddb042796f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 20:25:28 -0600
Subject: cell: add new debug flag (cache) to report texture cache stats on
 exit

---
 src/gallium/drivers/cell/common.h           | 1 +
 src/gallium/drivers/cell/ppu/cell_context.c | 1 +
 src/gallium/drivers/cell/spu/spu_command.c  | 3 ++-
 src/gallium/drivers/cell/spu/spu_dcache.c   | 4 +++-
 4 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 469d56cda8..9ca4e9d67e 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -117,6 +117,7 @@
 #define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
 #define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
 #define CELL_DEBUG_CMD                  (1 << 5)
+#define CELL_DEBUG_CACHE                (1 << 6)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 4dad490ce1..7a2d93ecb4 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -94,6 +94,7 @@ static const struct debug_named_value cell_debug_flags[] = {
    {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
    {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
    {"cmd", CELL_DEBUG_CMD},       /**< SPUs dump command buffer info */
+   {"cache", CELL_DEBUG_CACHE},   /**< report texture cache stats on exit */
    {NULL, 0}
 };
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ff4a52d79a..9c853c0961 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -720,5 +720,6 @@ command_loop(void)
 
    D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
 
-   spu_dcache_report();
+   if (spu.init.debug_flags & CELL_DEBUG_CACHE)
+      spu_dcache_report();
 }
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 167404cdc5..a6d67634fd 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -36,7 +36,9 @@
 #define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
 #define CACHE_LOG2NNWAY       2
 #define CACHE_LOG2NSETS       6
-/*#define CACHE_STATS           1*/
+#ifdef DEBUG
+#define CACHE_STATS           1
+#endif
 #include <cache-api.h>
 
 /* Yes folks, this is ugly.
-- 
cgit v1.2.3


From 81724da4f61f2ba678e2e0376209e1b754e1ecab Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 17 Oct 2008 09:09:57 -0600
Subject: cell: use an approximation in compute_lambda_2d() to avoid sqrt

Though, the logf() call still needs attention.
---
 src/gallium/drivers/cell/spu/spu_texture.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 4e12a116cd..69784c8978 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -414,7 +414,7 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
 /**
  * Compute level of detail factor from texcoords.
  */
-static float
+static INLINE float
 compute_lambda_2d(uint unit, vector float s, vector float t)
 {
    uint baseLevel = 0;
@@ -424,11 +424,21 @@ compute_lambda_2d(uint unit, vector float s, vector float t)
    float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
    float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
    float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+#if 0
+   /* ideal value */
    float x = dsdx * dsdx + dtdx * dtdx;
    float y = dsdy * dsdy + dtdy * dtdy;
    float rho = x > y ? x : y;
    rho = sqrtf(rho);
-   float lambda = logf(rho) * 1.442695f;
+#else
+   /* approximation */
+   dsdx = fabsf(dsdx);
+   dsdy = fabsf(dsdy);
+   dtdx = fabsf(dtdx);
+   dtdy = fabsf(dtdy);
+   float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5;
+#endif
+   float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */
    return lambda;
 }
 
-- 
cgit v1.2.3


From 6cec79dc4fc8f6ebde3e4c90ac56fa8022f2d4aa Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 20 Oct 2008 09:35:18 -0600
Subject: cell: temporarily disable freeing of tiled texture memory

Allows glDrawPixels to work for now...
---
 src/gallium/drivers/cell/ppu/cell_texture.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 230e192573..9c6741f1bc 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -147,7 +147,13 @@ cell_texture_release(struct pipe_screen *screen,
 
       for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
          if (ct->tiled_data[i]) {
+            /* XXX need to use a fenced buffer for tiled data so that
+             * it's properly freed after rendering has completed.
+             * Disabling this free() allows glDrawPixels to work for now.
+             */
+#if 0
             align_free(ct->tiled_data[i]);
+#endif
          }
       }
 
-- 
cgit v1.2.3


From abfc32a68cbf95a7951b1b9fc18a9af7c524b69e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 20 Oct 2008 15:44:22 -0600
Subject: cell: minor improvements to batch buffer functions

---
 src/gallium/drivers/cell/ppu/cell_batch.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 16882c0129..01254aed60 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -42,7 +42,9 @@
 uint
 cell_get_empty_buffer(struct cell_context *cell)
 {
-   uint buf = 0, tries = 0;
+   static uint prev_buffer = 0;
+   uint buf = (prev_buffer + 1) % CELL_NUM_BUFFERS;
+   uint tries = 0;
 
    /* Find a buffer that's marked as free by all SPUs */
    while (1) {
@@ -58,8 +60,9 @@ cell_get_empty_buffer(struct cell_context *cell)
                   cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
                }
                /*
-               printf("PPU: ALLOC BUFFER %u\n", buf);
+               printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries);
                */
+               prev_buffer = buf;
                return buf;
             }
          }
@@ -169,7 +172,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 
    size = cell->buffer_size[cell->cur_batch];
 
-   if (size + bytes > CELL_BUFFER_SIZE) {
+   if (bytes > cell_batch_free_space(cell)) {
       cell_batch_flush(cell);
       size = 0;
    }
@@ -223,7 +226,7 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
 
    padbytes = (alignment - (size % alignment)) % alignment;
 
-   if (padbytes + size + bytes > CELL_BUFFER_SIZE) {
+   if (padbytes + bytes > cell_batch_free_space(cell)) {
       cell_batch_flush(cell);
       size = 0;
    }
-- 
cgit v1.2.3


From 3a2a2d5332b4912dd7c3d3d891920bdc419fbde0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 21 Oct 2008 14:10:09 -0600
Subject: gallium: remove unused var

---
 src/gallium/drivers/softpipe/sp_fs_exec.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 6280f0701d..f472dd0ed2 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -94,9 +94,6 @@ exec_prepare( const struct sp_fragment_shader *base,
 	      struct tgsi_exec_machine *machine,
 	      struct tgsi_sampler *samplers )
 {
-   struct sp_exec_fragment_shader *spefs =
-      sp_exec_fragment_shader(base);
-
    /*
     * Bind tokens/shader to the interpreter's machine state.
     * Avoid redundant binding.
-- 
cgit v1.2.3


From 7004582c1894ede839c44e292b413fe4916d7e9e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 21 Oct 2008 14:12:17 -0600
Subject: gallium: implement tests for PPC/PPC64

---
 src/gallium/include/pipe/p_config.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index ef05547819..05cbd2fc4d 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -93,8 +93,11 @@
 #endif
 #endif
 
-#if 0 /* FIXME */
+#if defined(__PPC__)
 #define PIPE_ARCH_PPC
+#if defined(__PPC64__)
+#define PIPE_ARCH_PPC_64
+#endif
 #endif
 
 
-- 
cgit v1.2.3


From 0c1e98d9598bb5a30224583bdf211a1352b96d44 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 08:12:42 -0600
Subject: cell: note that dst reg writing needs clamping

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 493ee1a0c9..d4d644d6e8 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -349,6 +349,22 @@ store_dest_reg(struct codegen *gen,
                int value_reg, int channel,
                const struct tgsi_full_dst_register *dest)
 {
+   /*
+    * XXX need to implement dst reg clamping/saturation
+    */
+#if 0
+   switch (inst->Instruction.Saturate) {
+   case TGSI_SAT_NONE:
+      break;
+   case TGSI_SAT_ZERO_ONE:
+      break;
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      break;
+   default:
+      assert( 0 );
+   }
+#endif
+
    switch (dest->DstRegister.File) {
    case TGSI_FILE_TEMPORARY:
       if (gen->if_nesting > 0) {
-- 
cgit v1.2.3


From 0ae4728eb429d7b5217d34ec96fc973a5e7cfe95 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 10:30:12 -0600
Subject: cell: set cell->num_textures

---
 src/gallium/drivers/cell/ppu/cell_pipe_state.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 1615e0b356..825110c62b 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -260,6 +260,8 @@ cell_set_sampler_textures(struct pipe_context *pipe,
       }
    }
 
+   cell->num_textures = num;
+
    if (changed) {
       cell->dirty |= CELL_NEW_TEXTURE;
       cell->dirty_textures |= changed;
-- 
cgit v1.2.3


From 70dd4379d2cd54f229c3940312537912470218d3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 10:34:13 -0600
Subject: cell: implement fencing for texture buffers

If we delete a texture, we need to keep the underlying tiled data buffer
around until any rendering that references it has completed.
Keep a list of buffers referenced by a rendering batch.  Unref/free them when
the associated batch's fence is executed/signalled.
---
 src/gallium/drivers/cell/common.h              |  25 ++++
 src/gallium/drivers/cell/ppu/Makefile          |   1 +
 src/gallium/drivers/cell/ppu/cell_batch.c      |  32 +++++
 src/gallium/drivers/cell/ppu/cell_context.c    |   6 +
 src/gallium/drivers/cell/ppu/cell_context.h    |  21 ++++
 src/gallium/drivers/cell/ppu/cell_fence.c      | 158 +++++++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_fence.h      |  57 +++++++++
 src/gallium/drivers/cell/ppu/cell_state_emit.c |   2 +-
 src/gallium/drivers/cell/ppu/cell_texture.c    |  33 ++++--
 src/gallium/drivers/cell/ppu/cell_texture.h    |   5 +-
 src/gallium/drivers/cell/ppu/cell_vbuf.c       |   6 +
 src/gallium/drivers/cell/spu/spu_command.c     |  38 +++++-
 src/gallium/drivers/cell/spu/spu_main.h        |   2 +-
 13 files changed, 367 insertions(+), 19 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.h

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 9ca4e9d67e..23fb0b0831 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -102,6 +102,8 @@
 #define CELL_CMD_STATE_RASTERIZER    22
 #define CELL_CMD_VS_EXECUTE          23
 #define CELL_CMD_FLUSH_BUFFER_RANGE  24
+#define CELL_CMD_FENCE               25
+
 
 /** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
@@ -123,6 +125,29 @@
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
 
 
+
+#define CELL_FENCE_IDLE      0
+#define CELL_FENCE_EMITTED   1
+#define CELL_FENCE_SIGNALLED 2
+
+struct cell_fence
+{
+   /** There's a 16-byte status qword per SPU */
+   volatile uint status[CELL_MAX_SPUS][4];
+};
+
+
+/**
+ * Fence command sent to SPUs.  In response, the SPUs will write
+ * CELL_FENCE_SIGNALLED back to the fence status word in main memory.
+ */
+struct cell_command_fence
+{
+   uint64_t opcode;      /**< CELL_CMD_FENCE */
+   struct cell_fence *fence;
+};
+
+
 /**
  * Command to specify per-fragment operations state and generated code.
  * Note that the dsa, blend, blend_color fields are really only needed
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b28f4c5c31..9358a47284 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -24,6 +24,7 @@ SOURCES = \
 	cell_clear.c \
 	cell_context.c \
 	cell_draw_arrays.c \
+	cell_fence.c \
 	cell_flush.c \
 	cell_gen_fragment.c \
 	cell_gen_fp.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 01254aed60..448b723d85 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -28,6 +28,7 @@
 
 #include "cell_context.h"
 #include "cell_batch.h"
+#include "cell_fence.h"
 #include "cell_spu.h"
 
 
@@ -63,6 +64,10 @@ cell_get_empty_buffer(struct cell_context *cell)
                printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries);
                */
                prev_buffer = buf;
+
+               /* release tex buffer associated w/ prev use of this batch buf */
+               cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]);
+
                return buf;
             }
          }
@@ -84,6 +89,26 @@ cell_get_empty_buffer(struct cell_context *cell)
 }
 
 
+/**
+ * Append a fence command to the current batch buffer.
+ * Note that we're sure there's always room for this because of the
+ * adjusted size check in cell_batch_free_space().
+ */
+static void
+emit_fence(struct cell_context *cell)
+{
+   const uint batch = cell->cur_batch;
+   const uint size = cell->buffer_size[batch];
+   struct cell_command_fence *fence_cmd;
+
+   ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
+
+   fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
+   fence_cmd->opcode = CELL_CMD_FENCE;
+   fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+}
+
+
 /**
  * Flush the current batch buffer to the SPUs.
  * An empty buffer will be found and set as the new current batch buffer
@@ -102,6 +127,12 @@ cell_batch_flush(struct cell_context *cell)
    if (size == 0)
       return;
 
+   /* If this batch references fenced texture buffers, append a fence command
+    * so the buffers can be released once the SPUs have executed the batch.
+    */
+   if (cell->fenced_buffers[batch].head)
+      emit_fence(cell);
+
    flushing = TRUE;
 
    assert(batch < CELL_NUM_BUFFERS);
@@ -142,6 +173,7 @@ uint
 cell_batch_free_space(const struct cell_context *cell)
 {
    uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
+   free -= sizeof(struct cell_command_fence);
    return free;
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 7a2d93ecb4..22d552d8e3 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -47,6 +47,7 @@
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_state.h"
 #include "cell_surface.h"
@@ -104,6 +105,7 @@ cell_create_context(struct pipe_screen *screen,
                     struct cell_winsys *cws)
 {
    struct cell_context *cell;
+   uint i;
 
    /* some fields need to be 16-byte aligned, so align the whole object */
    cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
@@ -151,6 +153,10 @@ cell_create_context(struct pipe_screen *screen,
                                               cell_debug_flags, 
                                               0 );
 
+   for (i = 0; i < CELL_NUM_BUFFERS; i++)
+      cell_fence_init(&cell->fenced_buffers[i].fence);
+
+
    /*
     * SPU stuff
     */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index ad1f4829a4..4491ae8cdf 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -81,6 +81,19 @@ struct cell_fragment_ops_key
 };
 
 
+struct cell_buffer_node;
+
+/**
+ * Fenced buffer list.  List of buffers which can be unreferenced after
+ * the fence has been executed/signalled.
+ */
+struct cell_buffer_list
+{
+   struct cell_fence fence;
+   struct cell_buffer_node *head;
+};
+
+
 /**
  * Per-context state, subclass of pipe_context.
  */
@@ -154,6 +167,14 @@ struct cell_context
    uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
 
+   /** Associated with each command/batch buffer is a list of pipe_buffers
+    * that are fenced.  When the last command in a buffer is executed, the
+    * fence will be signalled, indicating that any pipe_buffers preceding
+    * that fence can be unreferenced (and probably freed).
+    */
+   struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS];
+
+
    struct spe_function attrib_fetch;
    unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
 
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
new file mode 100644
index 0000000000..ffb3bea12b
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -0,0 +1,158 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <unistd.h>
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_fence.h"
+#include "cell_texture.h"
+
+
+void
+cell_fence_init(struct cell_fence *fence)
+{
+   uint i;
+   for (i = 0; i < CELL_MAX_SPUS; i++) {
+      fence->status[i][0] = CELL_FENCE_IDLE;
+   }
+}
+
+
+boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence)
+{
+   uint i;
+   for (i = 0; i < cell->num_spus; i++) {
+      //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
+      if (fence->status[i][0] == CELL_FENCE_EMITTED)
+         return FALSE;
+   }
+   return TRUE;
+}
+
+
+void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence)
+{
+   while (!cell_fence_signalled(cell, fence)) {
+      usleep(10);
+   }
+}
+
+
+
+
+struct cell_buffer_node
+{
+   struct pipe_buffer *buffer;
+   struct cell_buffer_node *next;
+};
+
+
+static void
+cell_add_buffer_to_list(struct cell_context *cell,
+                        struct cell_buffer_list *list,
+                        struct pipe_buffer *buffer)
+{
+   struct pipe_screen *ps = cell->pipe.screen;
+   struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node);
+   /* create new list node which references the buffer, insert at head */
+   if (node) {
+      pipe_buffer_reference(ps, &node->buffer, buffer);
+      node->next = list->head;
+      list->head = node;
+   }
+}
+
+
+/**
+ * Wait for completion of the given fence, then unreference any buffers
+ * on the list.
+ * This typically unrefs/frees texture buffers after any rendering which uses
+ * them has completed.
+ */
+void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list)
+{
+   if (list->head) {
+      struct pipe_screen *ps = cell->pipe.screen;
+      struct cell_buffer_node *node;
+
+      cell_fence_finish(cell, &list->fence);
+
+      /* traverse the list, unreferencing buffers, freeing nodes */
+      node = list->head;
+      while (node) {
+         struct cell_buffer_node *next = node->next;
+         assert(node->buffer);
+         pipe_buffer_unmap(ps, node->buffer);
+#if 0
+         printf("Unref buffer %p\n", node->buffer);
+         if (node->buffer->refcount == 1)
+            printf("   Delete!\n");
+#endif
+         pipe_buffer_reference(ps, &node->buffer, NULL);
+         FREE(node);
+         node = next;
+      }
+      list->head = NULL;
+   }
+}
+
+
+/**
+ * This should be called for each render command.
+ * Any texture buffers that are currently bound will be added to a fenced
+ * list to be freed later when the fence is executed/signalled.
+ */
+void
+cell_add_fenced_textures(struct cell_context *cell)
+{
+   struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch];
+   uint i;
+
+   for (i = 0; i < cell->num_textures; i++) {
+      struct cell_texture *ct = cell->texture[i];
+      if (ct) {
+         uint level;
+         for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+            if (ct->tiled_buffer[level]) {
+#if 0
+               printf("Adding texture %p buffer %p to list\n",
+                      ct, ct->tiled_buffer[level]);
+#endif
+               cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]);
+            }
+         }
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h
new file mode 100644
index 0000000000..536b4ba411
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_FENCE_H
+#define CELL_FENCE_H
+
+
+extern void
+cell_fence_init(struct cell_fence *fence);
+
+
+extern boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence);
+
+
+extern void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence);
+
+
+
+extern void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list);
+
+
+extern void
+cell_add_fenced_textures(struct cell_context *cell);
+
+
+#endif /* CELL_FENCE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index effcd2a1e1..dd2d7f7d1e 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -225,7 +225,7 @@ cell_emit_state(struct cell_context *cell)
             if (cell->texture[i]) {
                uint level;
                for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
-                  texture->start[level] = cell->texture[i]->tiled_data[level];
+                  texture->start[level] = cell->texture[i]->tiled_mapped[level];
                   texture->width[level] = cell->texture[i]->base.width[level];
                   texture->height[level] = cell->texture[i]->base.height[level];
                   texture->depth[level] = cell->texture[i]->base.depth[level];
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9c6741f1bc..9ac2f3bbb9 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -136,6 +136,9 @@ cell_texture_release(struct pipe_screen *screen,
        __FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
    */
    if (--(*pt)->refcount <= 0) {
+      /* Delete this texture now.
+       * But note that the underlying pipe_buffer may linger...
+       */
       struct cell_texture *ct = cell_texture(*pt);
       uint i;
 
@@ -146,14 +149,12 @@ cell_texture_release(struct pipe_screen *screen,
       pipe_buffer_reference(screen, &ct->buffer, NULL);
 
       for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
-         if (ct->tiled_data[i]) {
-            /* XXX need to use a fenced buffer for tiled data so that
-             * it's properly freed after rendering has completed.
-             * Disabling this free() allows glDrawPixels to work for now.
-             */
-#if 0
-            align_free(ct->tiled_data[i]);
-#endif
+         /* Unreference the tiled image buffer.
+          * It may not actually be deleted until a fence is hit.
+          */
+         if (ct->tiled_buffer[i]) {
+            ct->tiled_mapped[i] = NULL;
+            winsys_buffer_reference(screen->winsys, &ct->tiled_buffer[i], NULL);
          }
       }
 
@@ -234,12 +235,18 @@ cell_twiddle_texture(struct pipe_screen *screen,
          int offset = bufWidth * bufHeight * 4 * surface->face;
          uint *dst;
 
-         if (!ct->tiled_data[level]) {
-            ct->tiled_data[level] =
-               align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+         if (!ct->tiled_buffer[level]) {
+            /* allocate buffer for tiled data now */
+            struct pipe_winsys *ws = screen->winsys;
+            uint bytes = bufWidth * bufHeight * 4 * numFaces;
+            ct->tiled_buffer[level] = ws->buffer_create(ws, 16,
+                                                        PIPE_BUFFER_USAGE_PIXEL,
+                                                        bytes);
+            /* and map it */
+            ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level],
+                                                     PIPE_BUFFER_USAGE_GPU_READ);
          }
-
-         dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+         dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset);
 
          twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
                             surface->stride, src);
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index a0757091b0..2f5fe0dd1b 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -48,7 +48,10 @@ struct cell_texture
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data[CELL_MAX_TEXTURE_LEVELS];  /* XXX this may be temporary */ /*ALIGN16*/
+   /** Texture data in tiled layout is held here */
+   struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
+   /** Mapped, tiled texture data */
+   void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..65ba51b6bb 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -38,6 +38,7 @@
 
 #include "cell_batch.h"
 #include "cell_context.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_spu.h"
 #include "cell_vbuf.h"
@@ -108,6 +109,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
           __FUNCTION__, cvbr->vertex_buf, vertices_used);
    */
 
+   /* Make sure texture buffers aren't released until we're done rendering
+    * with them.
+    */
+   cell_add_fenced_textures(cell);
+
    /* Tell SPUs they can release the vert buf */
    if (cvbr->vertex_buf != ~0U) {
       struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 9c853c0961..a6ed29ea63 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -76,9 +76,10 @@ static void
 release_buffer(uint buffer)
 {
    /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
+   static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE};
    const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
    uint *dst = spu.init.buffer_status + index;
 
@@ -93,6 +94,29 @@ release_buffer(uint buffer)
 }
 
 
+/**
+ * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory.
+ * There's a qword of status per SPU.
+ */
+static void
+cmd_fence(struct cell_command_fence *fence_cmd)
+{
+   static const vector unsigned int status = {CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED};
+   uint *dst = (uint *) fence_cmd->fence;
+   dst += 4 * spu.init.id;  /* main store/memory address, not local store */
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_FENCE,           /* tag */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
@@ -637,6 +661,14 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
+      case CELL_CMD_FENCE:
+         {
+            struct cell_command_fence *fence_cmd =
+               (struct cell_command_fence *) &buffer[pos];
+            cmd_fence(fence_cmd);
+            pos += sizeof(*fence_cmd) / 8;
+         }
+         break;
       case CELL_CMD_RELEASE_VERTS:
          {
             struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 95ef4c9244..668af10be2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -210,7 +210,7 @@ extern struct spu_global spu;
 #define TAG_DCACHE1           21
 #define TAG_DCACHE2           22
 #define TAG_DCACHE3           23
-
+#define TAG_FENCE             24
 
 
 static INLINE void
-- 
cgit v1.2.3


From e0c6653a5fda956119239ef921daf1e3b950dfc8 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 10:35:38 -0600
Subject: cell: implement many more PPC instructions for code gen

---
 src/gallium/auxiliary/rtasm/Makefile    |   1 +
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 603 ++++++++++++++++++++++++++++++--
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 141 +++++++-
 3 files changed, 704 insertions(+), 41 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index 39b8a4dbd7..252dc5274a 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -7,6 +7,7 @@ C_SOURCES = \
 	rtasm_cpu.c \
 	rtasm_execmem.c \
 	rtasm_x86sse.c \
+	rtasm_ppc.c \
 	rtasm_ppc_spe.c
 
 include ../../Makefile.template
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 534a23568d..4a94ed0460 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -23,10 +23,19 @@
 
 /**
  * PPC code generation.
+ * For reference, see http://www.power.org/resources/reading/PowerISA_V2.05.pdf
+ * ABI info: http://www.cs.utsa.edu/~whaley/teach/cs6463FHPO/LEC/lec12_ho.pdf
+ *
+ * Other PPC refs:
+ * http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF778525699600719DF2
+ * http://www.ibm.com/developerworks/eserver/library/es-archguide-v2.html
+ * http://www.freescale.com/files/product/doc/MPCFPE32B.pdf
+ *
  * \author Brian Paul
  */
 
 
+#include <stdio.h>
 #include "util/u_memory.h"
 #include "pipe/p_debug.h"
 #include "rtasm_ppc.h"
@@ -35,30 +44,125 @@
 void
 ppc_init_func(struct ppc_function *p, unsigned max_inst)
 {
-    p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
-    p->num_inst = 0;
-    p->max_inst = max_inst;
-    p->vec_used = ~0;
+   uint i;
+
+   p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
+   p->num_inst = 0;
+   p->max_inst = max_inst;
+   p->fp_used = ~0x0;
+   p->vec_used = ~0x0;
+
+   /* only allow using gp registers 7..12 for now */
+   p->reg_used = 0x0;
+   for (i = 7; i < 13; i++)
+      p->reg_used |= (1 << i);
 }
 
 
 void
 ppc_release_func(struct ppc_function *p)
 {
-    assert(p->num_inst <= p->max_inst);
-    if (p->store != NULL) {
-        align_free(p->store);
-    }
-    p->store = NULL;
+   assert(p->num_inst <= p->max_inst);
+   if (p->store != NULL) {
+      align_free(p->store);
+   }
+   p->store = NULL;
+}
+
+
+void (*ppc_get_func(struct ppc_function *p))(void)
+{
+#if 0
+   DUMP_END();
+   if (DISASSEM && p->store)
+      debug_printf("disassemble %p %p\n", p->store, p->csr);
+
+   if (p->store == p->error_overflow)
+      return (void (*)(void)) NULL;
+   else
+#endif
+      return (void (*)(void)) p->store;
+}
+
+
+void
+ppc_dump_func(const struct ppc_function *p)
+{
+   uint i;
+   for (i = 0; i < p->num_inst; i++) {
+      debug_printf("%3u: 0x%08x\n", i, p->store[i]);
+   }
+}
+
+
+/**
+ * Allocate a general purpose register (a set bit in reg_used means "free").
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_REGS; i++) {
+      const uint64_t mask = (uint64_t) 1 << i; /* widen first: 1 << 31 overflows int */
+      if ((p->reg_used & mask) != 0) {
+         p->reg_used &= ~mask;   /* clear the bit: register is now in use */
+         return i;
+      }
+   }
+   return -1;
 }
 
 
 /**
- * Alloate a vector register.
+ * Mark the given general purpose register as "unallocated".
+ */
+void
+ppc_release_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_REGS); /* allocator returns -1 on failure */
+   assert((p->reg_used & ((uint64_t) 1 << reg)) == 0); /* must be allocated */
+   p->reg_used |= ((uint64_t) 1 << reg); /* unsigned shift: bit 31 is valid */
+}
+
+
+/**
+ * Allocate a floating point register (a set bit in fp_used means "free").
  * \return register index or -1 if none left.
  */
 int
-ppc_allocate_vec_register(struct ppc_function *p, int reg)
+ppc_allocate_fp_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_FP_REGS; i++) {
+      const uint64_t mask = (uint64_t) 1 << i; /* widen first: 1 << 31 overflows int */
+      if ((p->fp_used & mask) != 0) {
+         p->fp_used &= ~mask;   /* clear the bit: register is now in use */
+         return i;
+      }
+   }
+   return -1;
+}
+
+
+/**
+ * Mark the given floating point register as "unallocated" (sets its fp_used bit).
+ */
+void
+ppc_release_fp_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_FP_REGS); /* allocator returns -1 on failure */
+   assert((p->fp_used & ((uint64_t) 1 << reg)) == 0); /* must be allocated */
+   p->fp_used |= ((uint64_t) 1 << reg); /* unsigned shift: bit 31 is valid */
+}
+
+
+/**
+ * Allocate a vector register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p)
 {
    unsigned i;
    for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
@@ -68,7 +172,6 @@ ppc_allocate_vec_register(struct ppc_function *p, int reg)
          return i;
       }
    }
-
    return -1;
 }
 
@@ -81,7 +184,6 @@ ppc_release_vec_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_VEC_REGS);
    assert((p->vec_used & (1 << reg)) == 0);
-
    p->vec_used |= (1 << reg);
 }
 
@@ -98,6 +200,20 @@ union vx_inst {
    } inst;
 };
 
+static inline void
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vx_inst inst;
+   inst.inst.op = 4;                    /* op field is fixed at 4 for this form */
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.op2 = op2;
+   assert(p->num_inst < p->max_inst);   /* check for room before writing */
+   p->store[p->num_inst++] = inst.bits; /* append the encoded instruction word */
+}
+
+
 union vxr_inst {
    uint32_t bits;
    struct {
@@ -110,6 +226,21 @@ union vxr_inst {
    } inst;
 };
 
+static inline void
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vxr_inst inst;
+   inst.inst.op = 4;                    /* op field is fixed at 4 for this form */
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.rC = 0;                    /* rC field is always emitted as 0 here */
+   inst.inst.op2 = op2;
+   assert(p->num_inst < p->max_inst);   /* check for room before writing */
+   p->store[p->num_inst++] = inst.bits; /* append the encoded instruction word */
+}
+
+
 union va_inst {
    uint32_t bits;
    struct {
@@ -122,49 +253,204 @@ union va_inst {
    } inst;
 };
 
-
 static inline void
-emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
 {
-   union vx_inst inst;
+   union va_inst inst;
    inst.inst.op = 4;
    inst.inst.vD = vD;
    inst.inst.vA = vA;
    inst.inst.vB = vB;
+   inst.inst.vC = vC;
    inst.inst.op2 = op2;
    p->store[p->num_inst++] = inst.bits;
    assert(p->num_inst <= p->max_inst);
 };
 
-static inline void
-emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+
+union i_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned li:24;
+      unsigned aa:1;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void
+emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
 {
-   union vxr_inst inst;
-   inst.inst.op = 4;
-   inst.inst.vD = vD;
-   inst.inst.vA = vA;
-   inst.inst.vB = vB;
-   inst.inst.rC = 0;
+   union i_inst inst;
+   inst.inst.op = op;
+   inst.inst.li = li;
+   inst.inst.aa = aa;
+   inst.inst.lk = lk;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+}
+
+
+union xl_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned bo:5;
+      unsigned bi:5;
+      unsigned unused:3;
+      unsigned bh:2;
+      unsigned op2:10;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void
+emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
+        uint op2, uint lk)
+{
+   union xl_inst inst;
+   inst.inst.op = op;
+   inst.inst.bo = bo;
+   inst.inst.bi = bi;
+   inst.inst.unused = 0x0;
+   inst.inst.bh = bh;
    inst.inst.op2 = op2;
+   inst.inst.lk = lk;
    p->store[p->num_inst++] = inst.bits;
    assert(p->num_inst <= p->max_inst);
+}
+
+static INLINE void
+dump_xl(const char *name, uint inst)
+{
+   union xl_inst i;
+
+   i.bits = inst;
+   debug_printf("%s = 0x%08x\n", name, inst);
+   debug_printf(" op: %d 0x%x\n", i.inst.op, i.inst.op);
+   debug_printf(" bo: %d 0x%x\n", i.inst.bo, i.inst.bo);
+   debug_printf(" bi: %d 0x%x\n", i.inst.bi, i.inst.bi);
+   debug_printf(" unused: %d 0x%x\n", i.inst.unused, i.inst.unused);
+   debug_printf(" bh: %d 0x%x\n", i.inst.bh, i.inst.bh);
+   debug_printf(" op2: %d 0x%x\n", i.inst.op2, i.inst.op2);
+   debug_printf(" lk: %d 0x%x\n", i.inst.lk, i.inst.lk);
+}
+
+
+union x_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vrs:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned op2:10;
+      unsigned unused:1;
+   } inst;
 };
 
-static inline void
-emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+static INLINE void
+emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
 {
-   union va_inst inst;
-   inst.inst.op = 4;
-   inst.inst.vD = vD;
-   inst.inst.vA = vA;
-   inst.inst.vB = vB;
-   inst.inst.vC = vC;
+   union x_inst inst;
+   inst.inst.op = op;
+   inst.inst.vrs = vrs;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
    inst.inst.op2 = op2;
+   inst.inst.unused = 0x0;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+}
+
+
+union d_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned si:16;
+   } inst;
+};
+
+static inline void
+emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
+{
+   union d_inst inst;
+   assert(si >= -32768);                /* si must fit the 16-bit signed field */
+   assert(si <= 32767);
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.si = (unsigned) (si & 0xffff);   /* keep the low 16 bits only */
+   assert(p->num_inst < p->max_inst);   /* check for room before writing */
+   p->store[p->num_inst++] = inst.bits; /* append the encoded instruction word */
+}
 
 
+union a_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned frt:5;
+      unsigned fra:5;
+      unsigned frb:5;
+      unsigned unused:5;
+      unsigned op2:5;
+      unsigned rc:1;
+   } inst;
+};
+
+static inline void
+emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
+       uint rc)
+{
+   union a_inst inst;
+   inst.inst.op = op;
+   inst.inst.frt = frt;
+   inst.inst.fra = fra;
+   inst.inst.frb = frb;
+   inst.inst.unused = 0x0;              /* 5-bit field not used by callers here */
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   assert(p->num_inst < p->max_inst);   /* check for room before writing */
+   p->store[p->num_inst++] = inst.bits; /* append the encoded instruction word */
+}
+
+
+union xo_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned oe:1;
+      unsigned op2:9;
+      unsigned rc:1;
+   } inst;
+};
+
+static INLINE void
+emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
+        uint op2, uint rc)
+{
+   union xo_inst inst;
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
+   inst.inst.oe = oe;
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   assert(p->num_inst < p->max_inst);   /* check for room before writing */
+   p->store[p->num_inst++] = inst.bits; /* append the encoded instruction word */
+}
+
+
+
+
 
 /**
  ** float vector arithmetic
@@ -172,7 +458,7 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
 
 /** vector float add */
 void
-ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB)
+ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
    emit_vx(p, 10, vD, vA, vB);
 }
@@ -198,11 +484,11 @@ ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
    emit_vx(p, 1034, vD, vA, vB);
 }
 
-/** vector float mult add */
+/** vector float mult add: vD = vA * vB + vC */
 void
 ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
 {
-   emit_va(p, 46, vD, vA, vB, vC);
+   emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
 }
 
 /** vector float compare greater than */
@@ -282,13 +568,26 @@ ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
    emit_vx(p, 586, vD, 0, vB);
 }
 
+/** vector store: store vR at mem[vA+vB] */
+void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 231);
+}
+
+/** vector load: vR = mem[vA+vB] */
+void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 103);
+}
+
 
 
 /**
- ** bitwise operations
+ ** vector bitwise operations
  **/
 
-
 /** vector and */
 void
 ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
@@ -324,6 +623,14 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
    emit_vx(p, 1220, vD, vA, vB);
 }
 
+/** Pseudo-instruction: vector move */
+void
+ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
+{
+   ppc_vor(p, vD, vA, vA);
+}
+
+
 
 /**
  ** Vector shuffle / select / splat / etc
@@ -363,3 +670,225 @@ ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
 {
    emit_vx(p, 652, vD, imm, vB);
 }
+
+/** vector splat signed immediate word; imm must be in [-16, 15] */
+void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm)
+{
+   assert(imm >= -16);
+   assert(imm <= 15);   /* 5-bit signed immediate: 15 is a legal value */
+   emit_vx(p, 908, vD, imm, 0);
+}
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 388, vD, vA, vB);
+}
+
+
+
+
+/**
+ ** integer arithmetic
+ **/
+
+/** rt = ra + imm */
+void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 14, rt, ra, imm);
+}
+
+/** rt = ra + (imm << 16) */
+void
+ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 15, rt, ra, imm);
+}
+
+/** rt = ra + rb */
+void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_xo(p, 31, rt, ra, rb, 0, 266, 0);
+}
+
+/** rt = ra AND rb */
+void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 28);  /* note argument order */
+}
+
+/** rt = ra AND imm */
+void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 28, ra, rt, imm);  /* note argument order */
+}
+
+/** rt = ra OR rb */
+void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 444);  /* note argument order */
+}
+
+/** rt = ra OR imm */
+void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 24, ra, rt, imm);  /* note argument order */
+}
+
+/** rt = ra XOR rb */
+void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 316);  /* note argument order */
+}
+
+/** rt = ra XOR imm */
+void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 26, ra, rt, imm);  /* note argument order */
+}
+
+/** pseudo instruction: move: rt = ra */
+void
+ppc_mr(struct ppc_function *p, uint rt, uint ra)
+{
+   ppc_or(p, rt, ra, ra);
+}
+
+/** pseudo instruction: load immediate: rt = imm */
+void
+ppc_li(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addi(p, rt, 0, imm);
+}
+
+/** rt = imm << 16 */
+void
+ppc_lis(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addis(p, rt, 0, imm);
+}
+
+/** rt = imm (32-bit constant built from its high and low 16-bit halves) */
+void
+ppc_load_int(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_lis(p, rt, (imm >> 16));          /* rt = high half << 16 */
+   ppc_ori(p, rt, rt, (int16_t) (imm & 0xffff)); /* same 16 bits, inside emit_d()'s signed range */
+}
+
+
+
+
+/**
+ ** integer load/store
+ **/
+
+/** store rs at memory[(ra)+d],
+ * then update ra = (ra)+d
+ */
+void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 37, rs, ra, d);
+}
+
+/** store rs at memory[(ra)+d] */
+void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 36, rs, ra, d);
+}
+
+/** Load rt = mem[(ra)+d], then set the high 32 bits to zero. */
+void
+ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d)
+{
+   emit_d(p, 32, rt, ra, d);
+}
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+/** add: frt = fra + frb */
+void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 21, 0);
+}
+
+/** sub: frt = fra - frb */
+void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 20, 0);
+}
+
+/** convert to int: rt = (int) fra */
+void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint fra)
+{
+   emit_x(p, 63, rt, 0, fra, 15);
+}
+
+/** store frs at mem[(ra)+offset] */
+void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset)
+{
+   emit_d(p, 52, frs, ra, offset);
+}
+
+/** store frs at mem[(ra)+(rb)] */
+void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb)
+{
+   emit_x(p, 31, frs, ra, rb, 983);
+}
+
+/** load frt = mem[(ra)+offset] */
+void
+ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset)
+{
+   emit_d(p, 48, frt, ra, offset);
+}
+
+
+
+
+
+/**
+ ** branch instructions
+ **/
+
+/** BLR: Branch to link register, i.e. unconditional bclr (XL-form, 19/16) */
+void
+ppc_blr(struct ppc_function *p)
+{
+   emit_xl(p, 19, BRANCH_COND_ALWAYS, 0, BRANCH_HINT_SUB_RETURN, 16, 0);
+}
+
+/** Branch Conditional to Link Register (p. 36) */
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg)
+{
+   emit_xl(p, 19, condOp, condReg, branchHint, 16, 0);
+}
+
+/** Pseudo instruction: return from subroutine */
+void
+ppc_return(struct ppc_function *p)
+{
+   ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0);
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index ed14e943df..6370b60494 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -36,27 +36,46 @@
 
 #define PPC_INST_SIZE 4  /**< 4 bytes / instruction */
 
+#define PPC_NUM_REGS 32
+#define PPC_NUM_FP_REGS 32
 #define PPC_NUM_VEC_REGS 32
 
+/** Stack pointer register */
+#define PPC_REG_SP 1
+
+/** Branch conditions */
+#define BRANCH_COND_ALWAYS       0x14  /* binary 1z1zz (z=ignored) */
+
+/** Branch hints */
+#define BRANCH_HINT_SUB_RETURN   0x0   /* binary 00 */
+
 
 struct ppc_function
 {
    uint32_t *store;  /**< instruction buffer */
    uint num_inst;
    uint max_inst;
-   uint32_t vec_used;   /** used/free vector registers bitmask */
    uint32_t reg_used;   /** used/free general-purpose registers bitmask */
+   uint32_t fp_used;   /** used/free floating point registers bitmask */
+   uint32_t vec_used;   /** used/free vector registers bitmask */
 };
 
 
 extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
 extern void ppc_release_func(struct ppc_function *p);
-
-extern int ppc_allocate_vec_register(struct ppc_function *p, int reg);
+extern void (*ppc_get_func( struct ppc_function *p ))( void );
+extern void ppc_dump_func(const struct ppc_function *p);
+
+extern int ppc_allocate_register(struct ppc_function *p);
+extern void ppc_release_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_fp_register(struct ppc_function *p);
+extern void ppc_release_fp_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_vec_register(struct ppc_function *p);
 extern void ppc_release_vec_register(struct ppc_function *p, int reg);
 
 
+
 /**
  ** float vector arithmetic
  **/
@@ -126,9 +145,18 @@ extern void
 ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
 
 
+/** vector store: store vR at mem[vA+vB] */
+extern void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+/** vector load: vR = mem[vA+vB] */
+extern void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+
 
 /**
- ** bitwise operations
+ ** vector bitwise operations
  **/
 
 
@@ -152,6 +180,10 @@ ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
 extern void
 ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 
+/** Pseudo-instruction: vector move */
+extern void
+ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
+
 
 /**
  ** Vector shuffle / select / splat / etc
@@ -177,5 +209,106 @@ ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
 extern void
 ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
 
+/** vector splat signed immediate word */
+extern void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm);
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+extern void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+
+/**
+ ** scalar arithmetic
+ **/
+
+extern void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_mr(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_li(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_lis(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_load_int(struct ppc_function *p, uint rt, int imm);
+
+
+
+/**
+ ** scalar load/store
+ **/
+
+extern void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_lwz(struct ppc_function *p, uint rs, uint ra, int d);
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+extern void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset);
+
+extern void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb);
+
+
+
+/**
+ ** branch instructions
+ **/
+
+extern void
+ppc_blr(struct ppc_function *p);
+
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg);
+
+extern void
+ppc_return(struct ppc_function *p);
+
 
 #endif /* RTASM_PPC_H */
-- 
cgit v1.2.3


From 3aea9c463b7c6b5ba63796ee84f65870662b6567 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:04:29 -0600
Subject: cell: include pthread.h

---
 src/gallium/drivers/cell/ppu/cell_spu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 2e965c6301..b633880c25 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -31,6 +31,7 @@
 
 #include <libspe2.h>
 #include <libmisc.h>
+#include <pthread.h>
 #include "cell/common.h"
 
 #include "cell_context.h"
-- 
cgit v1.2.3


From 049f57f86a2cb8ff08fba819c581a034ca7ea52c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:06:39 -0600
Subject: gallium: added ppc_lvewx()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 7 +++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 4 ++++
 2 files changed, 11 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 4a94ed0460..aaec2d2191 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -582,6 +582,13 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
    emit_x(p, 31, vR, vA, vB, 103);
 }
 
+/** load vector element word: vR = mem_word[vA+vB] */
+void
+ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 71);
+}
+
 
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 6370b60494..53d5746dc8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -153,6 +153,10 @@ ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
+/** load vector element word: vR = mem_word[vA+vB] */
+extern void
+ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
 
 
 /**
-- 
cgit v1.2.3


From 70f4ad44985e3ec6dabc1b0e55a5bf85803a4cd4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:07:35 -0600
Subject: gallium: TGSI to PPC code generation

Based on the TGSI to SSE2 code generator.
Incomplete and lots of SSE stuff still hanging around but the basic dozen
or so TGSI opcodes are functioning.
---
 src/gallium/auxiliary/tgsi/Makefile   |    1 +
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 2781 +++++++++++++++++++++++++++++++++
 src/gallium/auxiliary/tgsi/tgsi_ppc.h |   48 +
 3 files changed, 2830 insertions(+)
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_ppc.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_ppc.h

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
index c7155a9316..d7df9490cf 100644
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -11,6 +11,7 @@ C_SOURCES = \
 	tgsi_info.c \
 	tgsi_iterate.c \
 	tgsi_parse.c \
+	tgsi_ppc.c \
 	tgsi_scan.c \
 	tgsi_sse2.c \
 	tgsi_text.c \
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
new file mode 100644
index 0000000000..112e736523
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -0,0 +1,2781 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI to PowerPC code generation.
+ */
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_debug.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_sse.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi_exec.h"
+#include "tgsi_ppc.h"
+#include "rtasm/rtasm_ppc.h"
+
+
+/* for 1/sqrt()
+ *
+ * This costs about 100fps (close to 10%) in gears:
+ */
+#define HIGH_PRECISION 1
+
+#define FAST_MATH 1
+
+
+#define FOR_EACH_CHANNEL( CHAN )\
+   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
+
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
+#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C
+
+#define TEMP_R0   TGSI_EXEC_TEMP_R0
+#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+
+
+/**
+ * Context/state used during code gen.
+ */
+struct gen_context
+{
+   struct ppc_function *f;
+   int inputs_reg;    /**< register pointing to input params */
+   int outputs_reg;   /**< register pointing to output params */
+   int temps_reg;     /**< register pointing to temporary "registers" */
+   int immed_reg;     /**< register pointing to immediates buffer */
+   int const_reg;     /**< register pointing to constants buffer */
+};
+
+
+
+#if 0000
+
+/**
+ * X86 utility functions.
+ */
+
+static struct x86_reg
+make_xmm(
+   unsigned xmm )
+{
+   return x86_make_reg(
+      file_XMM,
+      (enum x86_reg_name) xmm );
+}
+
+/**
+ * X86 register mapping helpers.
+ */
+
+static struct x86_reg
+get_const_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_CX );
+}
+
+static struct x86_reg
+get_input_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_AX );
+}
+
+static struct x86_reg
+get_output_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DX );
+}
+
+static struct x86_reg
+get_temp_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_BX );
+}
+
+static struct x86_reg
+get_coef_base( void )
+{
+   return get_output_base();
+}
+
+static struct x86_reg
+get_immediate_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DI );
+}
+
+
+/**
+ * Data access helpers.
+ */
+
+
+static struct x86_reg
+get_immediate(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_immediate_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
+get_const(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_const_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
+get_input(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_input_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_output(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_output_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_temp(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_temp_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_coef(
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   return x86_make_disp(
+      get_coef_base(),
+      ((vec * 3 + member) * 4 + chan) * 4 );
+}
+
+
+static void
+emit_ret(
+   struct x86_function  *func )
+{
+   x86_ret( func );
+}
+
+#endif
+
+/**
+ * Data fetch helpers.
+ */
+
+#if 00
+/**
+ * Copy a shader constant to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src const buffer index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_const(
+   struct x86_function *func,
+   uint xmm,
+   int vec,
+   uint chan,
+   uint indirect,
+   uint indirectFile,
+   int indirectIndex )
+{
+   if (indirect) {
+      struct x86_reg r0 = get_input_base();
+      struct x86_reg r1 = get_output_base();
+      uint i;
+
+      assert( indirectFile == TGSI_FILE_ADDRESS );
+      assert( indirectIndex == 0 );
+
+      x86_push( func, r0 );
+      x86_push( func, r1 );
+
+      for (i = 0; i < QUAD_SIZE; i++) {
+         x86_lea( func, r0, get_const( vec, chan ) );
+         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
+
+         /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
+          */
+         x86_add( func, r1, r1 );
+         x86_add( func, r1, r1 );
+         x86_add( func, r1, r1 );
+         x86_add( func, r1, r1 );
+
+         x86_add( func, r0, r1 );
+         x86_mov( func, r1, x86_deref( r0 ) );
+         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
+      }
+
+      x86_pop( func, r1 );
+      x86_pop( func, r0 );
+
+      sse_movaps(
+         func,
+         make_xmm( xmm ),
+         get_temp( TEMP_R0, CHAN_X ) );
+   }
+   else {
+      assert( vec >= 0 );
+
+      sse_movss(
+         func,
+         make_xmm( xmm ),
+         get_const( vec, chan ) );
+      sse_shufps(
+         func,
+         make_xmm( xmm ),
+         make_xmm( xmm ),
+         SHUF( 0, 0, 0, 0 ) );
+   }
+}
+
+static void
+emit_immediate(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_immediate( vec, chan ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+
+/**
+ * Copy a shader input to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src input attrib
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_inputf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      make_xmm( xmm ),
+      get_input( vec, chan ) );
+}
+
+/**
+ * Store an xmm register to a shader output
+ * \param xmm  the source xmm register
+ * \param vec  the dest output attrib
+ * \param chan  dest channel to store (X, Y, Z or W)
+ */
+static void
+emit_output(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      get_output( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+/**
+ * Copy a shader temporary to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src temp register
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_tempf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movaps(
+      func,
+      make_xmm( xmm ),
+      get_temp( vec, chan ) );
+}
+
+/**
+ * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
+ * \param xmm  the destination xmm register
+ * \param vec  the src input/attribute coefficient index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ * \param member  0=a0, 1=dadx, 2=dady
+ */
+static void
+emit_coef(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_coef( vec, chan, member ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+/**
+ * Data store helpers.
+ */
+
+static void
+emit_inputs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      get_input( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_temps(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movaps(
+      func,
+      get_temp( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_addrs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   assert( vec == 0 );
+
+   emit_temps(
+      func,
+      xmm,
+      vec + TGSI_EXEC_TEMP_ADDR,
+      chan );
+}
+
+/**
+ * Coefficient fetch helpers.
+ */
+
+static void
+emit_coef_a0(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      0 );
+}
+
+static void
+emit_coef_dadx(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      1 );
+}
+
+static void
+emit_coef_dady(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      2 );
+}
+#endif
+
+
+/**
+ * Function call helpers.
+ */
+
+#if 00
+/**
+ * Emit a call to the C helper 'code', passing it a pointer to TEMP_R0.
+ *
+ * The generated sequence spills xmm_dst to get_temp(TEMP_R0, 0), pushes
+ * EAX/ECX/EDX, saves every register in [xmm0, xmm_save) except xmm_dst on
+ * the stack, pushes the address of the temp as the single argument, calls
+ * 'code', then restores everything and reloads xmm_dst from the temp.
+ *
+ * \param func      function being assembled
+ * \param xmm_save  registers xmm0 .. xmm_save-1 are preserved across the call
+ * \param xmm_dst   register holding the operand and receiving the result
+ * \param code      helper to call; the visible helpers update the temp in place
+ *
+ * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
+ * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
+ * that the stack pointer is 16 byte aligned, as expected.
+ */
+static void
+emit_func_call_dst(
+   struct x86_function *func,
+   unsigned xmm_save,
+   unsigned xmm_dst,
+   void (PIPE_CDECL *code)() )
+{
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   unsigned i, n;
+   unsigned xmm_mask;
+
+   /* Bitmask of the xmm registers to save */
+   xmm_mask = (1 << xmm_save) - 1;
+   xmm_mask &= ~(1 << xmm_dst);
+   /* Spill the operand so the callee can reach it through a pointer */
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 0 ),
+      make_xmm( xmm_dst ) );
+
+   x86_push( func, x86_make_reg( file_REG32, reg_AX) );
+   x86_push( func, x86_make_reg( file_REG32, reg_CX) );
+   x86_push( func, x86_make_reg( file_REG32, reg_DX) );
+   /* Count the xmm registers to save: each one needs a 16-byte slot */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i))
+         ++n;
+
+   x86_sub_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+   /* Save: bit i of the mask stands for register xmm<i>, slot n */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+            make_xmm( i ) );
+         ++n;
+      }
+   /* Push &TEMP_R0 as the only argument, call, then drop the argument */
+   x86_lea(
+      func,
+      ecx,
+      get_temp( TEMP_R0, 0 ) );
+
+   x86_push( func, ecx );
+   x86_mov_reg_imm( func, ecx, (unsigned long) code );
+   x86_call( func, ecx );
+   x86_pop(func, ecx );
+   /* Restore the saved xmm registers from the same slots */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            make_xmm( i ),
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+         ++n;
+      }
+
+   x86_add_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+
+   /* Restore GP registers in a reverse order.
+    */
+   x86_pop( func, x86_make_reg( file_REG32, reg_DX) );
+   x86_pop( func, x86_make_reg( file_REG32, reg_CX) );
+   x86_pop( func, x86_make_reg( file_REG32, reg_AX) );
+   /* Fetch the helper's result back into xmm_dst */
+   sse_movaps(
+      func,
+      make_xmm( xmm_dst ),
+      get_temp( TEMP_R0, 0 ) );
+}
+
+/**
+ * Two-operand variant of emit_func_call_dst(): xmm_src is first copied to
+ * get_temp(TEMP_R0, 1), then the one-operand path is emitted, which takes
+ * care of xmm_dst, the register saves and the call to 'code'.
+ * pow4f() reads its second operand at store[4]; presumably that is the
+ * slot written here -- confirm against get_temp().
+ */
+static void
+emit_func_call_dst_src(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst,
+   unsigned xmm_src,
+   void (PIPE_CDECL *code)() )
+{
+   sse_movaps( func, get_temp( TEMP_R0, 1 ), make_xmm( xmm_src ) );
+
+   emit_func_call_dst( func, xmm_save, xmm_dst, code );
+}
+
+/* Fast SSE2 implementation of special math functions.  POLYn(x, c0..cn)
+ * expands to a Horner evaluation of c0 + c1*x + ... + cn*x^n on packed
+ * floats; the argument x is pasted n times, so pass a plain variable. */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3 /* selects the polynomial used by exp2f4() */
+#define LOG_POLY_DEGREE 5 /* selects the polynomial used by log2f4() */
+
+/**
+ * Approximate 2^x for four packed floats: clamp x, split it into an
+ * integer part and a fraction, build 2^int from the exponent bits and
+ * multiply by a polynomial in the fraction (EXP_POLY_DEGREE picks it).
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+exp2f4(__m128 x)
+{
+   __m128i whole;
+   __m128 frac, pow2_whole, pow2_frac;
+
+   /* clamp the input to [-126.99999, 129] */
+   x = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps( 129.00000f)), _mm_set1_ps(-126.99999f));
+
+   /* whole = int(x - 0.5), frac = x - whole */
+   whole = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+   frac = _mm_sub_ps(x, _mm_cvtepi32_ps(whole));
+
+   /* pow2_whole = (float) (1 << whole), built in the exponent field */
+   pow2_whole = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(whole, _mm_set1_epi32(127)), 23));
+
+   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+   pow2_frac = POLY5(frac, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+   pow2_frac = POLY4(frac, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+   pow2_frac = POLY3(frac, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+   pow2_frac = POLY2(frac, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+   return _mm_mul_ps(pow2_whole, pow2_frac);
+}
+
+/**
+ * Approximate log2(x) for four packed floats as exponent(x) plus a
+ * polynomial in the mantissa (LOG_POLY_DEGREE picks the polynomial).
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+log2f4(__m128 x)
+{
+   const __m128 one = _mm_set1_ps(1.0f);
+   const __m128i bits = _mm_castps_si128(x);
+   __m128i biased, fraction;
+   __m128 e, m, p;
+
+   /* e = (float) exponent(x): isolate the exponent field, remove the bias */
+   biased = _mm_srli_epi32(_mm_and_si128(bits, _mm_set1_epi32(0x7f800000)), 23);
+   e = _mm_cvtepi32_ps(_mm_sub_epi32(biased, _mm_set1_epi32(127)));
+
+   /* m = (float) mantissa(x): keep the fraction bits, OR in the bits of 1.0f */
+   fraction = _mm_and_si128(bits, _mm_set1_epi32(0x007fffff));
+   m = _mm_or_ps(_mm_castsi128_ps(fraction), one);
+
+   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
+    * These coefficients can be generated with 
+    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+    */
+#if LOG_POLY_DEGREE == 6
+   p = POLY5(m, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+   p = POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+   p = POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+   p = POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+   p = _mm_mul_ps(p, _mm_sub_ps(m, one));
+   return _mm_add_ps(p, e);
+}
+
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+   const __m128 scaled_log = _mm_mul_ps(log2f4(x), y);   /* y * log2(x) */
+   return exp2f4(scaled_log);                            /* 2 ** (y * log2(x)) */
+}
+
+/**
+ * Low-level instruction translators.
+ */
+
+/* Emit ANDPS of register xmm with the TGSI_EXEC_TEMP_7FFFFFFF temp
+ * (presumably a 0x7fffffff mask that clears each sign bit -- confirm).
+ */
+static void
+emit_abs(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   struct x86_reg reg = make_xmm( xmm );
+   sse_andps( func, reg,
+              get_temp( TGSI_EXEC_TEMP_7FFFFFFF_I, TGSI_EXEC_TEMP_7FFFFFFF_C ) );
+}
+
+/* Emit ADDPS on the xmm registers numbered xmm_dst and xmm_src. */
+static void
+emit_add(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   struct x86_reg dst = make_xmm( xmm_dst );
+   struct x86_reg src = make_xmm( xmm_src );
+   sse_addps( func, dst, src );
+}
+
+/* Replace each of the four floats at 'store' with its cosine. */
+static void PIPE_CDECL
+cos4f(
+   float *store )
+{
+   unsigned lane;
+   for (lane = 0; lane < 4; ++lane)
+      store[lane] = cosf( store[lane] );
+}
+
+/* Emit a call to cos4f() for register xmm_dst.  All the work (spilling,
+ * register saves, the call itself) is done by emit_func_call_dst();
+ * xmm_save is passed through to it unchanged.
+ */
+static void
+emit_cos(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst( func, xmm_save, xmm_dst, cos4f );
+}
+
+static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
+ex24f( float *store )
+{
+   const __m128 in = _mm_load_ps( store );   /* aligned load of 4 floats */
+   _mm_store_ps( store, exp2f4( in ) );      /* 2^x written back in place */
+}
+
+/* Emit a call to ex24f() for register xmm_dst.  All the work (spilling,
+ * register saves, the call itself) is done by emit_func_call_dst();
+ * xmm_save is passed through to it unchanged.
+ */
+static void
+emit_ex2(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst( func, xmm_save, xmm_dst, ex24f );
+}
+
+/* Emit CVTTPS2DQ with register xmm as both destination and source,
+ * i.e. an in-place truncating float -> int32 conversion. */
+static void
+emit_f2it(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   const struct x86_reg reg = make_xmm( xmm );
+   sse2_cvttps2dq( func, reg, reg );
+}
+
+/* Replace each of the four floats at 'store' with its floor. */
+static void PIPE_CDECL
+flr4f(
+   float *store )
+{
+   unsigned lane;
+   for (lane = 0; lane < 4; ++lane)
+      store[lane] = floorf( store[lane] );
+}
+
+/* Emit a call to flr4f() for register xmm_dst.  All the work (spilling,
+ * register saves, the call itself) is done by emit_func_call_dst();
+ * xmm_save is passed through to it unchanged.
+ */
+static void
+emit_flr(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst( func, xmm_save, xmm_dst, flr4f );
+}
+
+/* Replace each of the four floats x at 'store' with x - floor(x). */
+static void PIPE_CDECL
+frc4f(
+   float *store )
+{
+   unsigned lane;
+   for (lane = 0; lane < 4; ++lane)
+      store[lane] -= floorf( store[lane] );
+}
+
+/* Emit a call to frc4f() for register xmm_dst.  All the work (spilling,
+ * register saves, the call itself) is done by emit_func_call_dst();
+ * xmm_save is passed through to it unchanged.
+ */
+static void
+emit_frc(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst( func, xmm_save, xmm_dst, frc4f );
+}
+
+static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
+lg24f( float *store )
+{
+   const __m128 in = _mm_load_ps( store );   /* aligned load of 4 floats */
+   _mm_store_ps( store, log2f4( in ) );      /* log2(x) written back in place */
+}
+
+/* Emit a call to lg24f() for register xmm_dst.  All the work (spilling,
+ * register saves, the call itself) is done by emit_func_call_dst();
+ * xmm_save is passed through to it unchanged.
+ */
+static void
+emit_lg2(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst( func, xmm_save, xmm_dst, lg24f );
+}
+
+/* Emit MOVUPS on the xmm registers numbered xmm_dst and xmm_src. */
+static void
+emit_MOV(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   struct x86_reg dst = make_xmm( xmm_dst );
+   struct x86_reg src = make_xmm( xmm_src );
+   sse_movups( func, dst, src );
+}
+
+/* Emit MULPS on the xmm registers numbered xmm_dst and xmm_src. */
+static void
+emit_mul (struct x86_function *func,
+          unsigned xmm_dst,
+          unsigned xmm_src)
+{
+   struct x86_reg dst = make_xmm( xmm_dst );
+   struct x86_reg src = make_xmm( xmm_src );
+   sse_mulps( func, dst, src );
+}
+
+/* Emit XORPS of register xmm with the TGSI_EXEC_TEMP_80000000 temp
+ * (presumably a 0x80000000 mask that flips each sign bit -- confirm).
+ */
+static void
+emit_neg(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   struct x86_reg reg = make_xmm( xmm );
+   sse_xorps( func, reg,
+              get_temp( TGSI_EXEC_TEMP_80000000_I, TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
+pow4f( float *store )   /* store[0..3] = store[0..3] ** store[4..7] */
+{
+#if 1
+   const __m128 base = _mm_load_ps( store );
+   const __m128 exponent = _mm_load_ps( store + 4 );
+   _mm_store_ps( store, powf4( base, exponent ) );
+#else
+   unsigned lane;
+   for (lane = 0; lane < 4; ++lane)
+      store[lane] = powf( store[lane], store[lane + 4] );
+#endif
+}
+
+/* Emit a call to pow4f() with xmm_dst as the first operand (and result)
+ * and xmm_src as the second.  emit_func_call_dst_src() places both
+ * operands in the temp area and performs the call; xmm_save is passed
+ * through to it unchanged.
+ */
+static void
+emit_pow(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_func_call_dst_src( func, xmm_save, xmm_dst, xmm_src, pow4f );
+}
+
+/* Emit RCPPS on the xmm registers numbered xmm_dst and xmm_src. */
+static void
+emit_rcp (
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   struct x86_reg dst = make_xmm( xmm_dst );
+   struct x86_reg src = make_xmm( xmm_src );
+
+   /* XXX: on Intel CPUs at least the result is only good to 12 bits, which
+    * is too coarse.  A real divide, or the refinement step shown in
+    * emit_rsqrt() below, should replace this. */
+   sse2_rcpps( func, dst, src );
+}
+
+static void
+emit_rsqrt(
+   struct x86_function *func,   /* function being assembled */
+   unsigned xmm_dst,            /* number of the xmm register receiving the result */
+   unsigned xmm_src )           /* number of the xmm register holding the input */
+{
+#if HIGH_PRECISION
+   /* Emit dst = 1/sqrt(src) with one Newton-Raphson refinement step.
+    * rsqrtps() and rcpps() are low precision on some/all SSE
+    * implementations, but one such step improves them at low cost:
+    * 
+    * rcp:   x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+    * rsqrt: x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+    *
+    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+    */
+   {
+      struct x86_reg dst = make_xmm( xmm_dst );
+      struct x86_reg src = make_xmm( xmm_src );
+      struct x86_reg tmp0 = make_xmm( 2 );   /* xmm2 and xmm3 are hard-wired scratch */
+      struct x86_reg tmp1 = make_xmm( 3 );
+
+      assert( xmm_dst != xmm_src );
+      assert( xmm_dst != 2 && xmm_dst != 3 );
+      assert( xmm_src != 2 && xmm_src != 3 );
+
+      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst  = HALF temp */
+      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = THREE temp */
+      sse_rsqrtps( func, tmp1, src  );   /* tmp1 = r = rsqrtps(a) */
+      sse_mulps(   func, src,  tmp1 );   /* src  = a * r   (src is overwritten) */
+      sse_mulps(   func, dst,  tmp1 );   /* dst  = HALF * r */
+      sse_mulps(   func, src,  tmp1 );   /* src  = a * r * r */
+      sse_subps(   func, tmp0, src  );   /* tmp0 = THREE - a * r * r */
+      sse_mulps(   func, dst,  tmp0 );   /* dst  = HALF * r * (THREE - a * r * r) */
+   }
+#else
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.
+    */
+   sse_rsqrtps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+#endif
+}
+
+/* Emit ORPS of register xmm with the TGSI_EXEC_TEMP_80000000 temp
+ * (presumably a 0x80000000 mask that sets each sign bit -- confirm).
+ */
+static void
+emit_setsign(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   struct x86_reg reg = make_xmm( xmm );
+   sse_orps( func, reg,
+             get_temp( TGSI_EXEC_TEMP_80000000_I, TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+/* Replace each of the four floats at 'store' with its sine. */
+static void PIPE_CDECL
+sin4f(
+   float *store )
+{
+   unsigned lane;
+   for (lane = 0; lane < 4; ++lane)
+      store[lane] = sinf( store[lane] );
+}
+
+/* Emit a call to sin4f() for register xmm_dst.  All the work (spilling,
+ * register saves, the call itself) is done by emit_func_call_dst();
+ * xmm_save is passed through to it unchanged.
+ */
+static void
+emit_sin (struct x86_function *func,
+          unsigned xmm_save, 
+          unsigned xmm_dst)
+{
+   emit_func_call_dst( func, xmm_save, xmm_dst, sin4f );
+}
+
+/* Emit SUBPS on the xmm registers numbered xmm_dst and xmm_src. */
+static void
+emit_sub(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   struct x86_reg dst = make_xmm( xmm_dst );
+   struct x86_reg src = make_xmm( xmm_src );
+   sse_subps( func, dst, src );
+}
+#endif
+
+/**
+ * Register fetch: emit code loading one channel of source 'reg' (chosen by
+ * chan_index through the extended swizzle) into vector register vec_reg.
+ */
+static void
+emit_fetch(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_src_register *reg,
+           const unsigned chan_index)
+{
+   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:   /* inputs: one 16-byte vector per (register, channel) */
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:   /* same layout, based at temps_reg */
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:   /* same layout, based at immed_reg */
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_CONSTANT:   /* constants: one 4-byte float per (register, channel) */
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+            ppc_li(gen->f, offset_reg, offset);
+            /* lvewx fills only the word slot selected by the address bits */
+            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
+            /* splat that slot; it is word[swizzle] if const_reg is 16-byte aligned */
+            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:   /* XXX not implemented: vec_reg is left untouched */
+#if 0
+      emit_tempf(
+         func,
+         xmm,
+         TGSI_EXEC_TEMP_00000000_I,
+         TGSI_EXEC_TEMP_00000000_C );
+#endif
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:   /* XXX not implemented: vec_reg is left untouched */
+#if 0
+      emit_tempf(
+         func,
+         xmm,
+         TEMP_ONE_I,
+         TEMP_ONE_C );
+#endif
+      break;
+
+   default:
+      assert( 0 );
+   }
+
+#if 0   /* XXX source sign modes are not applied by the live code yet */
+   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      emit_abs( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      emit_setsign( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      emit_neg( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+#endif
+}
+
+/* Shorthand: emit_fetch() of channel CHAN of INST's source operand SRC_REG into VEC_REG. */
+#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
+   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+
+
+/**
+ * Register store: emit code writing vec_reg to channel chan_index of *reg.
+ */
+static void
+emit_store(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,   /* only read by disabled code */
+           unsigned chan_index)
+{
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:   /* one 16-byte vector per (register, channel) */
+      {
+         const int addr_reg = ppc_allocate_register(gen->f);
+         const int byte_offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, addr_reg, byte_offset);
+         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, addr_reg);
+         ppc_release_register(gen->f, addr_reg);
+      }
+      break;
+   case TGSI_FILE_TEMPORARY:   /* same layout, based at temps_reg */
+      {
+         const int addr_reg = ppc_allocate_register(gen->f);
+         const int byte_offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, addr_reg, byte_offset);
+         ppc_stvx(gen->f, vec_reg, gen->temps_reg, addr_reg);
+         ppc_release_register(gen->f, addr_reg);
+      }
+      break;
+#if 0
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+#endif
+   default:
+      assert( 0 );
+   }
+
+#if 0   /* saturation modes are not handled by the live code */
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+#endif
+}
+
+
+/* Shorthand: emit_store() of register XMM to channel CHAN of INST's destination INDEX. */
+#define STORE( GEN, INST, XMM, INDEX, CHAN )\
+   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+
+
+#if 000
+/**
+ * High-level instruction translators.
+ */
+
+static void
+emit_kil(
+   struct x86_function *func,                   /* function being assembled */
+   const struct tgsi_full_src_register *reg )   /* source whose components are tested */
+{
+   unsigned uniquemask;
+   unsigned registers[4];
+   unsigned nextregister = 0;
+   unsigned firstchan = ~0;
+   unsigned chan_index;
+
+   /* KIL: OR a "component < 0" byte mask into the KILMASK temp.  uniquemask
+    * records components already fetched; ZERO and ONE are pre-marked since
+    * 0.0 and 1.0 can never be less than zero. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      unsigned swizzle;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle(
+         reg,
+         chan_index );
+
+      /* check if the component has not been already tested */
+      if( !(uniquemask & (1 << swizzle)) ) {
+         uniquemask |= 1 << swizzle;
+
+         /* allocate register */
+         registers[chan_index] = nextregister;
+         emit_fetch(   /* NOTE(review): emit_fetch() in this file now takes a gen_context, not func */
+            func,
+            nextregister,
+            reg,
+            chan_index );
+         nextregister++;
+
+         /* mark the first channel used */
+         if( firstchan == ~0 ) {
+            firstchan = chan_index;
+         }
+      }
+   }
+
+   x86_push(   /* EAX and EDX are used as scratch below, so preserve them */
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      if( uniquemask & (1 << chan_index) ) {   /* NOTE(review): bits were set by swizzle value, registers[] by channel -- verify */
+         sse_cmpps(
+            func,
+            make_xmm( registers[chan_index] ),
+            get_temp(
+               TGSI_EXEC_TEMP_00000000_I,
+               TGSI_EXEC_TEMP_00000000_C ),
+            cc_LessThan );
+
+         if( chan_index == firstchan ) {
+            sse_pmovmskb(   /* first tested channel initialises EAX */
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               make_xmm( registers[chan_index] ) );
+         }
+         else {
+            sse_pmovmskb(   /* later channels go through EDX and are OR-ed into EAX */
+               func,
+               x86_make_reg( file_REG32, reg_DX ),
+               make_xmm( registers[chan_index] ) );
+            x86_or(
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               x86_make_reg( file_REG32, reg_DX ) );
+         }
+      }
+   }
+
+   x86_or(   /* accumulate this instruction's mask into the KILMASK temp */
+      func,
+      get_temp(
+         TGSI_EXEC_TEMP_KILMASK_I,
+         TGSI_EXEC_TEMP_KILMASK_C ),
+      x86_make_reg( file_REG32, reg_AX ) );
+
+   x86_pop(   /* restore in the reverse order of the pushes */
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+}
+
+
+static void
+emit_kilp( struct x86_function *func )
+{
+   /* XXX todo / fix me: placeholder, emits no code at all. */
+   (void) func;
+}
+
+
+/* Per enabled dst channel: xmm0 = src0, xmm1 = src1, compare them with
+ * predicate 'cc', AND the resulting mask with the TEMP_ONE temp
+ * (presumably 1.0f, giving 1.0 or 0.0 -- confirm) and store xmm0.
+ */
+static void
+emit_setcc(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst,
+   enum sse_cc cc )
+{
+   unsigned ch;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, ch ) {
+      FETCH( func, *inst, 0, 0, ch );
+      FETCH( func, *inst, 1, 1, ch );
+
+      /* xmm0 = compare mask of (xmm0 cc xmm1) */
+      sse_cmpps( func, make_xmm( 0 ), make_xmm( 1 ), cc );
+
+      /* xmm0 = mask & ONE */
+      sse_andps( func, make_xmm( 0 ), get_temp( TEMP_ONE_I, TEMP_ONE_C ) );
+
+      STORE( func, *inst, 0, 0, ch );
+   }
+}
+
+/* Per enabled dst channel: dst = (src0 < 0) ? src1 : src2, built from a
+ * compare mask and AND / ANDN / OR.  xmm0..xmm2 are fixed scratch; the
+ * compare is against the TGSI_EXEC_TEMP_00000000 temp (presumably 0.0).
+ */
+static void
+emit_cmp(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst )
+{
+   unsigned ch;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, ch ) {
+      FETCH( func, *inst, 0, 0, ch );
+      FETCH( func, *inst, 1, 1, ch );
+      FETCH( func, *inst, 2, 2, ch );
+
+      /* xmm0 = (src0 < 0) ? ~0 : 0 */
+      sse_cmpps(
+         func,
+         make_xmm( 0 ),
+         get_temp( TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C ),
+         cc_LessThan );
+
+      /* xmm1 = src1 & mask */
+      sse_andps( func, make_xmm( 1 ), make_xmm( 0 ) );
+      /* xmm0 = ~mask & src2 */
+      sse_andnps( func, make_xmm( 0 ), make_xmm( 2 ) );
+      /* xmm0 = xmm0 | xmm1 */
+      sse_orps( func, make_xmm( 0 ), make_xmm( 1 ) );
+
+      STORE( func, *inst, 0, 0, ch );
+   }
+}
+#endif
+
+/* Unary ops (MOV, ABS, FLOOR, FRAC, EXPBASE2, LOGBASE2): for each enabled
+ * dst channel, fetch src0 into v0, apply the opcode in place, store v0.
+ * v0 must be the FETCH target: STORE and every op below read v0. */
+static void
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ABS:   /* turn off the most significant bit of each float word */
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
+            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
+            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_FLOOR:
+         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
+         break;
+      case TGSI_OPCODE_FRAC:
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
+            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_EXPBASE2:
+         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 (estimate instruction) */
+         break;
+      case TGSI_OPCODE_LOGBASE2:
+         /* XXX this may be broken! */
+         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) (estimate instruction) */
+         break;
+      case TGSI_OPCODE_MOV:   /* nothing to do: v0 is stored as fetched */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+}
+
+/* Two-source ops (ADD, SUB, MUL, MIN, MAX), one enabled dst channel at a time. */
+static void
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   const int src_a = ppc_allocate_vec_register(gen->f);
+   const int src_b = ppc_allocate_vec_register(gen->f);
+   const int result = ppc_allocate_vec_register(gen->f);
+   uint ch;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, ch) {
+      FETCH(gen, *inst, src_a, 0, ch);   /* src_a = srcreg[0] */
+      FETCH(gen, *inst, src_b, 1, ch);   /* src_b = srcreg[1] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         ppc_vaddfp(gen->f, result, src_a, src_b);
+         break;
+      case TGSI_OPCODE_SUB:
+         ppc_vsubfp(gen->f, result, src_a, src_b);
+         break;
+      case TGSI_OPCODE_MUL:
+         ppc_vxor(gen->f, result, result, result);           /* result = {0, 0, 0, 0} */
+         ppc_vmaddfp(gen->f, result, src_a, src_b, result);  /* result = src_a * src_b + result */
+         break;
+      case TGSI_OPCODE_MIN:
+         ppc_vminfp(gen->f, result, src_a, src_b);
+         break;
+      case TGSI_OPCODE_MAX:
+         ppc_vmaxfp(gen->f, result, src_a, src_b);
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, result, 0, ch);   /* write result to the dst channel */
+   }
+   ppc_release_vec_register(gen->f, src_a);
+   ppc_release_vec_register(gen->f, src_b);
+   ppc_release_vec_register(gen->f, result);
+}
+
+/* DP3 / DP4 / DPH: sum src0 * src1 over channels, store the sum to every enabled dst channel. */
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   const int lhs = ppc_allocate_vec_register(gen->f);
+   const int rhs = ppc_allocate_vec_register(gen->f);
+   const int acc = ppc_allocate_vec_register(gen->f);
+   const unsigned opcode = inst->Instruction.Opcode;
+   uint ch;
+
+   ppc_vxor(gen->f, acc, acc, acc);            /* acc = {0, 0, 0, 0} */
+   FETCH(gen, *inst, lhs, 0, CHAN_X);          /* lhs = src0.XXXX */
+   FETCH(gen, *inst, rhs, 1, CHAN_X);          /* rhs = src1.XXXX */
+   ppc_vmaddfp(gen->f, acc, lhs, rhs, acc);    /* acc = lhs * rhs + acc */
+
+   FETCH(gen, *inst, lhs, 0, CHAN_Y);          /* lhs = src0.YYYY */
+   FETCH(gen, *inst, rhs, 1, CHAN_Y);          /* rhs = src1.YYYY */
+   ppc_vmaddfp(gen->f, acc, lhs, rhs, acc);    /* acc = lhs * rhs + acc */
+
+   FETCH(gen, *inst, lhs, 0, CHAN_Z);          /* lhs = src0.ZZZZ */
+   FETCH(gen, *inst, rhs, 1, CHAN_Z);          /* rhs = src1.ZZZZ */
+   ppc_vmaddfp(gen->f, acc, lhs, rhs, acc);    /* acc = lhs * rhs + acc */
+
+   if (opcode == TGSI_OPCODE_DP4) {
+      FETCH(gen, *inst, lhs, 0, CHAN_W);       /* lhs = src0.WWWW */
+      FETCH(gen, *inst, rhs, 1, CHAN_W);       /* rhs = src1.WWWW */
+      ppc_vmaddfp(gen->f, acc, lhs, rhs, acc); /* acc = lhs * rhs + acc */
+   }
+   else if (opcode == TGSI_OPCODE_DPH) {
+      FETCH(gen, *inst, rhs, 1, CHAN_W);       /* rhs = src1.WWWW */
+      ppc_vaddfp(gen->f, acc, acc, rhs);       /* acc = acc + rhs */
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, ch) {
+      STORE(gen, *inst, acc, 0, ch);           /* same sum to each channel */
+   }
+   ppc_release_vec_register(gen->f, lhs);
+   ppc_release_vec_register(gen->f, rhs);
+   ppc_release_vec_register(gen->f, acc);
+}
+
+/* Three-source ops (MAD, LRP), one enabled dst channel at a time. */
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   const int s0 = ppc_allocate_vec_register(gen->f);
+   const int s1 = ppc_allocate_vec_register(gen->f);
+   const int s2 = ppc_allocate_vec_register(gen->f);
+   const int out = ppc_allocate_vec_register(gen->f);
+   uint ch;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, ch) {
+      FETCH(gen, *inst, s0, 0, ch);   /* s0 = srcreg[0] */
+      FETCH(gen, *inst, s1, 1, ch);   /* s1 = srcreg[1] */
+      FETCH(gen, *inst, s2, 2, ch);   /* s2 = srcreg[2] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:
+         ppc_vmaddfp(gen->f, out, s0, s1, s2);    /* out = s0 * s1 + s2 */
+         break;
+      case TGSI_OPCODE_LRP:
+         ppc_vsubfp(gen->f, out, s1, s2);         /* out = s1 - s2 */
+         ppc_vmaddfp(gen->f, out, s0, out, s2);   /* out = s0 * out + s2 */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, out, 0, ch);   /* write out to the dst channel */
+   }
+   ppc_release_vec_register(gen->f, s0);
+   ppc_release_vec_register(gen->f, s1);
+   ppc_release_vec_register(gen->f, s2);
+   ppc_release_vec_register(gen->f, out);
+}
+
+
+static int
+emit_instruction(struct gen_context *gen,
+                 struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_FLOOR:
+   case TGSI_OPCODE_FRAC:
+   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_LOGBASE2:
+      emit_unaryop(gen, inst);
+      break;
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      emit_binop(gen, inst);
+      break;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_LRP:
+      emit_triop(gen, inst);
+      break;
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+      emit_dotprod(gen, inst);
+      break;
+   case TGSI_OPCODE_END:
+      /* normal end */
+      return 1;
+   default:
+      return 0;
+   }
+
+#if 0
+   unsigned chan_index;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_f2it( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         emit_tempf(
+            func,
+            0,
+            TEMP_ONE_I,
+            TEMP_ONE_C);
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+            STORE( func, *inst, 0, 0, CHAN_X );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+            STORE( func, *inst, 0, 0, CHAN_W );
+         }
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            sse_maxps(
+               func,
+               make_xmm( 0 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            STORE( func, *inst, 0, 0, CHAN_Y );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+            /* XMM[1] = SrcReg[0].yyyy */
+            FETCH( func, *inst, 1, 0, CHAN_Y );
+            /* XMM[1] = max(XMM[1], 0) */
+            sse_maxps(
+               func,
+               make_xmm( 1 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            /* XMM[2] = SrcReg[0].wwww */
+            FETCH( func, *inst, 2, 0, CHAN_W );
+            /* XMM[2] = min(XMM[2], 128.0) */
+            sse_minps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_128_I,
+                  TGSI_EXEC_TEMP_128_C ) );
+            /* XMM[2] = max(XMM[2], -128.0) */
+            sse_maxps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_MINUS_128_I,
+                  TGSI_EXEC_TEMP_MINUS_128_C ) );
+            emit_pow( func, 3, 1, 2 );
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            sse_xorps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 2 ) );
+            sse_cmpps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 0 ),
+               cc_LessThanEqual );
+            sse_andps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 1 ) );
+            STORE( func, *inst, 2, 0, CHAN_Z );
+         }
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rcp( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rsqrt( func, 1, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 1, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            emit_MOV( func, 1, 0 );
+            emit_flr( func, 2, 1 );
+            /* dst.x = ex2(floor(src.x)) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
+               emit_MOV( func, 2, 1 );
+               emit_ex2( func, 3, 2 );
+               STORE( func, *inst, 2, 0, CHAN_X );
+            }
+            /* dst.y = src.x - floor(src.x) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+               emit_MOV( func, 2, 0 );
+               emit_sub( func, 2, 1 );
+               STORE( func, *inst, 2, 0, CHAN_Y );
+            }
+         }
+         /* dst.z = ex2(src.x) */
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            emit_ex2( func, 3, 0 );
+            STORE( func, *inst, 0, 0, CHAN_Z );
+         }
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_LOG:
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_abs( func, 0 );
+         emit_MOV( func, 1, 0 );
+         emit_lg2( func, 2, 1 );
+         /* dst.z = lg2(abs(src.x)) */
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            STORE( func, *inst, 1, 0, CHAN_Z );
+         }
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            emit_flr( func, 2, 1 );
+            /* dst.x = floor(lg2(abs(src.x))) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
+               STORE( func, *inst, 1, 0, CHAN_X );
+            }
+            /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+               emit_ex2( func, 2, 1 );
+               emit_rcp( func, 1, 1 );
+               emit_mul( func, 0, 1 );
+               STORE( func, *inst, 0, 0, CHAN_Y );
+            }
+         }
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_add( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP4:
+   /* TGSI_OPCODE_DOT4 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul(func, 1, 2 );
+      emit_add(func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_W );
+      FETCH( func, *inst, 2, 1, CHAN_W );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_tempf(
+            func,
+            0,
+            TEMP_ONE_I,
+            TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 1, 1, CHAN_Y );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         FETCH( func, *inst, 0, 0, CHAN_Z );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+         FETCH( func, *inst, 0, 1, CHAN_W );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         sse_minps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         sse_maxps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      emit_setcc( func, inst, cc_LessThan );
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      emit_setcc( func, inst, cc_NotLessThan );
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_sub( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_sub( func, 1, 2 );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CND0:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+   /* TGSI_OPCODE_DP2A */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_frc( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_flr( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+   /* TGSI_OPCODE_EX2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_ex2( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_lg2( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+   /* TGSI_OPCODE_POW */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_pow( func, 0, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+   /* TGSI_OPCODE_XPD */
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( func, *inst, 1, 1, CHAN_Z );
+         FETCH( func, *inst, 3, 0, CHAN_Z );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 4, 1, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_MOV( func, 2, 0 );
+         emit_mul( func, 2, 1 );
+         emit_MOV( func, 5, 3 );
+         emit_mul( func, 5, 4 );
+         emit_sub( func, 2, 5 );
+         STORE( func, *inst, 2, 0, CHAN_X );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 2, 1, CHAN_X );
+         FETCH( func, *inst, 5, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         emit_mul( func, 3, 2 );
+         emit_mul( func, 1, 5 );
+         emit_sub( func, 3, 1 );
+         STORE( func, *inst, 3, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         emit_mul( func, 5, 4 );
+         emit_mul( func, 0, 2 );
+         emit_sub( func, 5, 0 );
+         STORE( func, *inst, 5, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TEMP_ONE_I,
+	    TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ABS:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_abs( func, 0) ;
+
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RCC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 1, CHAN_W );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_cos( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DDY:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_KILP:
+      /* predicated kill */
+      emit_kilp( func );
+      return 0; /* XXX fix me */
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* conditional kill */
+      emit_kil( func, &inst->FullSrcRegisters[0] );
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SGT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_sin( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SNE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_STR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TEX:
+      if (0) {
+	 /* Disable dummy texture code: 
+	  */
+	 emit_tempf(
+	    func,
+	    0,
+	    TEMP_ONE_I,
+	    TEMP_ONE_C );
+	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+	    STORE( func, *inst, 0, 0, chan_index );
+	 }
+      }
+      else {
+	 return 0;
+      }
+      break;
+
+   case TGSI_OPCODE_TXD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_X2D:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CAL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RET:
+      emit_ret( func );
+      break;
+
+   case TGSI_OPCODE_END:
+      break;
+
+   case TGSI_OPCODE_SSG:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CMP:
+      emit_cmp (func, inst);
+      break;
+
+   case TGSI_OPCODE_SCS:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_cos( func, 0, 0 );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_sin( func, 0, 0 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TGSI_EXEC_TEMP_00000000_I,
+	    TGSI_EXEC_TEMP_00000000_C );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TEMP_ONE_I,
+	    TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_TXB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NRM:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DIV:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DP2:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRK:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_IF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_REP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PUSHA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_POPA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_I2F:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NOT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_AND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_OR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_MOD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_XOR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SAD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CONT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      return 0;
+      break;
+
+   default:
+      return 0;
+   }
+#endif
+   
+   return 1;
+}
+
+static void
+emit_declaration(
+   struct ppc_function *func,
+   struct tgsi_full_declaration *decl )
+{
+   if( decl->Declaration.File == TGSI_FILE_INPUT ) {  /* only input declarations are considered */
+#if 0 /* disabled input-interpolation setup; while compiled out, this function emits nothing */
+      unsigned first, last, mask;
+      unsigned i, j;
+
+      first = decl->DeclarationRange.First;
+      last = decl->DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      for( i = first; i <= last; i++ ) {  /* every register in the declared range, inclusive */
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( mask & (1 << j) ) {  /* channel j is set in the usage mask */
+               switch( decl->Declaration.Interpolate ) {
+               case TGSI_INTERPOLATE_CONSTANT:  /* presumably input = a0 -- helpers are not visible in this view */
+                  emit_coef_a0( func, 0, i, j );
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_LINEAR:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_coef_a0( func, 4, i, j );
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_PERSPECTIVE:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+                  emit_coef_a0( func, 5, i, j );
+                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               default:
+                  assert( 0 );
+		  break;
+               }
+            }
+         }
+      }
+#endif
+   }
+}
+
+#if 0 /* compiled out: x86/SSE helper that rearranges AoS input data into SoA form */
+static void aos_to_soa( struct x86_function *func, 
+                        uint arg_aos,
+                        uint arg_soa, 
+                        uint arg_num, 
+                        uint arg_stride )
+{
+   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
+   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
+   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
+   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
+   int inner_loop;
+
+
+   /* Save EBX */
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
+   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
+   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
+   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
+
+   /* do: read 16 bytes from each of 4 records spaced `stride` bytes apart */
+   inner_loop = x86_get_label( func );
+   {
+      x86_push( func, aos_input );  /* saved so the stride walk below is undone by the pop */
+      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_pop( func, aos_input );
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );  /* copies: each shufps below overwrites its first operand */
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );  /* floats 0 of the 4 records (assuming SHUFPS semantics) */
+      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );  /* floats 1 */
+      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );  /* floats 2 */
+      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );  /* floats 3 */
+
+      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );  /* one 16-byte row per float position */
+      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
+      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
+      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
+
+      /* Advance to next input */
+      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
+      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
+   }
+   /* while --num_inputs */
+   x86_dec( func, num_inputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, aos_input );
+}
+#endif
+
+#if 0 /* compiled out: x86/SSE helper that rearranges SoA output data back into AoS form */
+static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+   struct x86_reg soa_output;
+   struct x86_reg aos_output;
+   struct x86_reg num_outputs;
+   struct x86_reg temp;
+   int inner_loop;
+
+   soa_output = x86_make_reg( file_REG32, reg_AX );
+   aos_output = x86_make_reg( file_REG32, reg_BX );
+   num_outputs = x86_make_reg( file_REG32, reg_CX );
+   temp = x86_make_reg( file_REG32, reg_DX );
+
+   /* Save EBX */
+   x86_push( func, aos_output );
+
+   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
+   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
+   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+
+   /* do: one 64-byte SoA block -> 16 bytes in each of 4 records spaced by the stride argument */
+   inner_loop = x86_get_label( func );
+   {
+      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
+      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
+      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
+      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );  /* copies: each unpack below overwrites its first operand */
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );  /* interleave rows 0/1, low half (assuming UNPCKLPS semantics) */
+      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );  /* rows 0/1, high half */
+      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );  /* rows 2/3, low half */
+      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );  /* rows 2/3, high half */
+
+      x86_mov( func, temp, x86_fn_arg( func, stride ) );  /* stride is re-read from the arguments on every iteration */
+      x86_push( func, aos_output );  /* saved so the stride walk below is undone by the pop */
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_pop( func, aos_output );
+
+      /* Advance to next output */
+      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
+      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
+   }
+   /* while --num_outputs */
+   x86_dec( func, num_outputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, aos_output );
+}
+#endif
+
+
+static void
+emit_prologue(struct ppc_function *func)
+{
+   /* XXX set up stack frame -- not implemented yet, so no code is emitted here */
+}
+
+
+static void
+emit_epilogue(struct ppc_function *func)
+{
+   ppc_return(func);  /* presumably emits the return to the caller -- helper not visible here */
+   /* XXX restore prev stack frame (emit_prologue does not set one up yet) */
+}
+
+
+
+/**
+ * Translate a TGSI vertex/fragment shader to PPC code.
+ * \param tokens  the TGSI input shader
+ * \param func  the output PPC code/function
+ * \param immediates  buffer to place immediates, later passed to PPC func
+ * \param do_swizzles  only referenced by #if 0 code below, so currently unused
+ * \return TRUE for success, FALSE if translation failed
+ */
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *func,
+              float (*immediates)[4],
+              boolean do_swizzles )
+{
+   struct tgsi_parse_context parse;
+   /*boolean instruction_phase = FALSE;*/
+   unsigned ok = 1;
+   uint num_immediates = 0;
+   struct gen_context gen;
+
+   util_init_math();
+
+   tgsi_parse_init( &parse, tokens );
+
+   gen.f = func;
+   gen.inputs_reg = 3;   /* first function param */
+   gen.outputs_reg = 4;  /* second function param */
+   gen.temps_reg = 5;    /* ... */
+   gen.immed_reg = 6;
+   gen.const_reg = 7;
+
+   emit_prologue(func);
+
+   /*
+    * Different function args for vertex/fragment shaders (x86 code, disabled):
+    */
+#if 0
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /* DECLARATION phase, do not load output argument. */
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      /* skipping outputs argument here */
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_coef_base(),
+         x86_fn_arg( func, 5 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 6 ) );
+   }
+   else {
+      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
+
+      if (do_swizzles)
+         aos_to_soa( func, 
+                     6,         /* aos_input */
+                     1,         /* machine->input */
+                     7,         /* num_inputs */
+                     8 );       /* input_stride */
+
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      x86_mov(
+         func,
+         get_output_base(),
+         x86_fn_arg( func, 2 ) );
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 5 ) );
+   }
+#endif
+
+   while (!tgsi_parse_end_of_tokens(&parse) && ok) {  /* stops at the first token that fails to translate */
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            emit_declaration(func, &parse.FullToken.FullDeclaration );
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+#if 0
+            if( !instruction_phase ) {
+               /* INSTRUCTION phase, overwrite coeff with output. */
+               instruction_phase = TRUE;
+               x86_mov(
+                  func,
+                  get_output_base(),
+                  x86_fn_arg( func, 2 ) );
+            }
+#endif
+         }
+
+         ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
+
+	 if (!ok) {
+	    debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
+	 }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* splat each immediate component into a float[4] vector for SoA */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            float *imm = (float *) immediates;
+            uint i;
+            assert(size <= 4);
+            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES); /* NOTE(review): checked once, but the loop below adds up to `size` entries -- confirm buffer capacity */
+            for (i = 0; i < size; i++) {
+               const float value =
+                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+               imm[num_immediates * 4 + 0] = 
+               imm[num_immediates * 4 + 1] = 
+               imm[num_immediates * 4 + 2] = 
+               imm[num_immediates * 4 + 3] = value;
+               num_immediates++;  /* one float[4] slot is consumed per component, not per immediate */
+            }
+         }
+         break;
+
+      default:
+	 ok = 0;
+         assert( 0 );
+      }
+   }
+
+#if 0
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
+      if (do_swizzles)
+         soa_to_aos( func, 9, 2, 10, 11 );
+   }
+#endif
+
+   emit_epilogue(func);  /* emitted even when translation failed (ok == 0) */
+
+   tgsi_parse_free( &parse );
+
+   return ok;
+}
+
+#endif /* PIPE_ARCH_PPC */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
new file mode 100644
index 0000000000..7cd2bf9aff
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
@@ -0,0 +1,48 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_PPC_H
+#define TGSI_PPC_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+struct tgsi_token;
+struct ppc_function;
+
+boolean  /* TRUE on success, FALSE if translation failed */
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *function,
+              float (*immediates)[4],
+              boolean do_swizzles);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_PPC_H */
-- 
cgit v1.2.3


From b7da4c3dc199ee382bb9924ac86a3485deccc62d Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:08:45 -0600
Subject: gallium: PPC vertex shader support

Works, but dead code lingering, debug code present, etc.
---
 src/gallium/auxiliary/draw/Makefile      |   1 +
 src/gallium/auxiliary/draw/draw_vs.c     |   5 +-
 src/gallium/auxiliary/draw/draw_vs.h     |   4 +
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 270 +++++++++++++++++++++++++++++++
 4 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 src/gallium/auxiliary/draw/draw_vs_ppc.c

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
index f2e36a89e9..bdbf5a08ed 100644
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -40,6 +40,7 @@ C_SOURCES = \
 	draw_vs_aos_machine.c \
 	draw_vs_exec.c \
 	draw_vs_llvm.c \
+	draw_vs_ppc.c  \
 	draw_vs_sse.c 
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 34adbd49b0..7f305304ff 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -85,7 +85,10 @@ draw_create_vertex_shader(struct draw_context *draw,
    if (!vs) {
       vs = draw_create_vs_sse( draw, shader );
       if (!vs) {
-         vs = draw_create_vs_exec( draw, shader );
+         vs = draw_create_vs_ppc( draw, shader );
+         if (!vs) {
+            vs = draw_create_vs_exec( draw, shader );
+         }
       }
    }
 
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 68c24abad3..89ae158751 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -157,6 +157,10 @@ struct draw_vertex_shader *
 draw_create_vs_sse(struct draw_context *draw,
 		   const struct pipe_shader_state *templ);
 
+struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+		   const struct pipe_shader_state *templ);
+
 struct draw_vertex_shader *
 draw_create_vs_llvm(struct draw_context *draw,
 		    const struct pipe_shader_state *templ);
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
new file mode 100644
index 0000000000..a096ad49b8
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -0,0 +1,270 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_config.h"
+
+#include "draw_vs.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+#include "rtasm/rtasm_cpu.h"
+#include "rtasm/rtasm_ppc.h"
+#include "tgsi/tgsi_ppc.h"
+#include "tgsi/tgsi_parse.h"
+
+
+
+typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
+                                             float (*outputs)[4][4],
+                                             float (*temps)[4][4],
+                                             float (*immeds)[4][4],
+                                             float (*consts)[4]);
+
+#if 0
+   const struct tgsi_exec_vector *input,
+   struct tgsi_exec_vector *output,
+   float (*constant)[4],        /* 3 */
+   struct tgsi_exec_vector *temporary, /* 4 */
+   float (*immediates)[4],      /* 5 */
+   const float (*aos_input)[4], /* 6 */
+   uint num_inputs,             /* 7 */
+   uint input_stride,           /* 8 */
+   float (*aos_output)[4],      /* 9 */
+   uint num_outputs,            /* 10 */
+   uint output_stride );        /* 11 */
+#endif
+
+struct draw_ppc_vertex_shader {
+   struct draw_vertex_shader base;
+   struct ppc_function ppc_program;
+
+   codegen_function func;
+   
+   struct tgsi_exec_machine *machine;
+};
+
+
+static void
+vs_ppc_prepare( struct draw_vertex_shader *base,
+		struct draw_context *draw )
+{
+}
+
+
+
+/* Simplified vertex shader interface for the pt paths.  Given the
+ * complexity of code-generating all the above operations together,
+ * it's time to try doing all the other stuff separately.
+ */
+static void
+vs_ppc_run_linear( struct draw_vertex_shader *base,
+		   const float (*input)[4],
+		   float (*output)[4],
+		   const float (*constants)[4],
+		   unsigned count,
+		   unsigned input_stride,
+		   unsigned output_stride )
+{
+   struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
+   struct tgsi_exec_machine *machine = shader->machine;
+   unsigned int i;
+
+#define MAX_VERTICES 4
+
+   /* loop over verts */
+   for (i = 0; i < count; i += MAX_VERTICES) {
+      const uint max_vertices = MIN2(MAX_VERTICES, count - i);
+      float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB;
+      float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB;
+      float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB;
+      uint attr;
+
+      /* convert (up to) four input verts to SoA format */
+      for (attr = 0; attr < base->info.num_inputs; attr++) {
+         const float *vIn = (const float *) input;
+         uint vert;
+         for (vert = 0; vert < max_vertices; vert++) {
+#if 0
+            if (attr==0)
+               printf("Input v%d a%d: %f %f %f %f\n",
+                      vert, attr, vIn[0], vIn[1], vIn[2], vIn[3]);
+#endif
+            inputs_soa[attr][0][vert] = vIn[attr * 4 + 0];
+            inputs_soa[attr][1][vert] = vIn[attr * 4 + 1];
+            inputs_soa[attr][2][vert] = vIn[attr * 4 + 2];
+            inputs_soa[attr][3][vert] = vIn[attr * 4 + 3];
+            vIn += input_stride / 4;
+         }
+      }
+
+      /* run compiled shader
+       */
+#if 0
+      shader->func(machine->Inputs,
+		   machine->Outputs,
+		   (float (*)[4])constants,
+		   machine->Temps,
+		   (float (*)[4])shader->base.immediates,
+                   input,
+                   base->info.num_inputs,
+                   input_stride,
+                   output,
+                   base->info.num_outputs,
+                   output_stride );
+#else
+      shader->func(inputs_soa, outputs_soa, temps_soa,
+		   (float (*)[4][4]) shader->base.immediates,
+		   (float (*)[4]) constants);
+
+      /*output[0][0] = input[0][0] * 0.5;*/
+#endif
+
+      /* convert (up to) four output verts from SoA back to AoS format */
+      for (attr = 0; attr < base->info.num_outputs; attr++) {
+         float *vOut = (float *) output;
+         uint vert;
+         for (vert = 0; vert < max_vertices; vert++) {
+            vOut[attr * 4 + 0] = outputs_soa[attr][0][vert];
+            vOut[attr * 4 + 1] = outputs_soa[attr][1][vert];
+            vOut[attr * 4 + 2] = outputs_soa[attr][2][vert];
+            vOut[attr * 4 + 3] = outputs_soa[attr][3][vert];
+#if 0
+            if (attr==0)
+               printf("Output v%d a%d: %f %f %f %f\n",
+                      vert, attr, vOut[0], vOut[1], vOut[2], vOut[3]);
+#endif
+            vOut += output_stride / 4;
+         }
+      }
+
+      /* advance to next group of four input/output verts */
+      input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
+      output = (float (*)[4])((char *)output + output_stride * max_vertices);
+   }
+}
+
+
+
+
+static void
+vs_ppc_delete( struct draw_vertex_shader *base )
+{
+   struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
+   
+   ppc_release_func( &shader->ppc_program );
+
+   align_free( (void *) shader->base.immediates );
+
+   FREE( (void*) shader->base.state.tokens );
+   FREE( shader );
+}
+
+
+struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+                          const struct pipe_shader_state *templ)
+{
+   struct draw_ppc_vertex_shader *vs;
+
+   vs = CALLOC_STRUCT( draw_ppc_vertex_shader );
+   if (vs == NULL) 
+      return NULL;
+
+   /* we make a private copy of the tokens */
+   vs->base.state.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!vs->base.state.tokens)
+      goto fail;
+
+   tgsi_scan_shader(templ->tokens, &vs->base.info);
+
+   vs->base.draw = draw;
+#if 0
+   if (1)
+      vs->base.create_varient = draw_vs_varient_aos_ppc;
+   else
+#endif
+      vs->base.create_varient = draw_vs_varient_generic;
+   vs->base.prepare = vs_ppc_prepare;
+   vs->base.run_linear = vs_ppc_run_linear;
+   vs->base.delete = vs_ppc_delete;
+   
+   vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * 4 *
+                                      sizeof(float), 16);
+
+   vs->machine = &draw->vs.machine;
+   
+   ppc_init_func( &vs->ppc_program, 1000 ); /* XXX fix limit */
+
+   if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
+			&vs->ppc_program, 
+                        (float (*)[4])vs->base.immediates, 
+                        TRUE )) 
+      goto fail;
+      
+   vs->func = (codegen_function) ppc_get_func( &vs->ppc_program );
+   if (!vs->func) {
+      goto fail;
+   }
+   
+   return &vs->base;
+
+fail:
+   debug_error("tgsi_emit_ppc() failed, falling back to interpreter\n");
+
+   ppc_release_func( &vs->ppc_program );
+   
+   FREE(vs);
+   return NULL;
+}
+
+
+
+#else /* PIPE_ARCH_PPC */
+
+
+struct draw_vertex_shader *
+draw_create_vs_ppc( struct draw_context *draw,
+		    const struct pipe_shader_state *templ )
+{
+   return (void *) 0;
+}
+
+
+#endif /* PIPE_ARCH_PPC */
-- 
cgit v1.2.3


From ba4faef7c07c47ad4f71f3e6ba94cb54217c56ed Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:13:31 -0600
Subject: gallium: temporarily disable PPC vertex shader until more things run

---
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index a096ad49b8..990a659f27 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -203,6 +203,9 @@ draw_create_vs_ppc(struct draw_context *draw,
 {
    struct draw_ppc_vertex_shader *vs;
 
+   /* XXX temporary short-circuit */
+   return NULL;
+
    vs = CALLOC_STRUCT( draw_ppc_vertex_shader );
    if (vs == NULL) 
       return NULL;
-- 
cgit v1.2.3


From ebdc399d83d6bd2f4e3594874483dbca5f9f5c0e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 13:57:56 -0600
Subject: gallium: fix-up confusing register allocation masks in rtasm_ppc.c

Plus, add ppc_reserve_register() func.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 56 ++++++++++++++++++++-------------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  1 +
 2 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index aaec2d2191..2d9f4e079e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -49,13 +49,15 @@ ppc_init_func(struct ppc_function *p, unsigned max_inst)
    p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
    p->num_inst = 0;
    p->max_inst = max_inst;
-   p->fp_used = ~0x0;
-   p->vec_used = ~0x0;
-
-   /* only allow using gp registers 7..12 for now */
    p->reg_used = 0x0;
-   for (i = 7; i < 13; i++)
-      p->reg_used |= (1 << i);
+   p->fp_used = 0x0;
+   p->vec_used = 0x0;
+
+   /* only allow using gp registers 3..11 for now */
+   for (i = 0; i < 3; i++)
+      ppc_reserve_register(p, i);
+   for (i = 12; i < PPC_NUM_REGS; i++)
+      ppc_reserve_register(p, i);
 }
 
 
@@ -95,6 +97,18 @@ ppc_dump_func(const struct ppc_function *p)
 }
 
 
+/**
+ * Mark a register as being unavailable.
+ */
+int
+ppc_reserve_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_REGS);
+   p->reg_used |= (1 << reg);
+   return reg;
+}
+
+
 /**
  * Allocate a general purpose register.
  * \return register index or -1 if none left.
@@ -105,8 +119,8 @@ ppc_allocate_register(struct ppc_function *p)
    unsigned i;
    for (i = 0; i < PPC_NUM_REGS; i++) {
       const uint64_t mask = 1 << i;
-      if ((p->reg_used & mask) != 0) {
-         p->reg_used &= ~mask;
+      if ((p->reg_used & mask) == 0) {
+         p->reg_used |= mask;
          return i;
       }
    }
@@ -121,8 +135,8 @@ void
 ppc_release_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_REGS);
-   assert((p->reg_used & (1 << reg)) == 0);
-   p->reg_used |= (1 << reg);
+   assert(p->reg_used & (1 << reg));
+   p->reg_used &= ~(1 << reg);
 }
 
 
@@ -136,8 +150,8 @@ ppc_allocate_fp_register(struct ppc_function *p)
    unsigned i;
    for (i = 0; i < PPC_NUM_FP_REGS; i++) {
       const uint64_t mask = 1 << i;
-      if ((p->fp_used & mask) != 0) {
-         p->fp_used &= ~mask;
+      if ((p->fp_used & mask) == 0) {
+         p->fp_used |= mask;
          return i;
       }
    }
@@ -152,8 +166,8 @@ void
 ppc_release_fp_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_FP_REGS);
-   assert((p->fp_used & (1 << reg)) == 0);
-   p->fp_used |= (1 << reg);
+   assert(p->fp_used & (1 << reg));
+   p->fp_used &= ~(1 << reg);
 }
 
 
@@ -167,8 +181,8 @@ ppc_allocate_vec_register(struct ppc_function *p)
    unsigned i;
    for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
       const uint64_t mask = 1 << i;
-      if ((p->vec_used & mask) != 0) {
-         p->vec_used &= ~mask;
+      if ((p->vec_used & mask) == 0) {
+         p->vec_used |= mask;
          return i;
       }
    }
@@ -183,8 +197,8 @@ void
 ppc_release_vec_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_VEC_REGS);
-   assert((p->vec_used & (1 << reg)) == 0);
-   p->vec_used |= (1 << reg);
+   assert(p->vec_used & (1 << reg));
+   p->vec_used &= ~(1 << reg);
 }
 
 
@@ -582,11 +596,11 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
    emit_x(p, 31, vR, vA, vB, 103);
 }
 
-/** load vector element word: vR = mem_word[vA+vB] */
+/** load vector element word: vR = mem_word[ra+rb] */
 void
-ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB)
+ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
 {
-   emit_x(p, 31, vR, vA, vB, 71);
+   emit_x(p, 31, vr, ra, rb, 71);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 53d5746dc8..85679b4886 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -67,6 +67,7 @@ extern void ppc_release_func(struct ppc_function *p);
 extern void (*ppc_get_func( struct ppc_function *p ))( void );
 extern void ppc_dump_func(const struct ppc_function *p);
 
+extern int ppc_reserve_register(struct ppc_function *p, int reg);
 extern int ppc_allocate_register(struct ppc_function *p);
 extern void ppc_release_register(struct ppc_function *p, int reg);
 extern int ppc_allocate_fp_register(struct ppc_function *p);
-- 
cgit v1.2.3


From da63edd720fc154820fcbf699e1056ac9357a03f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 13:59:11 -0600
Subject: gallium: fix broken TGSI_FILE_CONSTANT case, use
 ppc_reserve_register()

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 112e736523..dbf215c0d5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1108,10 +1108,15 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
             ppc_li(gen->f, offset_reg, offset);
-            /* load vector word */
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our constants start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
             ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
-            /* splat word[0] across vector */
-            ppc_vspltw(gen->f, vec_reg, vec_reg, 0);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -2635,11 +2640,11 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    tgsi_parse_init( &parse, tokens );
 
    gen.f = func;
-   gen.inputs_reg = 3;   /* first function param */
-   gen.outputs_reg = 4;  /* second function param */
-   gen.temps_reg = 5;    /* ... */
-   gen.immed_reg = 6;
-   gen.const_reg = 7;
+   gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
+   gen.outputs_reg = ppc_reserve_register(func, 4);  /* second function param */
+   gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
+   gen.immed_reg = ppc_reserve_register(func, 6);
+   gen.const_reg = ppc_reserve_register(func, 7);
 
    emit_prologue(func);
 
-- 
cgit v1.2.3


From b06d0720194dfecaf45dc97cbd178411aed5205f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 14:48:33 -0600
Subject: gallium: added ppc_vload_float(), for limited cases

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 18 ++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  4 ++++
 2 files changed, 22 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 2d9f4e079e..65df676eae 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -603,6 +603,24 @@ ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
    emit_x(p, 31, vr, ra, rb, 71);
 }
 
+/** vector load float: vr = splats(imm); only imm == 0.0 or 1.0 is handled */
+void
+ppc_vload_float(struct ppc_function *p, uint vr, float imm)
+{
+   if (imm == 0.0f) {
+      ppc_vxor(p, vr, vr, vr);
+   }
+   else if (imm == 1.0f) {
+      /* use 2^0=1 to get 1.0 */
+      ppc_vxor(p, vr, vr, vr);  /* vr = {0,0,0,0} */
+      ppc_vexptefp(p, vr, vr);  /* vr = 2^0 = {1,1,1,1} */
+   }
+   else {
+      assert(0);
+   }
+}
+
+
 
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 85679b4886..9f1e3fcd84 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -158,6 +158,10 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
+/** vector load float: vr = splats(imm) */
+extern void
+ppc_vload_float(struct ppc_function *p, uint vr, float imm);
+
 
 
 /**
-- 
cgit v1.2.3


From 51840065607337210fbba5ba1c01874293fbb42e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 14:48:58 -0600
Subject: gallium: TGSI->PPC inequality operators

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 70 +++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index dbf215c0d5..9bf364b8c4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1495,6 +1495,68 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
+/**
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
+ */
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v_one = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   boolean complement = FALSE;
+
+   /* v_one = splat(1.0) */
+   ppc_vload_float(gen->f, v_one, 1.0f);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SNE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SEQ:
+         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SGE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SLT:
+         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SLE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SGT:
+         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+         break;
+      default:
+         assert(0);
+      }
+
+      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
+
+      if (complement)
+         ppc_vandc(gen->f, v2, v_one, v2);    /* v2 = v_one & ~v2 */
+      else
+         ppc_vand(gen->f, v2, v_one, v2);     /* v2 = v_one & v2 */
+
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v_one);
+}
+
+
 static void
 emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
@@ -1588,6 +1650,14 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_MAX:
       emit_binop(gen, inst);
       break;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SGE:
+      emit_inequality(gen, inst);
+      break;
    case TGSI_OPCODE_MAD:
    case TGSI_OPCODE_LRP:
       emit_triop(gen, inst);
-- 
cgit v1.2.3


From c6ff870836e7c970f1030e9e0fbdd0cb5df40d29 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:21:22 -0600
Subject: cell: TGSI->PPC for RSQ, RCP and src register sign modes

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 162 ++++++++++++++++++++++++----------
 1 file changed, 116 insertions(+), 46 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 9bf364b8c4..3637772102 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -84,11 +84,14 @@
 struct gen_context
 {
    struct ppc_function *f;
-   int inputs_reg;    /**< register pointing to input params */
-   int outputs_reg;   /**< register pointing to output params */
-   int temps_reg;     /**< register pointing to temporary "registers" */
-   int immed_reg;     /**< register pointing to immediates buffer */
-   int const_reg;     /**< register pointing to constants buffer */
+   int inputs_reg;    /**< GP register pointing to input params */
+   int outputs_reg;   /**< GP register pointing to output params */
+   int temps_reg;     /**< GP register pointing to temporary "registers" */
+   int immed_reg;     /**< GP register pointing to immediates buffer */
+   int const_reg;     /**< GP register pointing to constants buffer */
+
+   int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
+   int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
 };
 
 
@@ -1059,6 +1062,35 @@ emit_sub(
 #endif
 
 
+/**
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ */
+static int
+gen_one_vec(struct gen_context *gen)
+{
+   if (gen->one_vec < 0) {
+      gen->one_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
+   }
+   return gen->one_vec;
+}
+
+/**
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
+ */
+static int
+gen_get_bit31_vec(struct gen_context *gen)
+{
+   if (gen->bit31_vec < 0) {
+      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
+      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
+   }
+   return gen->bit31_vec;
+}
+
+
+
 /**
  * Register fetch.
  */
@@ -1124,49 +1156,42 @@ emit_fetch(struct gen_context *gen,
          assert( 0 );
       }
       break;
-
    case TGSI_EXTSWIZZLE_ZERO:
-#if 0
-      emit_tempf(
-         func,
-         xmm,
-         TGSI_EXEC_TEMP_00000000_I,
-         TGSI_EXEC_TEMP_00000000_C );
-#endif
+      ppc_vload_float(gen->f, vec_reg, 0.0f);
       break;
-
    case TGSI_EXTSWIZZLE_ONE:
-#if 0
-      emit_tempf(
-         func,
-         xmm,
-         TEMP_ONE_I,
-         TEMP_ONE_C );
-#endif
+      {
+         int one_vec = gen_one_vec(gen);
+         ppc_vecmove(gen->f, vec_reg, one_vec);
+      }
       break;
-
    default:
       assert( 0 );
    }
 
-#if 0
-   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      emit_abs( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      emit_setsign( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      emit_neg( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
+   {
+      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+         int bit31_vec = gen_get_bit31_vec(gen);
+
+         switch (sign_op) {
+         case TGSI_UTIL_SIGN_CLEAR:
+            /* vec = vec & ~bit31 */
+            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_SET:
+            /* vec = vec | bit31 */
+            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_TOGGLE:
+            /* vec = vec ^ bit31 */
+            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         default:
+            assert(0);
+         }
+      }
    }
-#endif
 }
 
 #define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
@@ -1409,6 +1434,36 @@ emit_cmp(
 #endif
 
 
+static void
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+
+   FETCH(gen, *inst, v0, 0, CHAN_X);
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_RSQ:
+      /* v1 = 1.0 / sqrt(v0) */
+      ppc_vrsqrtefp(gen->f, v1, v0);
+      break;
+   case TGSI_OPCODE_RCP:
+      /* v1 = 1.0 / v0 */
+      ppc_vrefp(gen->f, v1, v0);
+      break;
+   default:
+      assert(0);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE(gen, *inst, v1, 0, chan_index);
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+}
+
+
 static void
 emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
@@ -1504,12 +1559,9 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
    int v0 = ppc_allocate_vec_register(gen->f);
    int v1 = ppc_allocate_vec_register(gen->f);
    int v2 = ppc_allocate_vec_register(gen->f);
-   int v_one = ppc_allocate_vec_register(gen->f);
    uint chan_index;
    boolean complement = FALSE;
-
-   /* v_one = splat(1.0) */
-   ppc_vload_float(gen->f, v_one, 1.0f);
+   int one_vec = gen_one_vec(gen);
 
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
       FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
@@ -1543,9 +1595,9 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
       /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
 
       if (complement)
-         ppc_vandc(gen->f, v2, v_one, v2);    /* v2 = v_one & ~v2 */
+         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
       else
-         ppc_vand(gen->f, v2, v_one, v2);     /* v2 = v_one & v2 */
+         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
 
       STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
    }
@@ -1553,7 +1605,6 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
    ppc_release_vec_register(gen->f, v0);
    ppc_release_vec_register(gen->f, v1);
    ppc_release_vec_register(gen->f, v2);
-   ppc_release_vec_register(gen->f, v_one);
 }
 
 
@@ -1630,6 +1681,14 @@ emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
+/*
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+}
+*/
+
+
 static int
 emit_instruction(struct gen_context *gen,
                  struct tgsi_full_instruction *inst)
@@ -1643,6 +1702,10 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_LOGBASE2:
       emit_unaryop(gen, inst);
       break;
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_RCP:
+      emit_scalar_unaryop(gen, inst);
+      break;
    case TGSI_OPCODE_ADD:
    case TGSI_OPCODE_SUB:
    case TGSI_OPCODE_MUL:
@@ -1667,6 +1730,11 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_DPH:
       emit_dotprod(gen, inst);
       break;
+      /*
+   case TGSI_OPCODE_LIT:
+      emit_lit(gen, inst);
+      break;
+      */
    case TGSI_OPCODE_END:
       /* normal end */
       return 1;
@@ -2715,6 +2783,8 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
    gen.immed_reg = ppc_reserve_register(func, 6);
    gen.const_reg = ppc_reserve_register(func, 7);
+   gen.one_vec = -1;
+   gen.bit31_vec = -1;
 
    emit_prologue(func);
 
-- 
cgit v1.2.3


From 7b1d08738f30d0fec2f07568b16e08c4fdddeeac Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:25:04 -0600
Subject: cell: turn on PPC assembly vertex transform

gears runs with it now (3x faster FPS than before).
---
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index 990a659f27..fcc9cbfec5 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -203,9 +203,6 @@ draw_create_vs_ppc(struct draw_context *draw,
 {
    struct draw_ppc_vertex_shader *vs;
 
-   /* XXX temporary short-circuit */
-   return NULL;
-
    vs = CALLOC_STRUCT( draw_ppc_vertex_shader );
    if (vs == NULL) 
       return NULL;
@@ -233,7 +230,7 @@ draw_create_vs_ppc(struct draw_context *draw,
 
    vs->machine = &draw->vs.machine;
    
-   ppc_init_func( &vs->ppc_program, 1000 ); /* XXX fix limit */
+   ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */
 
    if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
 			&vs->ppc_program, 
-- 
cgit v1.2.3


From 519c2dbed57b3c5e1717a62df5d5f8b908a1acd6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:30:00 -0600
Subject: gallium: remove SSE remnants from tgsi_ppc.c

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 2987 +++++----------------------------
 1 file changed, 417 insertions(+), 2570 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 3637772102..432ec7459b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -44,14 +44,6 @@
 #include "rtasm/rtasm_ppc.h"
 
 
-/* for 1/sqrt()
- *
- * This costs about 100fps (close to 10%) in gears:
- */
-#define HIGH_PRECISION 1
-
-#define FAST_MATH 1
-
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
@@ -95,2452 +87,515 @@ struct gen_context
 };
 
 
-
-#if 0000
-
 /**
- * X86 utility functions.
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
  */
-
-static struct x86_reg
-make_xmm(
-   unsigned xmm )
+static int
+gen_one_vec(struct gen_context *gen)
 {
-   return x86_make_reg(
-      file_XMM,
-      (enum x86_reg_name) xmm );
+   if (gen->one_vec < 0) {
+      gen->one_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
+   }
+   return gen->one_vec;
 }
 
 /**
- * X86 register mapping helpers.
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
  */
-
-static struct x86_reg
-get_const_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_CX );
-}
-
-static struct x86_reg
-get_input_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_AX );
-}
-
-static struct x86_reg
-get_output_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DX );
-}
-
-static struct x86_reg
-get_temp_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_BX );
-}
-
-static struct x86_reg
-get_coef_base( void )
+static int
+gen_get_bit31_vec(struct gen_context *gen)
 {
-   return get_output_base();
+   if (gen->bit31_vec < 0) {
+      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
+      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
+   }
+   return gen->bit31_vec;
 }
 
-static struct x86_reg
-get_immediate_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DI );
-}
 
 
 /**
- * Data access helpers.
+ * Register fetch.
  */
-
-
-static struct x86_reg
-get_immediate(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_immediate_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_const(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_const_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_input(
-   unsigned vec,
-   unsigned chan )
+static void
+emit_fetch(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_src_register *reg,
+           const unsigned chan_index)
 {
-   return x86_make_disp(
-      get_input_base(),
-      (vec * 4 + chan) * 16 );
-}
+   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
 
-static struct x86_reg
-get_output(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_output_base(),
-      (vec * 4 + chan) * 16 );
-}
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+            ppc_li(gen->f, offset_reg, offset);
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our constants start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
+            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      default:
+         assert( 0 );
+      }
+      break;
+   case TGSI_EXTSWIZZLE_ZERO:
+      ppc_vload_float(gen->f, vec_reg, 0.0f);
+      break;
+   case TGSI_EXTSWIZZLE_ONE:
+      {
+         int one_vec = gen_one_vec(gen);
+         ppc_vecmove(gen->f, vec_reg, one_vec);
+      }
+      break;
+   default:
+      assert( 0 );
+   }
 
-static struct x86_reg
-get_temp(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_temp_base(),
-      (vec * 4 + chan) * 16 );
-}
+   {
+      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+         int bit31_vec = gen_get_bit31_vec(gen);
 
-static struct x86_reg
-get_coef(
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   return x86_make_disp(
-      get_coef_base(),
-      ((vec * 3 + member) * 4 + chan) * 4 );
+         switch (sign_op) {
+         case TGSI_UTIL_SIGN_CLEAR:
+            /* vec = vec & ~bit31 */
+            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_SET:
+            /* vec = vec | bit31 */
+            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_TOGGLE:
+            /* vec = vec ^ bit31 */
+            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         default:
+            assert(0);
+         }
+      }
+   }
 }
 
+#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
+   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
 
-static void
-emit_ret(
-   struct x86_function  *func )
-{
-   x86_ret( func );
-}
-
-#endif
 
-/**
- * Data fetch helpers.
- */
 
-#if 00
 /**
- * Copy a shader constant to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src const buffer index
- * \param chan  src channel to fetch (X, Y, Z or W)
+ * Register store.
  */
 static void
-emit_const(
-   struct x86_function *func,
-   uint xmm,
-   int vec,
-   uint chan,
-   uint indirect,
-   uint indirectFile,
-   int indirectIndex )
+emit_store(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           unsigned chan_index)
 {
-   if (indirect) {
-      struct x86_reg r0 = get_input_base();
-      struct x86_reg r1 = get_output_base();
-      uint i;
-
-      assert( indirectFile == TGSI_FILE_ADDRESS );
-      assert( indirectIndex == 0 );
-
-      x86_push( func, r0 );
-      x86_push( func, r1 );
-
-      for (i = 0; i < QUAD_SIZE; i++) {
-         x86_lea( func, r0, get_const( vec, chan ) );
-         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
-
-         /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
-          */
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-
-         x86_add( func, r0, r1 );
-         x86_mov( func, r1, x86_deref( r0 ) );
-         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
       }
-
-      x86_pop( func, r1 );
-      x86_pop( func, r0 );
-
-      sse_movaps(
+      break;
+   case TGSI_FILE_TEMPORARY:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+#if 0
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
          func,
-         make_xmm( xmm ),
-         get_temp( TEMP_R0, CHAN_X ) );
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+#endif
+   default:
+      assert( 0 );
    }
-   else {
-      assert( vec >= 0 );
 
-      sse_movss(
-         func,
-         make_xmm( xmm ),
-         get_const( vec, chan ) );
-      sse_shufps(
-         func,
-         make_xmm( xmm ),
-         make_xmm( xmm ),
-         SHUF( 0, 0, 0, 0 ) );
+#if 0
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
    }
+#endif
 }
 
-static void
-emit_immediate(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_immediate( vec, chan ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
-}
 
+#define STORE( GEN, INST, VEC_REG, INDEX, CHAN )\
+   emit_store( GEN, VEC_REG, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
 
-/**
- * Copy a shader input to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src input attrib
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_inputf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm ),
-      get_input( vec, chan ) );
-}
 
-/**
- * Store an xmm register to a shader output
- * \param xmm  the source xmm register
- * \param vec  the dest output attrib
- * \param chan  src dest channel to store (X, Y, Z or W)
- */
-static void
-emit_output(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_output( vec, chan ),
-      make_xmm( xmm ) );
-}
 
-/**
- * Copy a shader temporary to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src temp register
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
 static void
-emit_tempf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   sse_movaps(
-      func,
-      make_xmm( xmm ),
-      get_temp( vec, chan ) );
-}
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
 
-/**
- * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
- * \param xmm  the destination xmm register
- * \param vec  the src input/attribute coefficient index
- * \param chan  src channel to fetch (X, Y, Z or W)
- * \param member  0=a0, 1=dadx, 2=dady
- */
-static void
-emit_coef(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_coef( vec, chan, member ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
+   FETCH(gen, *inst, v0, 0, CHAN_X);
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_RSQ:
+      /* v1 = 1.0 / sqrt(v0) */
+      ppc_vrsqrtefp(gen->f, v1, v0);
+      break;
+   case TGSI_OPCODE_RCP:
+      /* v1 = 1.0 / v0 */
+      ppc_vrefp(gen->f, v1, v0);
+      break;
+   default:
+      assert(0);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE(gen, *inst, v1, 0, chan_index);
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
 }
 
-/**
- * Data store helpers.
- */
 
 static void
-emit_inputs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_input( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_temps(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movaps(
-      func,
-      get_temp( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_addrs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   assert( vec == 0 );
-
-   emit_temps(
-      func,
-      xmm,
-      vec + TGSI_EXEC_TEMP_ADDR,
-      chan );
-}
-
-/**
- * Coefficent fetch helpers.
- */
-
-static void
-emit_coef_a0(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      0 );
-}
-
-static void
-emit_coef_dadx(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      1 );
-}
-
-static void
-emit_coef_dady(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      2 );
-}
-#endif
-
-
-/**
- * Function call helpers.
- */
-
-#if 00
-/**
- * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be 
- * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
- * that the stack pointer is 16 byte aligned, as expected.
- */
-static void
-emit_func_call_dst(
-   struct x86_function *func,
-   unsigned xmm_save,
-   unsigned xmm_dst,
-   void (PIPE_CDECL *code)() )
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-   unsigned i, n, xmm;
-   unsigned xmm_mask;
-   
-   /* Bitmask of the xmm registers to save */
-   xmm_mask = (1 << xmm_save) - 1;
-   xmm_mask &= ~(1 << xmm_dst);
-
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
-
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-   
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i))
-         ++n;
-   
-   x86_sub_imm(
-      func, 
-      x86_make_reg( file_REG32, reg_SP ),
-      n*16);
-
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i)) {
-         sse_movups(
-            func,
-            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
-            make_xmm( xmm ) );
-         ++n;
-      }
-   
-   x86_lea(
-      func,
-      ecx,
-      get_temp( TEMP_R0, 0 ) );
-   
-   x86_push( func, ecx );
-   x86_mov_reg_imm( func, ecx, (unsigned long) code );
-   x86_call( func, ecx );
-   x86_pop(func, ecx );
-   
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i)) {
-         sse_movups(
-            func,
-            make_xmm( xmm ),
-            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
-         ++n;
+   int v0 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ABS:
+         /* turn off the most significant bit of each vector float word */
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
+            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
+            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_FLOOR:
+         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
+         break;
+      case TGSI_OPCODE_FRAC:
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
+            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_EXPBASE2:
+         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
+         break;
+      case TGSI_OPCODE_LOGBASE2:
+         /* XXX this may be broken! */
+         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
+         break;
+      case TGSI_OPCODE_MOV:
+         /* nothing */
+         break;
+      default:
+         assert(0);
       }
-   
-   x86_add_imm(
-      func, 
-      x86_make_reg( file_REG32, reg_SP ),
-      n*16);
-
-   /* Restore GP registers in a reverse order.
-    */
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-
-   sse_movaps(
-      func,
-      make_xmm( xmm_dst ),
-      get_temp( TEMP_R0, 0 ) );
-}
-
-static void
-emit_func_call_dst_src(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst,
-   unsigned xmm_src,
-   void (PIPE_CDECL *code)() )
-{
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 1 ),
-      make_xmm( xmm_src ) );
-
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      code );
-}
-
-/*
- * Fast SSE2 implementation of special math functions.
- */
-
-#define POLY0(x, c0) _mm_set1_ps(c0)
-#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
-#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-
-#define EXP_POLY_DEGREE 3
-#define LOG_POLY_DEGREE 5
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128 
-exp2f4(__m128 x)
-{
-   __m128i ipart;
-   __m128 fpart, expipart, expfpart;
-
-   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
-   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
-
-   /* ipart = int(x - 0.5) */
-   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
-
-   /* fpart = x - ipart */
-   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
-
-   /* expipart = (float) (1 << ipart) */
-   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
-
-   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
-#if EXP_POLY_DEGREE == 5
-   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
-#elif EXP_POLY_DEGREE == 4
-   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
-#elif EXP_POLY_DEGREE == 3
-   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
-#elif EXP_POLY_DEGREE == 2
-   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
-#else
-#error
-#endif
-
-   return _mm_mul_ps(expipart, expfpart);
-}
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128 
-log2f4(__m128 x)
-{
-   __m128i expmask = _mm_set1_epi32(0x7f800000);
-   __m128i mantmask = _mm_set1_epi32(0x007fffff);
-   __m128 one = _mm_set1_ps(1.0f);
-
-   __m128i i = _mm_castps_si128(x);
-
-   /* exp = (float) exponent(x) */
-   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
-
-   /* mant = (float) mantissa(x) */
-   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
-
-   __m128 logmant;
-
-   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
-    * These coefficients can be generate with 
-    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
-    */
-#if LOG_POLY_DEGREE == 6
-   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
-#elif LOG_POLY_DEGREE == 5
-   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
-#elif LOG_POLY_DEGREE == 4
-   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
-#elif LOG_POLY_DEGREE == 3
-   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
-#else
-#error
-#endif
-
-   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
-
-   return _mm_add_ps(logmant, exp);
-}
-
-static INLINE __m128
-powf4(__m128 x, __m128 y)
-{
-   return exp2f4(_mm_mul_ps(log2f4(x), y));
-}
-
-
-/**
- * Low-level instruction translators.
- */
-
-static void
-emit_abs(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_andps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_7FFFFFFF_I,
-         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
-}
-
-static void
-emit_add(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_addps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void PIPE_CDECL
-cos4f(
-   float *store )
-{
-   store[0] = cosf( store[0] );
-   store[1] = cosf( store[1] );
-   store[2] = cosf( store[2] );
-   store[3] = cosf( store[3] );
-}
-
-static void
-emit_cos(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save, 
-      xmm_dst,
-      cos4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
-__attribute__((force_align_arg_pointer))
-#endif
-ex24f(
-   float *store )
-{
-   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
-}
-
-static void
-emit_ex2(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      ex24f );
-}
-
-static void
-emit_f2it(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse2_cvttps2dq(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ) );
-}
-
-static void PIPE_CDECL
-flr4f(
-   float *store )
-{
-   store[0] = floorf( store[0] );
-   store[1] = floorf( store[1] );
-   store[2] = floorf( store[2] );
-   store[3] = floorf( store[3] );
-}
-
-static void
-emit_flr(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      flr4f );
-}
-
-static void PIPE_CDECL
-frc4f(
-   float *store )
-{
-   store[0] -= floorf( store[0] );
-   store[1] -= floorf( store[1] );
-   store[2] -= floorf( store[2] );
-   store[3] -= floorf( store[3] );
-}
-
-static void
-emit_frc(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      frc4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
-__attribute__((force_align_arg_pointer))
-#endif
-lg24f(
-   float *store )
-{
-   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
-}
-
-static void
-emit_lg2(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      lg24f );
-}
-
-static void
-emit_MOV(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_mul (struct x86_function *func,
-          unsigned xmm_dst,
-          unsigned xmm_src)
-{
-   sse_mulps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_neg(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_xorps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
-__attribute__((force_align_arg_pointer))
-#endif
-pow4f(
-   float *store )
-{
-#if 1
-   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
-#else
-   store[0] = powf( store[0], store[4] );
-   store[1] = powf( store[1], store[5] );
-   store[2] = powf( store[2], store[6] );
-   store[3] = powf( store[3], store[7] );
-#endif
-}
-
-static void
-emit_pow(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   emit_func_call_dst_src(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_src,
-      pow4f );
-}
-
-static void
-emit_rcp (
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.  Need to either emit a proper divide or use the
-    * iterative technique described below in emit_rsqrt().
-    */
-   sse2_rcpps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_rsqrt(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-#if HIGH_PRECISION
-   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
-    * implementations, it is possible to improve its precision at
-    * fairly low cost, using a newton/raphson step, as below:
-    * 
-    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
-    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
-    *
-    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
-    */
-   {
-      struct x86_reg dst = make_xmm( xmm_dst );
-      struct x86_reg src = make_xmm( xmm_src );
-      struct x86_reg tmp0 = make_xmm( 2 );
-      struct x86_reg tmp1 = make_xmm( 3 );
-
-      assert( xmm_dst != xmm_src );
-      assert( xmm_dst != 2 && xmm_dst != 3 );
-      assert( xmm_src != 2 && xmm_src != 3 );
-
-      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
-      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
-      sse_rsqrtps( func, tmp1, src  );
-      sse_mulps(   func, src,  tmp1 );
-      sse_mulps(   func, dst,  tmp1 );
-      sse_mulps(   func, src,  tmp1 );
-      sse_subps(   func, tmp0, src  );
-      sse_mulps(   func, dst,  tmp0 );
+      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
    }
-#else
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.
-    */
-   sse_rsqrtps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-#endif
-}
-
-static void
-emit_setsign(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_orps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-sin4f(
-   float *store )
-{
-   store[0] = sinf( store[0] );
-   store[1] = sinf( store[1] );
-   store[2] = sinf( store[2] );
-   store[3] = sinf( store[3] );
+   ppc_release_vec_register(gen->f, v0);
 }
 
-static void
-emit_sin (struct x86_function *func,
-          unsigned xmm_save, 
-          unsigned xmm_dst)
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      sin4f );
-}
 
 static void
-emit_sub(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   sse_subps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         ppc_vaddfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_SUB:
+         ppc_vsubfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MUL:
+         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
+         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_MIN:
+         ppc_vminfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MAX:
+         ppc_vmaxfp(gen->f, v2, v0, v1);
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
 }
-#endif
 
 
 /**
- * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
  */
-static int
-gen_one_vec(struct gen_context *gen)
-{
-   if (gen->one_vec < 0) {
-      gen->one_vec = ppc_allocate_vec_register(gen->f);
-      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
-   }
-   return gen->one_vec;
-}
-
-/**
- * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
- */
-static int
-gen_get_bit31_vec(struct gen_context *gen)
-{
-   if (gen->bit31_vec < 0) {
-      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
-      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
-      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
-   }
-   return gen->bit31_vec;
-}
-
-
-
-/**
- * Register fetch.
- */
-static void
-emit_fetch(struct gen_context *gen,
-           unsigned vec_reg,
-           const struct tgsi_full_src_register *reg,
-           const unsigned chan_index)
-{
-   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
-
-   switch (swizzle) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
-      switch (reg->SrcRegister.File) {
-      case TGSI_FILE_INPUT:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      case TGSI_FILE_TEMPORARY:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      case TGSI_FILE_IMMEDIATE:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      case TGSI_FILE_CONSTANT:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
-            ppc_li(gen->f, offset_reg, offset);
-            /* Load 4-byte word into vector register.
-             * The vector slot depends on the effective address we load from.
-             * We know that our constants start at a 16-byte boundary so we
-             * know that 'swizzle' tells us which vector slot will have the
-             * loaded word.  The other vector slots will be undefined.
-             */
-            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
-            /* splat word[swizzle] across the vector reg */
-            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      default:
-         assert( 0 );
-      }
-      break;
-   case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vload_float(gen->f, vec_reg, 0.0f);
-      break;
-   case TGSI_EXTSWIZZLE_ONE:
-      {
-         int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, vec_reg, one_vec);
-      }
-      break;
-   default:
-      assert( 0 );
-   }
-
-   {
-      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
-      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
-         int bit31_vec = gen_get_bit31_vec(gen);
-
-         switch (sign_op) {
-         case TGSI_UTIL_SIGN_CLEAR:
-            /* vec = vec & ~bit31 */
-            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
-            break;
-         case TGSI_UTIL_SIGN_SET:
-            /* vec = vec | bit31 */
-            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
-            break;
-         case TGSI_UTIL_SIGN_TOGGLE:
-            /* vec = vec ^ bit31 */
-            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
-            break;
-         default:
-            assert(0);
-         }
-      }
-   }
-}
-
-#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
-   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
-
-
-
-/**
- * Register store.
- */
-static void
-emit_store(struct gen_context *gen,
-           unsigned vec_reg,
-           const struct tgsi_full_dst_register *reg,
-           const struct tgsi_full_instruction *inst,
-           unsigned chan_index)
-{
-   switch (reg->DstRegister.File) {
-   case TGSI_FILE_OUTPUT:
-      {
-         int offset_reg = ppc_allocate_register(gen->f);
-         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
-         ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
-         ppc_release_register(gen->f, offset_reg);
-      }
-      break;
-   case TGSI_FILE_TEMPORARY:
-      {
-         int offset_reg = ppc_allocate_register(gen->f);
-         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
-         ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
-         ppc_release_register(gen->f, offset_reg);
-      }
-      break;
-#if 0
-   case TGSI_FILE_ADDRESS:
-      emit_addrs(
-         func,
-         xmm,
-         reg->DstRegister.Index,
-         chan_index );
-      break;
-#endif
-   default:
-      assert( 0 );
-   }
-
-#if 0
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      /* assert( 0 ); */
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
-      break;
-   }
-#endif
-}
-
-
-#define STORE( GEN, INST, XMM, INDEX, CHAN )\
-   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
-
-
-
-#if 000
-/**
- * High-level instruction translators.
- */
-
-static void
-emit_kil(
-   struct x86_function *func,
-   const struct tgsi_full_src_register *reg )
-{
-   unsigned uniquemask;
-   unsigned registers[4];
-   unsigned nextregister = 0;
-   unsigned firstchan = ~0;
-   unsigned chan_index;
-
-   /* This mask stores component bits that were already tested. Note that
-    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      unsigned swizzle;
-
-      /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle(
-         reg,
-         chan_index );
-
-      /* check if the component has not been already tested */
-      if( !(uniquemask & (1 << swizzle)) ) {
-         uniquemask |= 1 << swizzle;
-
-         /* allocate register */
-         registers[chan_index] = nextregister;
-         emit_fetch(
-            func,
-            nextregister,
-            reg,
-            chan_index );
-         nextregister++;
-
-         /* mark the first channel used */
-         if( firstchan == ~0 ) {
-            firstchan = chan_index;
-         }
-      }
-   }
-
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      if( uniquemask & (1 << chan_index) ) {
-         sse_cmpps(
-            func,
-            make_xmm( registers[chan_index] ),
-            get_temp(
-               TGSI_EXEC_TEMP_00000000_I,
-               TGSI_EXEC_TEMP_00000000_C ),
-            cc_LessThan );
-
-         if( chan_index == firstchan ) {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               make_xmm( registers[chan_index] ) );
-         }
-         else {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_DX ),
-               make_xmm( registers[chan_index] ) );
-            x86_or(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               x86_make_reg( file_REG32, reg_DX ) );
-         }
-      }
-   }
-
-   x86_or(
-      func,
-      get_temp(
-         TGSI_EXEC_TEMP_KILMASK_I,
-         TGSI_EXEC_TEMP_KILMASK_C ),
-      x86_make_reg( file_REG32, reg_AX ) );
-
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-}
-
-
-static void
-emit_kilp(
-   struct x86_function *func )
-{
-   /* XXX todo / fix me */
-}
-
-
-static void
-emit_setcc(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst,
-   enum sse_cc cc )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ),
-         cc );
-      sse_andps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TEMP_ONE_I,
-            TEMP_ONE_C ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-
-static void
-emit_cmp(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      FETCH( func, *inst, 2, 2, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TGSI_EXEC_TEMP_00000000_I,
-            TGSI_EXEC_TEMP_00000000_C ),
-         cc_LessThan );
-      sse_andps(
-         func,
-         make_xmm( 1 ),
-         make_xmm( 0 ) );
-      sse_andnps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 2 ) );
-      sse_orps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-#endif
-
-
-static void
-emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
    int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-
-   FETCH(gen, *inst, v0, 0, CHAN_X);
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_RSQ:
-      /* v1 = 1.0 / sqrt(v0) */
-      ppc_vrsqrtefp(gen->f, v1, v0);
-      break;
-   case TGSI_OPCODE_RCP:
-      /* v1 = 1.0 / v0 */
-      ppc_vrefp(gen->f, v1, v0);
-      break;
-   default:
-      assert(0);
-   }
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      STORE(gen, *inst, v1, 0, chan_index);
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-}
-
-
-static void
-emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, 0, 0, chan_index);   /* v0 = srcreg[0] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_ABS:
-         /* turn off the most significant bit of each vector float word */
-         {
-            int v1 = ppc_allocate_vec_register(gen->f);
-            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
-            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
-            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
-            ppc_release_vec_register(gen->f, v1);
-         }
-         break;
-      case TGSI_OPCODE_FLOOR:
-         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
-         break;
-      case TGSI_OPCODE_FRAC:
-         {
-            int v1 = ppc_allocate_vec_register(gen->f);
-            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
-            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
-            ppc_release_vec_register(gen->f, v1);
-         }
-         break;
-      case TGSI_OPCODE_EXPBASE2:
-         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
-         break;
-      case TGSI_OPCODE_LOGBASE2:
-         /* XXX this may be broken! */
-         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
-         break;
-      case TGSI_OPCODE_MOV:
-         /* nothing */
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-}
-
-
-static void
-emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_ADD:
-         ppc_vaddfp(gen->f, v2, v0, v1);
-         break;
-      case TGSI_OPCODE_SUB:
-         ppc_vsubfp(gen->f, v2, v0, v1);
-         break;
-      case TGSI_OPCODE_MUL:
-         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
-         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */
-         break;
-      case TGSI_OPCODE_MIN:
-         ppc_vminfp(gen->f, v2, v0, v1);
-         break;
-      case TGSI_OPCODE_MAX:
-         ppc_vmaxfp(gen->f, v2, v0, v1);
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
-
-
-/**
- * Vector comparisons, resulting in 1.0 or 0.0 values.
- */
-static void
-emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   boolean complement = FALSE;
-   int one_vec = gen_one_vec(gen);
-
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_SNE:
-         complement = TRUE;
-         /* fall-through */
-      case TGSI_OPCODE_SEQ:
-         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
-         break;
-
-      case TGSI_OPCODE_SGE:
-         complement = TRUE;
-         /* fall-through */
-      case TGSI_OPCODE_SLT:
-         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
-         break;
-
-      case TGSI_OPCODE_SLE:
-         complement = TRUE;
-         /* fall-through */
-      case TGSI_OPCODE_SGT:
-         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
-         break;
-      default:
-         assert(0);
-      }
-
-      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
-
-      if (complement)
-         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
-      else
-         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
-
-      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
-   }
-
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
-
-
-static void
-emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-
-   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
-
-   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
-   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
-   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
-
-   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
-   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
-   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
-
-   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
-   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
-   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
-
-   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
-      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
-      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
-      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
-   }
-   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
-      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
-      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
-   }
-
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
-
-
-static void
-emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   int v3 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_MAD:
-         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
-         break;
-      case TGSI_OPCODE_LRP:
-         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
-         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-   ppc_release_vec_register(gen->f, v3);
-}
-
-
-/*
-static void
-emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-}
-*/
-
-
-static int
-emit_instruction(struct gen_context *gen,
-                 struct tgsi_full_instruction *inst)
-{
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_ABS:
-   case TGSI_OPCODE_FLOOR:
-   case TGSI_OPCODE_FRAC:
-   case TGSI_OPCODE_EXPBASE2:
-   case TGSI_OPCODE_LOGBASE2:
-      emit_unaryop(gen, inst);
-      break;
-   case TGSI_OPCODE_RSQ:
-   case TGSI_OPCODE_RCP:
-      emit_scalar_unaryop(gen, inst);
-      break;
-   case TGSI_OPCODE_ADD:
-   case TGSI_OPCODE_SUB:
-   case TGSI_OPCODE_MUL:
-   case TGSI_OPCODE_MIN:
-   case TGSI_OPCODE_MAX:
-      emit_binop(gen, inst);
-      break;
-   case TGSI_OPCODE_SEQ:
-   case TGSI_OPCODE_SNE:
-   case TGSI_OPCODE_SLT:
-   case TGSI_OPCODE_SGT:
-   case TGSI_OPCODE_SLE:
-   case TGSI_OPCODE_SGE:
-      emit_inequality(gen, inst);
-      break;
-   case TGSI_OPCODE_MAD:
-   case TGSI_OPCODE_LRP:
-      emit_triop(gen, inst);
-      break;
-   case TGSI_OPCODE_DP3:
-   case TGSI_OPCODE_DP4:
-   case TGSI_OPCODE_DPH:
-      emit_dotprod(gen, inst);
-      break;
-      /*
-   case TGSI_OPCODE_LIT:
-      emit_lit(gen, inst);
-      break;
-      */
-   case TGSI_OPCODE_END:
-      /* normal end */
-      return 1;
-   default:
-      return 0;
-   }
-
-#if 0
-   unsigned chan_index;
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ARL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_f2it( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LIT:
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-         emit_tempf(
-            func,
-            0,
-            TEMP_ONE_I,
-            TEMP_ONE_C);
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-            STORE( func, *inst, 0, 0, CHAN_X );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-            STORE( func, *inst, 0, 0, CHAN_W );
-         }
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_maxps(
-               func,
-               make_xmm( 0 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            STORE( func, *inst, 0, 0, CHAN_Y );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-            /* XMM[1] = SrcReg[0].yyyy */
-            FETCH( func, *inst, 1, 0, CHAN_Y );
-            /* XMM[1] = max(XMM[1], 0) */
-            sse_maxps(
-               func,
-               make_xmm( 1 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            /* XMM[2] = SrcReg[0].wwww */
-            FETCH( func, *inst, 2, 0, CHAN_W );
-            /* XMM[2] = min(XMM[2], 128.0) */
-            sse_minps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_128_I,
-                  TGSI_EXEC_TEMP_128_C ) );
-            /* XMM[2] = max(XMM[2], -128.0) */
-            sse_maxps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_MINUS_128_I,
-                  TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 3, 1, 2 );
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_xorps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 2 ) );
-            sse_cmpps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 0 ),
-               cc_LessThanEqual );
-            sse_andps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 1 ) );
-            STORE( func, *inst, 2, 0, CHAN_Z );
-         }
-      }
-      break;
-
-   case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rcp( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rsqrt( func, 1, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 1, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EXP:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_MOV( func, 1, 0 );
-            emit_flr( func, 2, 1 );
-            /* dst.x = ex2(floor(src.x)) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               emit_MOV( func, 2, 1 );
-               emit_ex2( func, 3, 2 );
-               STORE( func, *inst, 2, 0, CHAN_X );
-            }
-            /* dst.y = src.x - floor(src.x) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_MOV( func, 2, 0 );
-               emit_sub( func, 2, 1 );
-               STORE( func, *inst, 2, 0, CHAN_Y );
-            }
-         }
-         /* dst.z = ex2(src.x) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 3, 0 );
-            STORE( func, *inst, 0, 0, CHAN_Z );
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_LOG:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_abs( func, 0 );
-         emit_MOV( func, 1, 0 );
-         emit_lg2( func, 2, 1 );
-         /* dst.z = lg2(abs(src.x)) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            STORE( func, *inst, 1, 0, CHAN_Z );
-         }
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 2, 1 );
-            /* dst.x = floor(lg2(abs(src.x))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               STORE( func, *inst, 1, 0, CHAN_X );
-            }
-            /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 2, 1 );
-               emit_rcp( func, 1, 1 );
-               emit_mul( func, 0, 1 );
-               STORE( func, *inst, 0, 0, CHAN_Y );
-            }
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MUL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ADD:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_add( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP4:
-   /* TGSI_OPCODE_DOT4 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul(func, 1, 2 );
-      emit_add(func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_W );
-      FETCH( func, *inst, 2, 1, CHAN_W );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DST:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_tempf(
-            func,
-            0,
-            TEMP_ONE_I,
-            TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 1, 1, CHAN_Y );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         FETCH( func, *inst, 0, 0, CHAN_Z );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-         FETCH( func, *inst, 0, 1, CHAN_W );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MIN:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_minps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MAX:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_maxps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
-      emit_setcc( func, inst, cc_LessThan );
-      break;
-
-   case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
-      emit_setcc( func, inst, cc_NotLessThan );
-      break;
-
-   case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SUB:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_sub( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LERP:
-   /* TGSI_OPCODE_LRP */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_sub( func, 1, 2 );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CND0:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DOT2ADD:
-   /* TGSI_OPCODE_DP2A */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_INDEX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NEGATE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_FRAC:
-   /* TGSI_OPCODE_FRC */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CLAMP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_FLOOR:
-   /* TGSI_OPCODE_FLR */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ROUND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_EXPBASE2:
-   /* TGSI_OPCODE_EX2 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LOGBASE2:
-   /* TGSI_OPCODE_LG2 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_POWER:
-   /* TGSI_OPCODE_POW */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CROSSPRODUCT:
-   /* TGSI_OPCODE_XPD */
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( func, *inst, 1, 1, CHAN_Z );
-         FETCH( func, *inst, 3, 0, CHAN_Z );
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 4, 1, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_MOV( func, 2, 0 );
-         emit_mul( func, 2, 1 );
-         emit_MOV( func, 5, 3 );
-         emit_mul( func, 5, 4 );
-         emit_sub( func, 2, 5 );
-         STORE( func, *inst, 2, 0, CHAN_X );
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 2, 1, CHAN_X );
-         FETCH( func, *inst, 5, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         emit_mul( func, 3, 2 );
-         emit_mul( func, 1, 5 );
-         emit_sub( func, 3, 1 );
-         STORE( func, *inst, 3, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_mul( func, 5, 4 );
-         emit_mul( func, 0, 2 );
-         emit_sub( func, 5, 0 );
-         STORE( func, *inst, 5, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MULTIPLYMATRIX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ABS:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_abs( func, 0) ;
-
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RCC:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DPH:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 1, CHAN_W );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_COS:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DDX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DDY:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_KILP:
-      /* predicated kill */
-      emit_kilp( func );
-      return 0; /* XXX fix me */
-      break;
-
-   case TGSI_OPCODE_KIL:
-      /* conditional kill */
-      emit_kil( func, &inst->FullSrcRegisters[0] );
-      break;
-
-   case TGSI_OPCODE_PK2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SEQ:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SGT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SIN:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SNE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_STR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TEX:
-      if (0) {
-	 /* Disable dummy texture code: 
-	  */
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-	    STORE( func, *inst, 0, 0, chan_index );
-	 }
-      }
-      else {
-	 return 0;
-      }
-      break;
-
-   case TGSI_OPCODE_TXD:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_X2D:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_BRA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CAL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RET:
-      emit_ret( func );
-      break;
-
-   case TGSI_OPCODE_END:
-      break;
-
-   case TGSI_OPCODE_SSG:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CMP:
-      emit_cmp (func, inst);
-      break;
-
-   case TGSI_OPCODE_SCS:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TGSI_EXEC_TEMP_00000000_I,
-	    TGSI_EXEC_TEMP_00000000_C );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_TXB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NRM:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DIV:
-      return 0;
-      break;
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   boolean complement = FALSE;
+   int one_vec = gen_one_vec(gen);
 
-   case TGSI_OPCODE_DP2:
-      return 0;
-      break;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
 
-   case TGSI_OPCODE_TXL:
-      return 0;
-      break;
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SNE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SEQ:
+         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+         break;
 
-   case TGSI_OPCODE_BRK:
-      return 0;
-      break;
+      case TGSI_OPCODE_SGE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SLT:
+         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+         break;
 
-   case TGSI_OPCODE_IF:
-      return 0;
-      break;
+      case TGSI_OPCODE_SLE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SGT:
+         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+         break;
+      default:
+         assert(0);
+      }
 
-   case TGSI_OPCODE_LOOP:
-      return 0;
-      break;
+      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
 
-   case TGSI_OPCODE_REP:
-      return 0;
-      break;
+      if (complement)
+         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
+      else
+         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
 
-   case TGSI_OPCODE_ELSE:
-      return 0;
-      break;
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
 
-   case TGSI_OPCODE_ENDIF:
-      return 0;
-      break;
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
 
-   case TGSI_OPCODE_ENDLOOP:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_ENDREP:
-      return 0;
-      break;
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
 
-   case TGSI_OPCODE_PUSHA:
-      return 0;
-      break;
+   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
 
-   case TGSI_OPCODE_POPA:
-      return 0;
-      break;
+   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
+   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   case TGSI_OPCODE_CEIL:
-      return 0;
-      break;
+   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
+   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   case TGSI_OPCODE_I2F:
-      return 0;
-      break;
+   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
+   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   case TGSI_OPCODE_NOT:
-      return 0;
-      break;
+   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
+      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+   }
+   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
+   }
 
-   case TGSI_OPCODE_TRUNC:
-      return 0;
-      break;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
 
-   case TGSI_OPCODE_SHL:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_SHR:
-      return 0;
-      break;
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v3 = ppc_allocate_vec_register(gen->f);   /* holds the result */
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {   /* fetch, combine, store per channel */
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:                     /* dst = src0 * src1 + src2 */
+         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_LRP:                     /* dst = src0 * (src1 - src2) + src2 */
+         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
+         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
+         break;
+      default:
+         assert(0);   /* emit_instruction only routes MAD and LRP here */
+      }
+      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
+   }
+   ppc_release_vec_register(gen->f, v0);   /* give back all four scratch registers */
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v3);
+}
 
-   case TGSI_OPCODE_AND:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_OR:
-      return 0;
-      break;
+/*
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+}
+*/
 
-   case TGSI_OPCODE_MOD:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_XOR:
-      return 0;
+static int
+emit_instruction(struct gen_context *gen,
+                 struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_FLOOR:
+   case TGSI_OPCODE_FRAC:
+   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_LOGBASE2:
+      emit_unaryop(gen, inst);
       break;
-
-   case TGSI_OPCODE_SAD:
-      return 0;
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_RCP:
+      emit_scalar_unaryop(gen, inst);
       break;
-
-   case TGSI_OPCODE_TXF:
-      return 0;
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      emit_binop(gen, inst);
       break;
-
-   case TGSI_OPCODE_TXQ:
-      return 0;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SGE:
+      emit_inequality(gen, inst);
       break;
-
-   case TGSI_OPCODE_CONT:
-      return 0;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_LRP:
+      emit_triop(gen, inst);
       break;
-
-   case TGSI_OPCODE_EMIT:
-      return 0;
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+      emit_dotprod(gen, inst);
       break;
-
-   case TGSI_OPCODE_ENDPRIM:
-      return 0;
+      /*
+   case TGSI_OPCODE_LIT:
+      emit_lit(gen, inst);
       break;
-
+      */
+   case TGSI_OPCODE_END:
+      /* normal end */
+      return 1;
    default:
       return 0;
    }
-#endif
+
    
    return 1;
 }
@@ -2608,133 +663,6 @@ emit_declaration(
    }
 }
 
-#if 0
-static void aos_to_soa( struct x86_function *func, 
-                        uint arg_aos,
-                        uint arg_soa, 
-                        uint arg_num, 
-                        uint arg_stride )
-{
-   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
-   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
-   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
-   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
-   int inner_loop;
-
-
-   /* Save EBX */
-   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
-
-   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
-   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
-   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
-
-   /* do */
-   inner_loop = x86_get_label( func );
-   {
-      x86_push( func, aos_input );
-      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_pop( func, aos_input );
-
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
-      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
-      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
-      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
-
-      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
-      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
-      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
-      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
-
-      /* Advance to next input */
-      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
-      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
-   }
-   /* while --num_inputs */
-   x86_dec( func, num_inputs );
-   x86_jcc( func, cc_NE, inner_loop );
-
-   /* Restore EBX */
-   x86_pop( func, aos_input );
-}
-#endif
-
-#if 0
-static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
-{
-   struct x86_reg soa_output;
-   struct x86_reg aos_output;
-   struct x86_reg num_outputs;
-   struct x86_reg temp;
-   int inner_loop;
-
-   soa_output = x86_make_reg( file_REG32, reg_AX );
-   aos_output = x86_make_reg( file_REG32, reg_BX );
-   num_outputs = x86_make_reg( file_REG32, reg_CX );
-   temp = x86_make_reg( file_REG32, reg_DX );
-
-   /* Save EBX */
-   x86_push( func, aos_output );
-
-   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
-   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
-   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
-
-   /* do */
-   inner_loop = x86_get_label( func );
-   {
-      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
-      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
-      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
-      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
-
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
-      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
-      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
-      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
-
-      x86_mov( func, temp, x86_fn_arg( func, stride ) );
-      x86_push( func, aos_output );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_pop( func, aos_output );
-
-      /* Advance to next output */
-      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
-      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
-   }
-   /* while --num_outputs */
-   x86_dec( func, num_outputs );
-   x86_jcc( func, cc_NE, inner_loop );
-
-   /* Restore EBX */
-   x86_pop( func, aos_output );
-}
-#endif
 
 
 static void
@@ -2788,67 +716,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    emit_prologue(func);
 
-   /*
-    * Different function args for vertex/fragment shaders:
-    */
-#if 0
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-      /* DECLARATION phase, do not load output argument. */
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      /* skipping outputs argument here */
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_coef_base(),
-         x86_fn_arg( func, 5 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 6 ) );
-   }
-   else {
-      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
-
-      if (do_swizzles)
-         aos_to_soa( func, 
-                     6,         /* aos_input */
-                     1,         /* machine->input */
-                     7,         /* num_inputs */
-                     8 );       /* input_stride */
-
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      x86_mov(
-         func,
-         get_output_base(),
-         x86_fn_arg( func, 2 ) );
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 5 ) );
-   }
-#endif
-
    while (!tgsi_parse_end_of_tokens(&parse) && ok) {
       tgsi_parse_token(&parse);
 
@@ -2860,19 +727,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-#if 0
-            if( !instruction_phase ) {
-               /* INSTRUCTION phase, overwrite coeff with output. */
-               instruction_phase = TRUE;
-               x86_mov(
-                  func,
-                  get_output_base(),
-                  x86_fn_arg( func, 2 ) );
-            }
-#endif
-         }
-
          ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
 
 	 if (!ok) {
@@ -2909,13 +763,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
       }
    }
 
-#if 0
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
-      if (do_swizzles)
-         soa_to_aos( func, 9, 2, 10, 11 );
-   }
-#endif
-
    emit_epilogue(func);
 
    tgsi_parse_free( &parse );
-- 
cgit v1.2.3


From 77160cd97b7f2181b7953bcc8d13e86055b819e3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:34:16 -0600
Subject: gallium: var renaming in tgsi_ppc.c

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 36 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 432ec7459b..c1e707657b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -117,11 +117,11 @@ gen_get_bit31_vec(struct gen_context *gen)
 
 
 /**
- * Register fetch.
+ * Register fetch, put result in 'dst_vec'.
  */
 static void
 emit_fetch(struct gen_context *gen,
-           unsigned vec_reg,
+           unsigned dst_vec,
            const struct tgsi_full_src_register *reg,
            const unsigned chan_index)
 {
@@ -138,7 +138,7 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
+            ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -147,7 +147,7 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+            ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -156,7 +156,7 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
+            ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -171,9 +171,9 @@ emit_fetch(struct gen_context *gen,
              * know that 'swizzle' tells us which vector slot will have the
              * loaded word.  The other vector slots will be undefined.
              */
-            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
+            ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
             /* splat word[swizzle] across the vector reg */
-            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
+            ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -182,12 +182,12 @@ emit_fetch(struct gen_context *gen,
       }
       break;
    case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vload_float(gen->f, vec_reg, 0.0f);
+      ppc_vload_float(gen->f, dst_vec, 0.0f);
       break;
    case TGSI_EXTSWIZZLE_ONE:
       {
          int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, vec_reg, one_vec);
+         ppc_vecmove(gen->f, dst_vec, one_vec);
       }
       break;
    default:
@@ -202,15 +202,15 @@ emit_fetch(struct gen_context *gen,
          switch (sign_op) {
          case TGSI_UTIL_SIGN_CLEAR:
             /* vec = vec & ~bit31 */
-            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
+            ppc_vandc(gen->f, dst_vec, dst_vec, bit31_vec);
             break;
          case TGSI_UTIL_SIGN_SET:
             /* vec = vec | bit31 */
-            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
+            ppc_vor(gen->f, dst_vec, dst_vec, bit31_vec);
             break;
          case TGSI_UTIL_SIGN_TOGGLE:
             /* vec = vec ^ bit31 */
-            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
+            ppc_vxor(gen->f, dst_vec, dst_vec, bit31_vec);
             break;
          default:
             assert(0);
@@ -219,17 +219,17 @@ emit_fetch(struct gen_context *gen,
    }
 }
 
-#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
-   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \
+   emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN )
 
 
 /**
- * Register store.
+ * Register store.  Store 'src_vec' at location indicated by 'reg'.
  */
 static void
 emit_store(struct gen_context *gen,
-           unsigned vec_reg,
+           unsigned src_vec,
            const struct tgsi_full_dst_register *reg,
            const struct tgsi_full_instruction *inst,
            unsigned chan_index)
@@ -240,7 +240,7 @@ emit_store(struct gen_context *gen,
          int offset_reg = ppc_allocate_register(gen->f);
          int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
          ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
+         ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
          ppc_release_register(gen->f, offset_reg);
       }
       break;
@@ -249,7 +249,7 @@ emit_store(struct gen_context *gen,
          int offset_reg = ppc_allocate_register(gen->f);
          int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
          ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+         ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
          ppc_release_register(gen->f, offset_reg);
       }
       break;
-- 
cgit v1.2.3


From 9e3ee82305b4602feca0253dc0e0c27f9bc9b05e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 16:57:22 -0600
Subject: gallium: PPC LIT instruction (not quite complete yet)

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 89 +++++++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index c1e707657b..edd535a884 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -535,12 +535,95 @@ emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
-/*
+
+/** Emit code approximating vr = pow(va, vb); vr, va, vb are vector register numbers */
+static void
+ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
+{
+   /* pow(a,b) ~= exp2(log2(a) * b) */
+   int t_vec = ppc_allocate_vec_register(f);     /* scratch: log2(va), then log2(va) * vb */
+   int zero_vec = ppc_allocate_vec_register(f);  /* addend for the multiply-add below */
+
+   ppc_vload_float(f, zero_vec, 0.0f);          /* zero = {0,0,0,0} */
+
+   ppc_vlogefp(f, t_vec, va);                   /* t = log2(va) */
+   ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec);  /* t = t * vb + 0 */
+   ppc_vexptefp(f, vr, t_vec);                  /* vr = 2^t */
+
+   ppc_release_vec_register(f, t_vec);          /* both scratch registers are returned */
+   ppc_release_vec_register(f, zero_vec);
+}
+
+
 static void
 emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
+   int one_vec = gen_one_vec(gen);
+   /* LIT: dst = { 1, max(src.x, 0), src.x > 0 ? pow(max(src.y, 0), src.w) : 0, 1 } */
+   /* X = 1.0 */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_X);
+   }
+
+   /* Compute Y, Z (both need the clamped src.x, so it is fetched once) */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int x_vec = ppc_allocate_vec_register(gen->f);
+      int zero_vec = ppc_allocate_vec_register(gen->f);
+
+      FETCH(gen, *inst, x_vec, 0, CHAN_X);        /* x_vec = src[0].x */
+
+      ppc_vload_float(gen->f, zero_vec, 0.0f);    /* zero = {0,0,0,0} */
+      ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+         STORE(gen, *inst, x_vec, 0, CHAN_Y);        /* store Y */
+      }
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+         int y_vec = ppc_allocate_vec_register(gen->f);
+         int z_vec = ppc_allocate_vec_register(gen->f);
+         int w_vec = ppc_allocate_vec_register(gen->f);
+         int pow_vec = ppc_allocate_vec_register(gen->f);
+         int pos_vec = ppc_allocate_vec_register(gen->f);
+         int c128_vec = ppc_allocate_vec_register(gen->f);
+
+         FETCH(gen, *inst, y_vec, 0, CHAN_Y);        /* y_vec = src[0].y */
+         ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
+
+         FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
+
+         /* XXX clamp to [-128, 128] not done yet; c128_vec is unused */
+         ppc_vload_float(gen->f, c128_vec, 128.0f);
+
+         /* if temp.x > 0
+          *    pow(tmp.y, tmp.w)
+          * else
+          *   0.0
+          */
+
+         ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec);      /* pow = pow(y, w) */
+         ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 ? ~0 : 0 */
+         ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
+
+         STORE(gen, *inst, z_vec, 0, CHAN_Z);             /* store Z */
+         ppc_release_vec_register(gen->f, y_vec);
+         ppc_release_vec_register(gen->f, z_vec);
+         ppc_release_vec_register(gen->f, w_vec);
+         ppc_release_vec_register(gen->f, pow_vec);
+         ppc_release_vec_register(gen->f, pos_vec);
+         ppc_release_vec_register(gen->f, c128_vec);      /* was leaked before */
+      }
+
+      ppc_release_vec_register(gen->f, x_vec);
+      ppc_release_vec_register(gen->f, zero_vec);
+   }
+
+   /* W = 1.0 */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_W);
+   }
 }
-*/
 
 
 static int
@@ -584,11 +667,9 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_DPH:
       emit_dotprod(gen, inst);
       break;
-      /*
    case TGSI_OPCODE_LIT:
       emit_lit(gen, inst);
       break;
-      */
    case TGSI_OPCODE_END:
       /* normal end */
       return 1;
-- 
cgit v1.2.3


From ae81aeb12868db219cbdc02437c481714cfed3f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 16:58:05 -0600
Subject: gallium: GALLIUM_NOPPC debug var to disable PPC codegen

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index edd535a884..9d7de41fe7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -776,15 +776,21 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
               float (*immediates)[4],
               boolean do_swizzles )
 {
+   static int use_ppc_asm = -1;
    struct tgsi_parse_context parse;
    /*boolean instruction_phase = FALSE;*/
    unsigned ok = 1;
    uint num_immediates = 0;
    struct gen_context gen;
 
-   util_init_math();
+   if (use_ppc_asm < 0) {
+      /* If GALLIUM_NOPPC is set, don't use PPC codegen */
+      use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE);
+   }
+   if (!use_ppc_asm)
+      return FALSE;
 
-   tgsi_parse_init( &parse, tokens );
+   util_init_math();
 
    gen.f = func;
    gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
@@ -797,6 +803,8 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    emit_prologue(func);
 
+   tgsi_parse_init( &parse, tokens );
+
    while (!tgsi_parse_end_of_tokens(&parse) && ok) {
       tgsi_parse_token(&parse);
 
-- 
cgit v1.2.3


From 3026616c48487a7561d8545c08950539f0ad51d1 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:17:11 -0600
Subject: gallium: added ppc_vzero()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 8 ++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 5 +++++
 2 files changed, 13 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 65df676eae..51d9b53657 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -669,6 +669,14 @@ ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
    ppc_vor(p, vD, vA, vA);
 }
 
+/** Set vector register to {0,0,0,0} */
+void
+ppc_vzero(struct ppc_function *p, uint vr)
+{
+   ppc_vxor(p, vr, vr, vr);
+}
+
+
 
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 9f1e3fcd84..f194d3be13 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -193,6 +193,11 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 extern void
 ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
 
+/** Set vector register to {0,0,0,0} */
+extern void
+ppc_vzero(struct ppc_function *p, uint vr);
+
+
 
 /**
  ** Vector shuffle / select / splat / etc
-- 
cgit v1.2.3


From abbbe876ac98596b143da295abf6887e0a4e50d2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:19:12 -0600
Subject: gallium: new PPC built-in constants array

It's hard to form PPC vector immediates so load them from an array.
---
 src/gallium/auxiliary/draw/draw_vs_ppc.c |  8 +++--
 src/gallium/auxiliary/tgsi/tgsi_ppc.c    | 61 ++++++++++++++++++++++++++++----
 src/gallium/auxiliary/tgsi/tgsi_ppc.h    |  3 ++
 3 files changed, 63 insertions(+), 9 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index fcc9cbfec5..8eff6d4fda 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -55,7 +55,8 @@ typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
                                              float (*outputs)[4][4],
                                              float (*temps)[4][4],
                                              float (*immeds)[4][4],
-                                             float (*consts)[4]);
+                                             float (*consts)[4],
+                                             const float *builtins);
 
 #if 0
    const struct tgsi_exec_vector *input,
@@ -151,7 +152,8 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
 #else
       shader->func(inputs_soa, outputs_soa, temps_soa,
 		   (float (*)[4][4]) shader->base.immediates,
-		   (float (*)[4]) constants);
+		   (float (*)[4]) constants,
+                   ppc_builtin_constants);
 
       /*output[0][0] = input[0][0] * 0.5;*/
 #endif
@@ -246,7 +248,9 @@ draw_create_vs_ppc(struct draw_context *draw,
    return &vs->base;
 
 fail:
+   /*
    debug_error("tgsi_emit_ppc() failed, falling back to interpreter\n");
+   */
 
    ppc_release_func( &vs->ppc_program );
    
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 9d7de41fe7..6b05fd16cf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -36,6 +36,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_sse.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
@@ -44,6 +45,14 @@
 #include "rtasm/rtasm_ppc.h"
 
 
+/**
+ * Since it's pretty much impossible to form PPC vector immediates, load
+ * them from memory here:
+ */
+const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+   1.0f, -128.0f, 128.0, 0.0
+};
+
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
@@ -81,12 +90,46 @@ struct gen_context
    int temps_reg;     /**< GP register pointing to temporary "registers" */
    int immed_reg;     /**< GP register pointing to immediates buffer */
    int const_reg;     /**< GP register pointing to constants buffer */
+   int builtins_reg;  /**< GP register pointing to built-in constants */
 
    int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
    int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
 };
 
 
+/**
+ * Load the given vector register with {value, value, value, value}.
+ * The value must be in the ppc_builtin_constants[] array.
+ * We wouldn't need this if there was a simple way to load PPC vector
+ * registers with immediate values!
+ */
+static void
+load_constant_vec(struct gen_context *gen, int dst_vec, float value)
+{
+   uint pos;
+   for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
+      if (ppc_builtin_constants[pos] == value) {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = pos * 4;
+
+         ppc_li(gen->f, offset_reg, offset);
+         /* Load 4-byte word into vector register.
+          * The vector slot depends on the effective address we load from.
+          * We know that our builtins start at a 16-byte boundary so we
+          * know that 'pos % 4' tells us which vector slot will have the
+          * loaded word.  The other vector slots will be undefined.
+          */
+         ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
+         /* splat word[pos % 4] across the vector reg */
+         ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
+         ppc_release_register(gen->f, offset_reg);
+         return;
+      }
+   }
+   assert(0 && "Need to add new constant to ppc_builtin_constants array");
+}
+
+
 /**
  * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
  */
@@ -95,7 +138,7 @@ gen_one_vec(struct gen_context *gen)
 {
    if (gen->one_vec < 0) {
       gen->one_vec = ppc_allocate_vec_register(gen->f);
-      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
+      load_constant_vec(gen, gen->one_vec, 1.0f);
    }
    return gen->one_vec;
 }
@@ -115,7 +158,6 @@ gen_get_bit31_vec(struct gen_context *gen)
 }
 
 
-
 /**
  * Register fetch, put result in 'dst_vec'.
  */
@@ -182,7 +224,7 @@ emit_fetch(struct gen_context *gen,
       }
       break;
    case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vload_float(gen->f, dst_vec, 0.0f);
+      ppc_vzero(gen->f, dst_vec);
       break;
    case TGSI_EXTSWIZZLE_ONE:
       {
@@ -544,7 +586,7 @@ ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
    int t_vec = ppc_allocate_vec_register(f);
    int zero_vec = ppc_allocate_vec_register(f);
 
-   ppc_vload_float(f, zero_vec, 0.0f);
+   ppc_vzero(f, zero_vec);
 
    ppc_vlogefp(f, t_vec, va);                   /* t = log2(va) */
    ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec);  /* t = t * vb */
@@ -573,7 +615,7 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
       FETCH(gen, *inst, x_vec, 0, CHAN_X);        /* x_vec = src[0].x */
 
-      ppc_vload_float(gen->f, zero_vec, 0.0f);    /* zero = {0,0,0,0} */
+      ppc_vzero(gen->f, zero_vec);                /* zero = {0,0,0,0} */
       ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
 
       if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
@@ -586,7 +628,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          int w_vec = ppc_allocate_vec_register(gen->f);
          int pow_vec = ppc_allocate_vec_register(gen->f);
          int pos_vec = ppc_allocate_vec_register(gen->f);
-         int c128_vec = ppc_allocate_vec_register(gen->f);
+         int p128_vec = ppc_allocate_vec_register(gen->f);
+         int n128_vec = ppc_allocate_vec_register(gen->f);
 
          FETCH(gen, *inst, y_vec, 0, CHAN_Y);        /* y_vec = src[0].y */
          ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
@@ -594,7 +637,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
 
          /* XXX clamp Y to [-128, 128] */
-         ppc_vload_float(gen->f, c128_vec, 128.0f);
+         load_constant_vec(gen, p128_vec, 128.0f);
+         load_constant_vec(gen, n128_vec, -128.0f);
 
          /* if temp.x > 0
           *    pow(tmp.y, tmp.w)
@@ -613,6 +657,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          ppc_release_vec_register(gen->f, w_vec);
          ppc_release_vec_register(gen->f, pow_vec);
          ppc_release_vec_register(gen->f, pos_vec);
+         ppc_release_vec_register(gen->f, p128_vec);
+         ppc_release_vec_register(gen->f, n128_vec);
       }
 
       ppc_release_vec_register(gen->f, x_vec);
@@ -798,6 +844,7 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
    gen.immed_reg = ppc_reserve_register(func, 6);
    gen.const_reg = ppc_reserve_register(func, 7);
+   gen.builtins_reg = ppc_reserve_register(func, 8);
    gen.one_vec = -1;
    gen.bit31_vec = -1;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
index 7cd2bf9aff..829ec075e7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
@@ -35,6 +35,9 @@ extern "C" {
 struct tgsi_token;
 struct ppc_function;
 
+extern const float ppc_builtin_constants[];
+
+
 boolean
 tgsi_emit_ppc(const struct tgsi_token *tokens,
               struct ppc_function *function,
-- 
cgit v1.2.3


From f8ab4feb75f4a592e23859813c093dcdbd4b8988 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:21:43 -0600
Subject: gallium: remove ppc_vload_float(), rename ppc_vecmove() ->
 ppc_vmove().

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 19 +------------------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  6 +-----
 src/gallium/auxiliary/tgsi/tgsi_ppc.c   |  2 +-
 3 files changed, 3 insertions(+), 24 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 51d9b53657..7dd8263749 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -603,23 +603,6 @@ ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
    emit_x(p, 31, vr, ra, rb, 71);
 }
 
-/** vector load float: vr = splats(imm) */
-void
-ppc_vload_float(struct ppc_function *p, uint vr, float imm)
-{
-   if (imm == 0.0f) {
-      ppc_vxor(p, vr, vr, vr);
-   }
-   else if (imm == 1.0f) {
-      /* use 2^0=1 to get 1.0 */
-      ppc_vxor(p, vr, vr, vr);  /* vr = {0,0,0,0} */
-      ppc_vexptefp(p, vr, vr);  /* vr = 0^0 */
-   }
-   else {
-      assert(0);
-   }
-}
-
 
 
@@ -664,7 +647,7 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
 
 /** Pseudo-instruction: vector move */
 void
-ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
+ppc_vmove(struct ppc_function *p, uint vD, uint vA)
 {
    ppc_vor(p, vD, vA, vA);
 }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index f194d3be13..f938d8d759 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -158,10 +158,6 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
-/** vector load float: vr = splats(imm) */
-extern void
-ppc_vload_float(struct ppc_function *p, uint vr, float imm);
-
 
 
 /**
@@ -191,7 +187,7 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 
 /** Pseudo-instruction: vector move */
 extern void
-ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
+ppc_vmove(struct ppc_function *p, uint vD, uint vA);
 
 /** Set vector register to {0,0,0,0} */
 extern void
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 6b05fd16cf..96beec0cc6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -229,7 +229,7 @@ emit_fetch(struct gen_context *gen,
    case TGSI_EXTSWIZZLE_ONE:
       {
          int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, dst_vec, one_vec);
+         ppc_vmove(gen->f, dst_vec, one_vec);
       }
       break;
    default:
-- 
cgit v1.2.3


From 0ac99457811eb766e9bdd3903857b5c0fdef7694 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:29:37 -0600
Subject: gallium: PPC: clamp y to [-128,128] for LIT

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 96beec0cc6..9ad7ecd7cf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -636,16 +636,17 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
          FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
 
-         /* XXX clamp Y to [-128, 128] */
+         /* clamp Y to [-128, 128] */
          load_constant_vec(gen, p128_vec, 128.0f);
          load_constant_vec(gen, n128_vec, -128.0f);
+         ppc_vmaxfp(gen->f, y_vec, y_vec, n128_vec); /* y = max(y, -128) */
+         ppc_vminfp(gen->f, y_vec, y_vec, p128_vec); /* y = min(y, 128) */
 
          /* if temp.x > 0
-          *    pow(tmp.y, tmp.w)
+          *    z = pow(tmp.y, tmp.w)
           * else
-          *   0.0
+          *    z = 0.0
           */
-
          ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec);      /* pow = pow(y, w) */
          ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
          ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
-- 
cgit v1.2.3


From 06c43beee08052bae3832586559889d74fb538b6 Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Thu, 23 Oct 2008 10:27:39 +0200
Subject: scons: Don't hardcode any drivers for the xlib winsys, just pick
 suitable ones.

---
 src/gallium/winsys/xlib/SConscript | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/winsys/xlib/SConscript b/src/gallium/winsys/xlib/SConscript
index 324fbef306..3aef3b6ced 100644
--- a/src/gallium/winsys/xlib/SConscript
+++ b/src/gallium/winsys/xlib/SConscript
@@ -5,8 +5,7 @@ Import('*')
 
 if env['platform'] == 'linux' \
         and 'mesa' in env['statetrackers'] \
-        and 'softpipe' in env['drivers'] \
-        and 'i965simple' in env['drivers'] \
+        and ('softpipe' or 'i915simple' or 'trace') in env['drivers'] \
         and not env['dri']:
 
     env = env.Clone()
@@ -22,15 +21,20 @@ if env['platform'] == 'linux' \
         'xfonts.c',
         'xm_api.c',
         'xm_winsys.c',
-        'xm_winsys_aub.c',
-        'brw_aub.c',
     ]
+
+    drivers = [];
+        
+    if 'softpipe' in env['drivers']:
+        drivers += [softpipe]
+
+    if 'i965simple' in env['drivers']:
+        drivers += [i965simple]
+        sources += [
+            'brw_aub.c',
+            'xm_winsys_aub.c',
+            ]
         
-    drivers = [
-        softpipe,
-        i965simple,
-    ]
-    
     if 'trace' in env['drivers']:
         env.Append(CPPDEFINES = 'GALLIUM_TRACE')
         drivers += [trace]
-- 
cgit v1.2.3


From 6b69e3c71741d99a54c6f4dcb605a3c241239aeb Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Thu, 23 Oct 2008 10:28:48 +0200
Subject: scons: ppc support.

---
 SConstruct                             | 2 ++
 common.py                              | 3 ++-
 scons/gallium.py                       | 1 +
 src/gallium/auxiliary/draw/SConscript  | 1 +
 src/gallium/auxiliary/rtasm/SConscript | 1 +
 src/gallium/auxiliary/tgsi/SConscript  | 1 +
 src/mesa/SConscript                    | 4 ++++
 7 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/SConstruct b/SConstruct
index c1dc624651..8c96817dae 100644
--- a/SConstruct
+++ b/SConstruct
@@ -70,12 +70,14 @@ platform = env['platform']
 
 # derived options
 x86 = machine == 'x86'
+ppc = machine == 'ppc'
 gcc = platform in ('linux', 'freebsd', 'darwin')
 msvc = platform in ('windows', 'winddk')
 
 Export([
 	'debug', 
 	'x86', 
+	'ppc', 
 	'dri', 
 	'llvm',
 	'platform',
diff --git a/common.py b/common.py
index dd64e0f434..cc2582f1a4 100644
--- a/common.py
+++ b/common.py
@@ -24,6 +24,7 @@ _machine_map = {
 	'i486': 'x86',
 	'i586': 'x86',
 	'i686': 'x86',
+	'ppc' : 'ppc',
 	'x86_64': 'x86_64',
 }
 if 'PROCESSOR_ARCHITECTURE' in os.environ:
@@ -56,7 +57,7 @@ def AddOptions(opts):
 	opts.Add(BoolOption('profile', 'profile build', 'no'))
 	#opts.Add(BoolOption('quiet', 'quiet command lines', 'no'))
 	opts.Add(EnumOption('machine', 'use machine-specific assembly code', default_machine,
-											 allowed_values=('generic', 'x86', 'x86_64')))
+											 allowed_values=('generic', 'ppc', 'x86', 'x86_64')))
 	opts.Add(EnumOption('platform', 'target platform', default_platform,
 											 allowed_values=('linux', 'cell', 'windows', 'winddk', 'wince')))
 	opts.Add(BoolOption('llvm', 'use LLVM', 'no'))
diff --git a/scons/gallium.py b/scons/gallium.py
index 3631607e66..2a42bdf2bb 100644
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -175,6 +175,7 @@ def generate(env):
     machine = env['machine']
     platform = env['platform']
     x86 = env['machine'] == 'x86'
+    ppc = env['machine'] == 'ppc'
     gcc = env['platform'] in ('linux', 'freebsd', 'darwin')
     msvc = env['platform'] in ('windows', 'winddk', 'wince')
 
diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript
index 544a04918b..5f05aa324a 100644
--- a/src/gallium/auxiliary/draw/SConscript
+++ b/src/gallium/auxiliary/draw/SConscript
@@ -38,6 +38,7 @@ draw = env.ConvenienceLibrary(
 		'draw_vs_aos_machine.c',
 		'draw_vs_exec.c',
 		'draw_vs_llvm.c',
+		'draw_vs_ppc.c',
 		'draw_vs_sse.c',
 		'draw_vs_varient.c'
 	])
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index 8ea25922aa..eb48368acc 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -6,6 +6,7 @@ rtasm = env.ConvenienceLibrary(
 		'rtasm_cpu.c',
 		'rtasm_execmem.c',
 		'rtasm_x86sse.c',
+		'rtasm_ppc.c',
 		'rtasm_ppc_spe.c',
 	])
 
diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript
index 45bf3f6d57..8200cce42f 100644
--- a/src/gallium/auxiliary/tgsi/SConscript
+++ b/src/gallium/auxiliary/tgsi/SConscript
@@ -12,6 +12,7 @@ tgsi = env.ConvenienceLibrary(
 		'tgsi_parse.c',
 		'tgsi_sanity.c',
 		'tgsi_scan.c',
+		'tgsi_ppc.c',
 		'tgsi_sse2.c',
 		'tgsi_text.c',
 		'tgsi_transform.c',
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index af8dfcb493..89b98b37ab 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -283,6 +283,10 @@ if env['platform'] != 'winddk':
 			'x86-64/glapi_x86-64.S'
 		]
 	elif gcc and env['machine'] == 'ppc':
+		env.Append(CPPDEFINES = [
+			'USE_PPC_ASM', 
+			'USE_VMX_ASM', 
+		])
 		mesa_sources += [
 			'ppc/common_ppc.c',
 		]
-- 
cgit v1.2.3