72 files changed, 5819 insertions, 1259 deletions
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index b1ccfc0374..68508f24de 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -80,6 +80,10 @@ struct cso_context {
 };
 
 
+static void
+free_framebuffer_state(struct pipe_framebuffer_state *fb); /* forward declaration: cso_release_all() calls this ahead of its definition */
+
+
 static boolean delete_blend_state(struct cso_context *ctx, void *state)
 {
    struct cso_blend *cso = (struct cso_blend *)state;
@@ -252,6 +256,9 @@ void cso_release_all( struct cso_context *ctx )
       pipe_texture_reference(&ctx->textures_saved[i], NULL);
    }
 
+   free_framebuffer_state(&ctx->fb);
+   free_framebuffer_state(&ctx->fb_saved);
+
    if (ctx->cache) {
       cso_cache_delete( ctx->cache );
       ctx->cache = NULL;
@@ -784,6 +791,18 @@ copy_framebuffer_state(struct pipe_framebuffer_state *dst,
 }
 
 
+static void
+free_framebuffer_state(struct pipe_framebuffer_state *fb)
+{
+   uint i;
+   /* Point every cbufs[] slot, then zsbuf, at NULL via pipe_surface_reference() (presumably dropping the held surfaces). */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&fb->cbufs[i], NULL);
+   }
+   pipe_surface_reference(&fb->zsbuf, NULL);
+}
+
+
 enum pipe_error cso_set_framebuffer(struct cso_context *ctx,
                                     const struct pipe_framebuffer_state *fb)
 {
@@ -804,6 +823,7 @@ void cso_restore_framebuffer(struct cso_context *ctx)
    if (memcmp(&ctx->fb, &ctx->fb_saved, sizeof(ctx->fb))) {
       copy_framebuffer_state(&ctx->fb, &ctx->fb_saved);
       ctx->pipe->set_framebuffer_state(ctx->pipe, &ctx->fb);
+      free_framebuffer_state(&ctx->fb_saved);
    }
 }
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index 7f0044c5a7..4e7664f9bf 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -431,3 +431,9 @@ struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter
    --hash->data.d->size;
    return ret;
 }
+
+boolean cso_hash_contains(struct cso_hash *hash, unsigned key)
+{  /* true when the node lookup for "key" yields something other than hash->data.e */
+   struct cso_node **found = cso_hash_find_node(hash, key);
+   return (hash->data.e != *found);
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h
index 85f3e276c6..5891c325fa 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.h
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.h
@@ -44,6 +44,7 @@
 #ifndef CSO_HASH_H
 #define CSO_HASH_H
 
+#include "pipe/p_compiler.h"
 
 #ifdef	__cplusplus
 extern "C" {
@@ -95,6 +96,11 @@ struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash);
  */
 struct cso_hash_iter cso_hash_find(struct cso_hash *hash, unsigned key);
 
+/**
+ * Returns true if a value with the given key exists in the hash
+ */
+boolean   cso_hash_contains(struct cso_hash *hash, unsigned key);
+
 
 int       cso_hash_iter_is_null(struct cso_hash_iter iter);
 unsigned  cso_hash_iter_key(struct cso_hash_iter iter);
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
index f2e36a89e9..bdbf5a08ed 100644
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -40,6 +40,7 @@ C_SOURCES = \
 	draw_vs_aos_machine.c \
 	draw_vs_exec.c \
 	draw_vs_llvm.c \
+	draw_vs_ppc.c  \
 	draw_vs_sse.c 
 
 
diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript
index 544a04918b..5f05aa324a 100644
--- a/src/gallium/auxiliary/draw/SConscript
+++ b/src/gallium/auxiliary/draw/SConscript
@@ -38,6 +38,7 @@ draw = env.ConvenienceLibrary(
 		'draw_vs_aos_machine.c',
 		'draw_vs_exec.c',
 		'draw_vs_llvm.c',
+		'draw_vs_ppc.c',
 		'draw_vs_sse.c',
 		'draw_vs_varient.c'
 	])
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 78249054f2..b439bc4059 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -274,6 +274,14 @@ draw_enable_point_sprites(struct draw_context *draw, boolean enable)
 }
 
 
+void
+draw_set_force_passthrough( struct draw_context *draw, boolean enable )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); /* flush is issued before the flag changes */
+   draw->force_passthrough = enable; /* when set, draw_pt_arrays() adds no pipeline/cliptest/shade options */
+}
+
+
 /**
  * Ask the draw module for the location/slot of the given vertex attribute in
  * a post-transformed vertex.
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 0ab3681b64..3eeb453531 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -160,6 +160,9 @@ void draw_set_render( struct draw_context *draw,
 void draw_set_driver_clipping( struct draw_context *draw,
                                boolean bypass_clipping );
 
+void draw_set_force_passthrough( struct draw_context *draw, 
+                                 boolean enable );
+
 /*******************************************************************************
  * Draw pipeline 
  */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index c0cf4269db..9825e116c3 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -231,9 +231,9 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint prim )
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       unsigned output_format;
-      unsigned src_offset = (vbuf->vinfo->src_index[i] * 4 * sizeof(float) );
+      unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
-      switch (vbuf->vinfo->emit[i]) {
+      switch (vbuf->vinfo->attrib[i].emit) {
       case EMIT_4F:
 	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 	 emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 626a2e3e30..5d531146c5 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -163,12 +163,15 @@ struct draw_context
 
    struct {
       boolean bypass_clipping;
+      boolean bypass_vs;
    } driver;
 
    boolean flushing;         /**< debugging/sanity */
    boolean suspend_flushing; /**< internally set */
    boolean bypass_clipping;  /**< set if either api or driver bypass_clipping true */
 
+   boolean force_passthrough; /**< never clip or shade */
+
    /* pipe state that we need: */
    const struct pipe_rasterizer_state *rasterizer;
    struct pipe_viewport_state viewport;
@@ -193,7 +196,7 @@ struct draw_context
 
       const float (*aligned_constants)[4];
 
-      float (*aligned_constant_storage)[4];
+      const float (*aligned_constant_storage)[4];
       unsigned const_storage_size;
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 669c11c993..87ec6ae20c 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -69,26 +69,26 @@ draw_pt_arrays(struct draw_context *draw,
          return TRUE;
    }
 
-
-   if (!draw->render) {
-      opt |= PT_PIPELINE;
-   }
-
-   if (draw_need_pipeline(draw,
-                          draw->rasterizer,
-                          prim)) {
-      opt |= PT_PIPELINE;
-   }
-
-   if (!draw->bypass_clipping && !draw->pt.test_fse) {
-      opt |= PT_CLIPTEST;
+   if (!draw->force_passthrough) {
+      if (!draw->render) {
+         opt |= PT_PIPELINE;
+      }
+      
+      if (draw_need_pipeline(draw,
+                             draw->rasterizer,
+                             prim)) {
+         opt |= PT_PIPELINE;
+      }
+
+      if (!draw->bypass_clipping && !draw->pt.test_fse) {
+         opt |= PT_CLIPTEST;
+      }
+      
+      if (!draw->rasterizer->bypass_vs) {
+         opt |= PT_SHADE;
+      }
    }
-
-   if (!draw->rasterizer->bypass_vs) {
-      opt |= PT_SHADE;
-   }
-
-
+      
    if (opt == 0) 
       middle = draw->pt.middle.fetch_emit;
    else if (opt == PT_SHADE && !draw->pt.no_fse)
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index d4eca80588..d520b05869 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -84,11 +84,11 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       unsigned output_format;
-      unsigned src_offset = (vinfo->src_index[i] * 4 * sizeof(float) );
+      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
 
          
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_4F:
 	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 	 emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 5a4db6cfe5..3966ad48ba 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -121,7 +121,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
    memset(&key, 0, sizeof(key));
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->src_index[i]];
+      const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->attrib[i].src_index];
 
       unsigned emit_sz = 0;
       unsigned input_format = src->src_format;
@@ -129,7 +129,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
       unsigned input_offset = src->src_offset;
       unsigned output_format;
 
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_4F:
 	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 	 emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index 73fc70c1bc..f7e6a1a8ee 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -79,6 +79,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs;
    const struct vertex_info *vinfo;
    unsigned i;
+   unsigned nr_vbs = 0;
    
 
    if (!draw->render->set_primitive( draw->render, 
@@ -102,7 +103,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
 
    fse->key.viewport = !draw->identity_viewport;
    fse->key.clip = !draw->bypass_clipping;
-   fse->key.pad = 0;
+   fse->key.const_vbuffers = 0;
 
    memset(fse->key.element, 0, 
           fse->key.nr_elements * sizeof(fse->key.element[0]));
@@ -116,16 +117,23 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
        */
       fse->key.element[i].in.buffer = src->vertex_buffer_index;
       fse->key.element[i].in.offset = src->src_offset;
+      nr_vbs = MAX2(nr_vbs, src->vertex_buffer_index + 1);
    }
    
+   for (i = 0; i < 5 && i < nr_vbs; i++) {
+      if (draw->pt.vertex_buffer[i].pitch == 0)
+         fse->key.const_vbuffers |= (1<<i);
+   }
 
+   if (0) debug_printf("%s: lookup const_vbuffers: %x\n", __FUNCTION__, fse->key.const_vbuffers);
+   
    {
       unsigned dst_offset = 0;
 
       for (i = 0; i < vinfo->num_attribs; i++) {
          unsigned emit_sz = 0;
 
-         switch (vinfo->emit[i]) {
+         switch (vinfo->attrib[i].emit) {
          case EMIT_4F:
             emit_sz = 4 * sizeof(float);
             break;
@@ -153,8 +161,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
           * numbers, not to positions in the hw vertex description --
           * that's handled by the output_offset field.
           */
-         fse->key.element[i].out.format = vinfo->emit[i];
-         fse->key.element[i].out.vs_output = vinfo->src_index[i];
+         fse->key.element[i].out.format = vinfo->attrib[i].emit;
+         fse->key.element[i].out.vs_output = vinfo->attrib[i].src_index;
          fse->key.element[i].out.offset = dst_offset;
       
          dst_offset += emit_sz;
@@ -162,13 +170,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       }
    }
 
-
-   /* Would normally look up a vertex shader and peruse its list of
-    * varients somehow.  We omitted that step and put all the
-    * hardcoded "shaders" into an array.  We're just making the
-    * assumption that this happens to be a matching shader...  ie
-    * you're running isosurf, aren't you?
-    */
+   
    fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader, 
                                          &fse->key );
 
@@ -177,18 +179,17 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       return ;
    }
 
+   if (0) debug_printf("%s: found const_vbuffers: %x\n", __FUNCTION__, 
+                       fse->active->key.const_vbuffers);
+
    /* Now set buffer pointers:
     */
-   for (i = 0; i < num_vs_inputs; i++) {
-      unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index;
-
-      fse->active->set_input( fse->active, 
-                              i, 
-                              
-                              ((const ubyte *) draw->pt.user.vbuffer[buf] + 
-                               draw->pt.vertex_buffer[buf].buffer_offset),
-                              
-                              draw->pt.vertex_buffer[buf].pitch );
+   for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+      fse->active->set_buffer( fse->active, 
+                               i, 
+                               ((const ubyte *) draw->pt.user.vbuffer[i] + 
+                                draw->pt.vertex_buffer[i].buffer_offset),
+                              draw->pt.vertex_buffer[i].pitch );
    }
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c
index 1446f785c5..3214213e44 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.c
+++ b/src/gallium/auxiliary/draw/draw_vertex.c
@@ -49,7 +49,7 @@ draw_compute_vertex_size(struct vertex_info *vinfo)
 
    vinfo->size = 0;
    for (i = 0; i < vinfo->num_attribs; i++) {
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_OMIT:
          break;
       case EMIT_4UB:
@@ -81,8 +81,8 @@ draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
    unsigned i, j;
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      j = vinfo->src_index[i];
-      switch (vinfo->emit[i]) {
+      j = vinfo->attrib[i].src_index;
+      switch (vinfo->attrib[i].emit) {
       case EMIT_OMIT:
          debug_printf("EMIT_OMIT:");
          break;
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index 16c65c4317..a943607d7e 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -75,12 +75,41 @@ struct vertex_info
 {
    uint num_attribs;
    uint hwfmt[4];      /**< hardware format info for this format */
-   enum interp_mode interp_mode[PIPE_MAX_SHADER_INPUTS];
-   enum attrib_emit emit[PIPE_MAX_SHADER_INPUTS];   /**< EMIT_x */
-   uint src_index[PIPE_MAX_SHADER_INPUTS]; /**< map to post-xform attribs */
    uint size;          /**< total vertex size in dwords */
+   
+   /* Keep this small and at the end of the struct to allow quick
+    * memcmp() comparisons.
+    */
+   struct {
+      ubyte interp_mode:4;      /**< INTERP_x */
+      ubyte emit:4;             /**< EMIT_x */
+      ubyte src_index;          /**< map to post-xform attribs */
+   } attrib[PIPE_MAX_SHADER_INPUTS];
 };
 
+static INLINE int
+draw_vinfo_size( const struct vertex_info *a )
+{
+   const char *start = (const char *)a;   /* byte offset of attrib[num_attribs] from here = used size */
+   return ((const char *)&a->attrib[a->num_attribs] - start);
+}
+
+static INLINE int
+draw_vinfo_compare( const struct vertex_info *a,
+                    const struct vertex_info *b )
+{
+   /* memcmp() result over the bytes of "a" up to its last used attrib slot only. */
+   return memcmp( a, b, (unsigned) draw_vinfo_size( a ) );
+}
+
+static INLINE void
+draw_vinfo_copy( struct vertex_info *dst,
+                 const struct vertex_info *src )
+{
+   /* Copies just the used prefix of "src"; attrib slots of "dst" past it are left as they were. */
+   memcpy( dst, src, (unsigned) draw_vinfo_size( src ) );
+}
+
 
 
 /**
@@ -91,14 +120,15 @@ struct vertex_info
  */
 static INLINE uint
 draw_emit_vertex_attr(struct vertex_info *vinfo,
-                      enum attrib_emit emit, enum interp_mode interp,
+                      enum attrib_emit emit, 
+                      enum interp_mode interp, /* only used by softpipe??? */
                       uint src_index)
 {
    const uint n = vinfo->num_attribs;
    assert(n < PIPE_MAX_SHADER_INPUTS);
-   vinfo->emit[n] = emit;
-   vinfo->interp_mode[n] = interp;
-   vinfo->src_index[n] = src_index;
+   vinfo->attrib[n].emit = emit;
+   vinfo->attrib[n].interp_mode = interp;
+   vinfo->attrib[n].src_index = src_index;
    vinfo->num_attribs++;
    return n;
 }
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 34adbd49b0..7f305304ff 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -85,7 +85,10 @@ draw_create_vertex_shader(struct draw_context *draw,
    if (!vs) {
       vs = draw_create_vs_sse( draw, shader );
       if (!vs) {
-         vs = draw_create_vs_exec( draw, shader );
+         vs = draw_create_vs_ppc( draw, shader );
+         if (!vs) {
+            vs = draw_create_vs_exec( draw, shader );
+         }
       }
    }
 
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 45992d1986..89ae158751 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -64,7 +64,7 @@ struct draw_vs_varient_key {
    unsigned nr_outputs:8;
    unsigned viewport:1;
    unsigned clip:1;
-   unsigned pad:5;
+   unsigned const_vbuffers:5;
    struct draw_varient_element element[PIPE_MAX_ATTRIBS];
 };
 
@@ -76,7 +76,7 @@ struct draw_vs_varient {
 
    struct draw_vertex_shader *vs;
 
-   void (*set_input)( struct draw_vs_varient *,
+   void (*set_buffer)( struct draw_vs_varient *,
                       unsigned i,
                       const void *ptr,
                       unsigned stride );
@@ -158,6 +158,10 @@ draw_create_vs_sse(struct draw_context *draw,
 		   const struct pipe_shader_state *templ);
 
 struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+		   const struct pipe_shader_state *templ);
+
+struct draw_vertex_shader *
 draw_create_vs_llvm(struct draw_context *draw,
 		    const struct pipe_shader_state *templ);
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index a556477a76..87232865e2 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -92,9 +92,9 @@ struct x86_reg aos_get_x86( struct aos_compilation *cp,
          assert(which_reg == 1);
          offset = Offset(struct aos_machine, constants);
          break;
-      case X86_ATTRIBS:
+      case X86_BUFFERS:
          assert(which_reg == 0);
-         offset = Offset(struct aos_machine, attrib);
+         offset = Offset(struct aos_machine, buffer);
          break;
       default:
          assert(0);
@@ -196,6 +196,18 @@ static void spill( struct aos_compilation *cp, unsigned idx )
 }
 
 
+void aos_spill_all( struct aos_compilation *cp )
+{
+   unsigned i;
+   /* For cp->xmm[] slots 0..7: spill() each slot flagged dirty, then release every slot. */
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].dirty) 
+         spill(cp, i);
+      aos_release_xmm_reg(cp, i);
+   }
+}
+
+
 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
                                         struct x86_reg reg )
 {
@@ -1939,6 +1951,11 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
    save_fpu_state( &cp );
    set_fpu_round_nearest( &cp );
 
+   aos_init_inputs( &cp, linear );
+
+   cp.x86_reg[0] = 0;
+   cp.x86_reg[1] = 0;
+   
    /* Note address for loop jump 
     */
    label = x86_get_label(cp.func);
@@ -2018,13 +2035,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
 
       /* Incr index
        */   
-      if (linear) {
-         x86_inc(cp.func, cp.idx_EBX);
-      } 
-      else {
-         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
-      }
-
+      aos_incr_inputs( &cp, linear );
    }
    /* decr count, loop if not zero
     */
@@ -2065,15 +2076,13 @@ static void vaos_set_buffer( struct draw_vs_varient *varient,
                              unsigned stride )
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
-   unsigned i;
 
-   for (i = 0; i < vaos->base.key.nr_inputs; i++) {
-      if (vaos->base.key.element[i].in.buffer == buf) {
-         vaos->attrib[i].input_ptr = ((char *)ptr +
-                                      vaos->base.key.element[i].in.offset);
-         vaos->attrib[i].input_stride = stride;
-      }
+   if (buf < vaos->nr_vb) {
+      vaos->buffer[buf].base_ptr = (char *)ptr;
+      vaos->buffer[buf].stride = stride;
    }
+
+   if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
 }
 
 
@@ -2086,10 +2095,12 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
 
+   if (0) debug_printf("%s %d\n", __FUNCTION__, count);
+
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
-   machine->attrib = vaos->attrib;
+   machine->buffer = vaos->buffer;
 
    vaos->gen_run_elts( machine,
                        elts,
@@ -2105,10 +2116,13 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
 
+   if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, 
+                       vaos->base.key.const_vbuffers);
+
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
-   machine->attrib = vaos->attrib;
+   machine->buffer = vaos->buffer;
 
    vaos->gen_run_linear( machine,
                          start,
@@ -2127,7 +2141,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
 
-   FREE( vaos->attrib );
+   FREE( vaos->buffer );
 
    x86_release_func( &vaos->func[0] );
    x86_release_func( &vaos->func[1] );
@@ -2140,6 +2154,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
                                                  const struct draw_vs_varient_key *key )
 {
+   unsigned i;
    struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
 
    if (!vaos)
@@ -2147,17 +2162,22 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    
    vaos->base.key = *key;
    vaos->base.vs = vs;
-   vaos->base.set_input = vaos_set_buffer;
+   vaos->base.set_buffer = vaos_set_buffer;
    vaos->base.destroy = vaos_destroy;
    vaos->base.run_linear = vaos_run_linear;
    vaos->base.run_elts = vaos_run_elts;
 
    vaos->draw = vs->draw;
 
-   vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );
-   if (!vaos->attrib)
+   for (i = 0; i < key->nr_inputs; i++) 
+      vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
+
+   vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
+   if (!vaos->buffer)
       goto fail;
 
+   debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
+
 #if 0
    tgsi_dump(vs->state.tokens, 0);
 #endif
@@ -2179,8 +2199,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    return &vaos->base;
 
  fail:
-   if (vaos && vaos->attrib)
-      FREE(vaos->attrib);
+   if (vaos && vaos->buffer)
+      FREE(vaos->buffer);
 
    if (vaos)
       x86_release_func( &vaos->func[0] );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 7fe6f79db0..264387517b 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -87,9 +87,10 @@ struct lit_info {
 #define MAX_SHINE_TAB    4
 #define MAX_LIT_INFO     16
 
-struct aos_attrib {
-   const void *input_ptr;
-   unsigned input_stride;
+struct aos_buffer {
+   const void *base_ptr;
+   unsigned stride;
+   void *ptr;                   /* updated per vertex */
 };
 
 
@@ -123,7 +124,7 @@ struct aos_machine {
    const float (*immediates)[4];     /* points to shader data */
    const float (*constants)[4];      /* points to draw data */
 
-   const struct aos_attrib *attrib; /* points to ? */
+   const struct aos_buffer *buffer; /* points to ? */
 };
 
 
@@ -175,12 +176,15 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp,
                         unsigned idx,
                         unsigned dirty );
 
+void aos_spill_all( struct aos_compilation *cp );
+
 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, 
                                    unsigned file,
                                    unsigned idx );
 
-boolean aos_fetch_inputs( struct aos_compilation *cp,
-                          boolean linear );
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear );
 
 boolean aos_emit_outputs( struct aos_compilation *cp );
 
@@ -210,7 +214,7 @@ do {                                                                    \
 #define X86_NULL       0
 #define X86_IMMEDIATES 1
 #define X86_CONSTANTS  2
-#define X86_ATTRIBS    3
+#define X86_BUFFERS    3
 
 struct x86_reg aos_get_x86( struct aos_compilation *cp,
                             unsigned which_reg,
@@ -232,7 +236,8 @@ struct draw_vs_varient_aos_sse {
    struct draw_vs_varient base;
    struct draw_context *draw;
 
-   struct aos_attrib *attrib;
+   struct aos_buffer *buffer;
+   unsigned nr_vb;
 
    vaos_run_linear_func gen_run_linear;
    vaos_run_elts_func gen_run_elts;
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index 26297c74f8..39f75b50b7 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -54,6 +54,7 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
 				 struct x86_reg data,
 				 struct x86_reg src_ptr )
 {
+#if 1
    sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
    /* data = z ? ? ? */
    sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
@@ -62,6 +63,16 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
    /* data = ? 0 z 1 */
    sse_movlps(cp->func, data, src_ptr);
    /* data = x y z 1 */
+#else
+   sse_movups(cp->func, data, src_ptr);
+   /* data = x y z ? */
+   sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) );
+   /* data = ? x y z */
+   sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) );
+   /* data = 1 x y z */
+   sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) );
+   /* data = x y z 1 */
+#endif
 }
 
 static void emit_load_R32G32( struct aos_compilation *cp, 
@@ -95,28 +106,6 @@ static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
 
 
 
-static void get_src_ptr( struct aos_compilation *cp,
-                         struct x86_reg src,
-                         struct x86_reg elt,
-                         unsigned a )
-{
-   struct x86_reg attrib = x86_make_disp(aos_get_x86( cp, 0, X86_ATTRIBS ), 
-                                         a * sizeof(struct aos_attrib));
-
-   struct x86_reg input_ptr = x86_make_disp(attrib, 
-                                            Offset(struct aos_attrib, input_ptr));
-
-   struct x86_reg input_stride = x86_make_disp(attrib, 
-                                               Offset(struct aos_attrib, input_stride));
-
-   /* Calculate pointer to current attrib:
-    */
-   x86_mov(cp->func, src, input_stride);
-   x86_imul(cp->func, src, elt);
-   x86_add(cp->func, src, input_ptr);
-}
-
-
 /* Extended swizzles?  Maybe later.
  */  
 static void emit_swizzle( struct aos_compilation *cp,
@@ -128,22 +117,60 @@ static void emit_swizzle( struct aos_compilation *cp,
 }
 
 
+/* Emit x86 code leaving in "ptr" the address of the current element of vertex buffer buf_idx; always returns TRUE. */
+static boolean get_buffer_ptr( struct aos_compilation *cp,
+                               boolean linear,
+                               unsigned buf_idx,
+                               struct x86_reg elt,
+                               struct x86_reg ptr)
+{
+   struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
+                                      buf_idx * sizeof(struct aos_buffer));
+
+   struct x86_reg buf_stride = x86_make_disp(buf, 
+                                             Offset(struct aos_buffer, stride));
+   if (linear) {
+      struct x86_reg buf_ptr = x86_make_disp(buf, 
+                                             Offset(struct aos_buffer, ptr));
+      /* Assuming rtasm's (dst, src) operand order: ptr = buf->ptr, then
+       * elt = buf->stride + ptr is written back to buf->ptr, so the saved
+       * pointer advances one stride each time the emitted code runs.  "elt"
+       * is scratch here; buffer 0 also prefetches 192 bytes past the new pointer. */
+      x86_mov(cp->func, ptr, buf_ptr);
+      x86_mov(cp->func, elt, buf_stride);
+      x86_add(cp->func, elt, ptr);
+      if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192));
+      x86_mov(cp->func, buf_ptr, elt);
+   }
+   else {
+      struct x86_reg buf_base_ptr = x86_make_disp(buf, 
+                                                  Offset(struct aos_buffer, base_ptr));
+      /* Assuming rtasm's (dst, src) operand order:
+       *    ptr = buf->stride * elt + buf->base_ptr
+       * i.e. "elt" supplies the element index to address.
+       */
+      x86_mov(cp->func, ptr, buf_stride);
+      x86_imul(cp->func, ptr, elt);
+      x86_add(cp->func, ptr, buf_base_ptr);
+   }
+
+   cp->insn_counter++;
+
+   return TRUE;
+}
+
+
 static boolean load_input( struct aos_compilation *cp,
                            unsigned idx,
-                           boolean linear )
+                           struct x86_reg bufptr )
 {
    unsigned format = cp->vaos->base.key.element[idx].in.format;
-   struct x86_reg src = cp->tmp_EAX;
+   unsigned offset = cp->vaos->base.key.element[idx].in.offset;
    struct x86_reg dataXMM = aos_get_xmm_reg(cp);
 
    /* Figure out source pointer address:
     */
-   get_src_ptr(cp, 
-               src, 
-               linear ? cp->idx_EBX : x86_deref(cp->idx_EBX),
-               idx);
-
-   src = x86_deref(src);
+   struct x86_reg src = x86_make_disp(bufptr, offset);
 
    aos_adopt_xmm_reg( cp,
                       dataXMM,
@@ -179,20 +206,128 @@ static boolean load_input( struct aos_compilation *cp,
    return TRUE;
 }
 
-
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+static boolean load_inputs( struct aos_compilation *cp,
+                            unsigned buffer,       /* only elements with in.buffer == buffer are loaded */
+                            struct x86_reg ptr )   /* operand handed unchanged to load_input() */
 {
    unsigned i;
-   
+
    for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) {
-      if (!load_input( cp, i, linear ))
-         return FALSE;
-      cp->insn_counter++;
+      if (cp->vaos->base.key.element[i].in.buffer == buffer) {
+
+         if (!load_input( cp, i, ptr ))
+            return FALSE;   /* stop at the first element that fails to load */
+
+         cp->insn_counter++;
+      }
+   }
+   
+   return TRUE;
+}
+
+/* Emit the input setup code for every vertex buffer.  Inputs of constant
+ * buffers are loaded and spilled here; in the linear case the address of
+ * the current vertex is computed and kept in place of the index.  Returns
+ * FALSE if loading the inputs of a constant buffer fails.
+ */
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
+{
+   unsigned i;
+   for (i = 0; i < cp->vaos->nr_vb; i++) {
+      struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
+                                         i * sizeof(struct aos_buffer));
+      struct x86_reg buf_base_ptr = x86_make_disp(buf,
+                                                  Offset(struct aos_buffer, base_ptr));
+
+      if (cp->vaos->base.key.const_vbuffers & (1<<i)) {
+         struct x86_reg ptr = cp->tmp_EAX;
+
+         x86_mov(cp->func, ptr, buf_base_ptr);
+         /* Load all inputs for this constant vertex buffer; a failure
+          * must abort code generation instead of being dropped.
+          */
+         if (!load_inputs( cp, i, x86_deref(ptr) ))
+            return FALSE;
+
+         /* Then just force them out to aos_machine.input[]
+          */
+         aos_spill_all( cp );
+      }
+      else if (linear) {
+         struct x86_reg elt = cp->idx_EBX;
+         struct x86_reg ptr = cp->tmp_EAX;
+         struct x86_reg buf_stride = x86_make_disp(buf,
+                                                   Offset(struct aos_buffer, stride));
+         struct x86_reg buf_ptr = x86_make_disp(buf,
+                                                Offset(struct aos_buffer, ptr));
+
+         /* Calculate pointer to current attrib: base_ptr + stride * elt
+          */
+         x86_mov(cp->func, ptr, buf_stride);
+         x86_imul(cp->func, ptr, elt);
+         x86_add(cp->func, ptr, buf_base_ptr);
+
+         /* In the linear case, keep the buffer pointer instead of the
+          * index number: in the index register when there is a single
+          * buffer, otherwise in that buffer's ptr slot.
+          */
+         if (cp->vaos->nr_vb == 1)
+            x86_mov( cp->func, elt, ptr );
+         else
+            x86_mov( cp->func, buf_ptr, ptr );
+
+         cp->insn_counter++;
+      }
+   }
+
+   return TRUE;
+}
+
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+{
+   unsigned j;
+   /* Emit the input loads for each vertex buffer; FALSE on any failure. */
+   for (j = 0; j < cp->vaos->nr_vb; j++) {
+      if (cp->vaos->base.key.const_vbuffers & (1<<j)) {
+         /* just retrieve pre-transformed input */
+      }
+      else if (linear && cp->vaos->nr_vb == 1) {
+         /* single linear buffer: idx_EBX itself is used as the address */
+         if (!load_inputs( cp, 0, cp->idx_EBX ))
+            return FALSE;
+      }
+      else {
+         struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
+         struct x86_reg ptr = cp->tmp_EAX;
+         if (!get_buffer_ptr( cp, linear, j, elt, ptr ))
+            return FALSE;
+         if (!load_inputs( cp, j, ptr ))
+            return FALSE;
+      }
    }
 
    return TRUE;
 }
 
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
+{
+   /* Non-linear path: step idx_EBX forward by 4 with a single lea. */
+   if (!linear) {
+      x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
+      return TRUE;
+   }
+   /* Linear path with a single buffer: add the stride of buffer 0 to
+    * idx_EBX and prefetch 192 bytes past the new value.  With several
+    * buffers there is nothing to emit here.
+    */
+   if (cp->vaos->nr_vb == 1) {
+      struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
+                                            Offset(struct aos_buffer, stride));
+      x86_add(cp->func, cp->idx_EBX, stride);
+      sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192));
+   }
+   return TRUE;
+}
 
 
 
@@ -306,7 +441,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp )
 
       if (data.file != file_XMM) {
          struct x86_reg tmp = aos_get_xmm_reg( cp );
-         sse_movups(cp->func, tmp, data);
+         sse_movaps(cp->func, tmp, data);
          data = tmp;
       }
       
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 44563803f9..13d4fcfdbf 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -62,12 +62,15 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
 {
    struct exec_vertex_shader *evs = exec_vertex_shader(shader);
 
-   /* specify the vertex program to interpret/execute */
-   tgsi_exec_machine_bind_shader(evs->machine,
-				 shader->state.tokens,
-				 PIPE_MAX_SAMPLERS,
-				 NULL /*samplers*/ );
-
+   /* Specify the vertex program to interpret/execute.
+    * Avoid rebinding when possible.
+    */
+   if (evs->machine->Tokens != shader->state.tokens) {
+      tgsi_exec_machine_bind_shader(evs->machine,
+                                    shader->state.tokens,
+                                    PIPE_MAX_SAMPLERS,
+                                    NULL /*samplers*/ );
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 2ce30b9a02..727977bc3a 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -32,6 +32,7 @@
   *   Brian Paul
   */
 
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_private.h"
 #include "draw_context.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
new file mode 100644
index 0000000000..8eff6d4fda
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -0,0 +1,274 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_config.h"
+
+#include "draw_vs.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+#include "rtasm/rtasm_cpu.h"
+#include "rtasm/rtasm_ppc.h"
+#include "tgsi/tgsi_ppc.h"
+#include "tgsi/tgsi_parse.h"
+
+
+
+typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
+                                             float (*outputs)[4][4], /* indexed [attr][chan][vert] by the caller */
+                                             float (*temps)[4][4],
+                                             float (*immeds)[4][4],
+                                             float (*consts)[4],
+                                             const float *builtins); /* ppc_builtin_constants at the call site */
+
+#if 0
+   const struct tgsi_exec_vector *input,
+   struct tgsi_exec_vector *output,
+   float (*constant)[4],        /* 3 */
+   struct tgsi_exec_vector *temporary, /* 4 */
+   float (*immediates)[4],      /* 5 */
+   const float (*aos_input)[4], /* 6 */
+   uint num_inputs,             /* 7 */
+   uint input_stride,           /* 8 */
+   float (*aos_output)[4],      /* 9 */
+   uint num_outputs,            /* 10 */
+   uint output_stride );        /* 11 */
+#endif
+
+struct draw_ppc_vertex_shader {
+   struct draw_vertex_shader base;
+   struct ppc_function ppc_program;   /* filled in by tgsi_emit_ppc() */
+
+   codegen_function func;             /* entry point returned by ppc_get_func() */
+   
+   struct tgsi_exec_machine *machine; /* set to &draw->vs.machine at creation */
+};
+
+
+static void
+vs_ppc_prepare( struct draw_vertex_shader *base,
+		struct draw_context *draw )
+{   /* no-op: neither argument is used and nothing is set up here */
+}
+
+
+
+/* Run the generated PPC shader over 'count' vertices, MAX_VERTICES at a
+ * time: each group is transposed from AoS 'input' to SoA, passed to
+ * shader->func, and transposed back into 'output'.  Strides are in bytes.
+ */
+static void
+vs_ppc_run_linear( struct draw_vertex_shader *base,
+		   const float (*input)[4],
+		   float (*output)[4],
+		   const float (*constants)[4],
+		   unsigned count,
+		   unsigned input_stride,
+		   unsigned output_stride )
+{
+   struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
+   struct tgsi_exec_machine *machine = shader->machine; /* only referenced by the disabled #if 0 call below */
+   unsigned int i;
+
+#define MAX_VERTICES 4
+
+   /* loop over verts, up to MAX_VERTICES per iteration */
+   for (i = 0; i < count; i += MAX_VERTICES) {
+      const uint max_vertices = MIN2(MAX_VERTICES, count - i);
+      float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB;
+      float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB;
+      float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB;
+      uint attr;
+
+      /* convert (up to) four input verts to SoA format: [attr][chan][vert] */
+      for (attr = 0; attr < base->info.num_inputs; attr++) {
+         const float *vIn = (const float *) input;
+         uint vert;
+         for (vert = 0; vert < max_vertices; vert++) {
+#if 0
+            if (attr==0)
+               printf("Input v%d a%d: %f %f %f %f\n",
+                      vert, attr, vIn[0], vIn[1], vIn[2], vIn[3]);
+#endif
+            inputs_soa[attr][0][vert] = vIn[attr * 4 + 0];
+            inputs_soa[attr][1][vert] = vIn[attr * 4 + 1];
+            inputs_soa[attr][2][vert] = vIn[attr * 4 + 2];
+            inputs_soa[attr][3][vert] = vIn[attr * 4 + 3];
+            vIn += input_stride / 4; /* stride is in bytes, vIn counts floats */
+         }
+      }
+
+      /* run compiled shader on this group (the #else branch is the live one)
+       */
+#if 0
+      shader->func(machine->Inputs,
+		   machine->Outputs,
+		   (float (*)[4])constants,
+		   machine->Temps,
+		   (float (*)[4])shader->base.immediates,
+                   input,
+                   base->info.num_inputs,
+                   input_stride,
+                   output,
+                   base->info.num_outputs,
+                   output_stride );
+#else
+      shader->func(inputs_soa, outputs_soa, temps_soa,
+		   (float (*)[4][4]) shader->base.immediates,
+		   (float (*)[4]) constants,
+                   ppc_builtin_constants);
+
+      /*output[0][0] = input[0][0] * 0.5;*/
+#endif
+
+      /* convert (up to) four output verts from SoA back to AoS format */
+      for (attr = 0; attr < base->info.num_outputs; attr++) {
+         float *vOut = (float *) output;
+         uint vert;
+         for (vert = 0; vert < max_vertices; vert++) {
+            vOut[attr * 4 + 0] = outputs_soa[attr][0][vert];
+            vOut[attr * 4 + 1] = outputs_soa[attr][1][vert];
+            vOut[attr * 4 + 2] = outputs_soa[attr][2][vert];
+            vOut[attr * 4 + 3] = outputs_soa[attr][3][vert];
+#if 0
+            if (attr==0)
+               printf("Output v%d a%d: %f %f %f %f\n",
+                      vert, attr, vOut[0], vOut[1], vOut[2], vOut[3]);
+#endif
+            vOut += output_stride / 4; /* stride is in bytes, vOut counts floats */
+         }
+      }
+
+      /* advance both pointers past the verts just processed (byte arithmetic) */
+      input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
+      output = (float (*)[4])((char *)output + output_stride * max_vertices);
+   }
+}
+
+
+
+
+static void
+vs_ppc_delete( struct draw_vertex_shader *base )
+{
+   struct draw_ppc_vertex_shader *vs = (struct draw_ppc_vertex_shader *) base;
+
+   /* Free what the shader object points at first, then the object itself. */
+   FREE( (void *) vs->base.state.tokens );
+   align_free( (void *) vs->base.immediates );
+   ppc_release_func( &vs->ppc_program );
+
+   FREE( vs );
+}
+
+
+/**
+ * Create a vertex shader whose run_linear hook executes PPC code generated
+ * from the TGSI tokens in templ.  Returns NULL if allocation or code
+ * generation fails; everything allocated up to that point is released.
+ */
+struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+                          const struct pipe_shader_state *templ)
+{
+   struct draw_ppc_vertex_shader *vs;
+
+   vs = CALLOC_STRUCT( draw_ppc_vertex_shader );
+   if (vs == NULL)
+      return NULL;
+
+   /* we make a private copy of the tokens */
+   vs->base.state.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!vs->base.state.tokens)
+      goto fail_vs;
+
+   vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * 4 *
+                                      sizeof(float), 16);
+   if (!vs->base.immediates)
+      goto fail_tokens;
+
+   tgsi_scan_shader(templ->tokens, &vs->base.info);
+
+   vs->base.draw = draw;
+   vs->base.create_varient = draw_vs_varient_generic;
+   vs->base.prepare = vs_ppc_prepare;
+   vs->base.run_linear = vs_ppc_run_linear;
+   vs->base.delete = vs_ppc_delete;
+   vs->machine = &draw->vs.machine;
+
+   ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */
+
+   if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
+                        &vs->ppc_program,
+                        (float (*)[4])vs->base.immediates,
+                        TRUE ))
+      goto fail_func;
+
+   vs->func = (codegen_function) ppc_get_func( &vs->ppc_program );
+   if (!vs->func)
+      goto fail_func;
+
+   return &vs->base;
+
+   /* Unwind in reverse order of acquisition. */
+fail_func:
+   ppc_release_func( &vs->ppc_program );
+   align_free( (void *) vs->base.immediates );
+fail_tokens:
+   FREE( (void *) vs->base.state.tokens );
+fail_vs:
+   FREE(vs);
+   return NULL;
+}
+
+
+
+#else /* PIPE_ARCH_PPC */
+
+
+struct draw_vertex_shader *
+draw_create_vs_ppc( struct draw_context *draw,
+		    const struct pipe_shader_state *templ )
+{
+   return (void *) 0;   /* built without PIPE_ARCH_PPC: never creates a shader */
+}
+
+
+#endif /* PIPE_ARCH_PPC */
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 0efabd9de8..b11ae31662 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -37,7 +37,7 @@
 
 #include "draw_vs.h"
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
 
 #include "pipe/p_shader_tokens.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index 4daf05dae7..7ee567d478 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -64,10 +64,10 @@ struct draw_vs_varient_generic {
 
 
 
-static void vsvg_set_input( struct draw_vs_varient *varient,
-                            unsigned buffer,
-                            const void *ptr,
-                            unsigned stride )
+static void vsvg_set_buffer( struct draw_vs_varient *varient,
+                             unsigned buffer,
+                             const void *ptr,
+                             unsigned stride )
 {
    struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
 
@@ -265,7 +265,7 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
 
    vsvg->base.key = *key;
    vsvg->base.vs = vs;
-   vsvg->base.set_input     = vsvg_set_input;
+   vsvg->base.set_buffer    = vsvg_set_buffer;
    vsvg->base.run_elts      = vsvg_run_elts;
    vsvg->base.run_linear    = vsvg_run_linear;
    vsvg->base.destroy       = vsvg_destroy;
diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
index 0fc5c4ec5c..fcc5c05794 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
@@ -1,140 +1,140 @@
 static const unsigned char llvm_builtins_data[] = {
-0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x29,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
+0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
 0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39,
 0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02,
 0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24,
 0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64,
-0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x02,0x00,0x00,0x00,0x0b,0x04,0x00,0x0c,
-0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x32,0x22,0x48,0x09,
-0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,0xc6,0x05,0x42,0x52,
-0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,0x29,0x80,0x21,0x00,
-0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,0x80,0x50,0x2b,0x03,
-0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,0x14,0x01,0x80,0x11,
-0x80,0x22,0x88,0x00,0x13,0xa2,0x74,0xb0,0x03,0x3c,0xb0,0x83,0x36,0x80,0x87,0x71,
-0x68,0x03,0x76,0x48,0x07,0x77,0xa8,0x07,0x7c,0x68,0x83,0x73,0x70,0x87,0x7a,0xd8,
-0x70,0x0f,0xe5,0xd0,0x06,0xf0,0xa0,0x07,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x71,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,
-0xe9,0x80,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,0x71,0x60,0x07,0x7a,
-0x10,0x07,0x76,0xa0,0x07,0x71,0x60,0x07,0x6d,0x90,0x0e,0x73,0x20,0x07,0x7a,0x30,
-0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,
-0x72,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0e,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,
-0xa0,0x07,0x73,0x20,0x07,0x6d,0x60,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x76,0x40,0x07,0x6d,0x60,0x0f,0x76,0x40,0x07,0x7a,0x60,0x07,0x74,0xa0,0x07,
-0x76,0x40,0x07,0x6d,0x60,0x0f,0x71,0x20,0x07,0x78,0xa0,0x07,0x71,0x20,0x07,0x78,
-0xa0,0x07,0x71,0x20,0x07,0x78,0xd0,0x06,0xe1,0x00,0x07,0x7a,0x00,0x07,0x7a,0x60,
-0x07,0x74,0xd0,0x06,0xe6,0x80,0x07,0x70,0xa0,0x07,0x71,0x20,0x07,0x78,0xa0,0x07,
-0x71,0x20,0x07,0x78,0xa0,0xf3,0x40,0x88,0x04,0x32,0x32,0x02,0x04,0x20,0x76,0x46,
-0xfc,0x6c,0x48,0x92,0x00,0x40,0x00,0x00,0x00,0x00,0x0c,0x49,0x12,0x20,0x00,0x00,
-0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x01,0x00,0x00,0x00,0x30,0x24,0x59,0x00,0x20,
-0x08,0x00,0x00,0x00,0x86,0x24,0x0a,0x00,0x04,0x00,0x00,0x00,0xc0,0x90,0x84,0x01,
-0x02,0x00,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
-0x05,0x00,0x02,0x00,0x00,0x00,0x60,0x48,0x72,0x00,0x01,0x00,0x00,0x00,0x00,0x0c,
-0x49,0x14,0x00,0x08,0x00,0x00,0x00,0x80,0x21,0x49,0x01,0x00,0x41,0x00,0x00,0x00,
-0x90,0x05,0x02,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
+0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x03,0x00,0x00,0x00,0x0b,0x84,0xff,0xff,
+0xff,0xff,0x1f,0xc0,0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,
+0x32,0x22,0x48,0x09,0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,
+0xc6,0x05,0x42,0x52,0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,
+0x29,0x80,0x21,0x00,0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,
+0x80,0x50,0x2b,0x03,0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,
+0x14,0x01,0x80,0x11,0x80,0x22,0x88,0x00,0x13,0x30,0x7c,0xc0,0x03,0x3b,0xf8,0x05,
+0x3b,0xa0,0x83,0x36,0xa8,0x07,0x77,0x58,0x07,0x77,0x78,0x87,0x7b,0x70,0x87,0x36,
+0x60,0x87,0x74,0x70,0x87,0x7a,0xc0,0x87,0x36,0x38,0x07,0x77,0xa8,0x87,0x0d,0xf7,
+0x50,0x0e,0x6d,0x00,0x0f,0x7a,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xe9,0x10,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,
+0x78,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,0xe9,0x10,0x07,0x76,0xa0,0x07,0x71,
+0x60,0x07,0x7a,0x10,0x07,0x76,0xd0,0x06,0xe9,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,
+0x07,0x7a,0x30,0x07,0x72,0xd0,0x06,0xe9,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,
+0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x7a,
+0x30,0x07,0x72,0xd0,0x06,0xe6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xf6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,0x07,
+0x74,0xd0,0x06,0xf6,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,
+0x10,0x07,0x72,0x80,0x07,0x6d,0x10,0x0e,0x70,0xa0,0x07,0x70,0xa0,0x07,0x76,0x40,
+0x07,0x6d,0x60,0x0e,0x78,0x00,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,
+0x72,0x80,0x07,0x3a,0x0f,0x84,0x48,0x20,0x23,0x24,0x40,0x00,0x62,0x67,0x88,0x9f,
+0x19,0x92,0x24,0x00,0x10,0x04,0x00,0x00,0x00,0x43,0x92,0x04,0x08,0x00,0x00,0x00,
+0x00,0x60,0x48,0xa2,0x00,0x40,0x10,0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x02,
+0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x41,0x00,0x00,0x00,0x30,0x24,0x61,0x80,0x00,
+0x00,0x00,0x00,0x00,0x86,0x24,0x07,0x10,0x00,0x00,0x00,0x00,0xc0,0x90,0x44,0x01,
+0x80,0x20,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
+0x05,0x00,0x82,0x00,0x00,0x00,0x60,0x48,0x52,0x00,0x40,0x10,0x00,0x00,0x00,0x64,
+0x81,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
 0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x8a,0x8a,0x59,0x8b,0x43,0x50,0xd2,0x09,0x02,
 0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,0x64,0x56,0x28,0x66,
 0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,0xa8,0x64,0x06,0x95,
 0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,0x04,0x00,0x00,0x00,
 0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
 0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,0x10,0x84,0xc3,0x60,
-0x04,0x00,0x00,0x00,0x93,0x0c,0xce,0x43,0x4c,0x31,0x3c,0x8e,0x34,0xc9,0x30,0x41,
-0xc2,0x14,0x03,0x34,0x51,0x93,0x0c,0x4d,0x44,0x4c,0x31,0x44,0x8d,0x35,0x56,0x01,
-0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0x46,0x41,0x08,0xcc,
-0x73,0x9b,0x05,0x21,0x30,0xcf,0x6e,0x18,0x84,0x00,0x2c,0x8b,0x35,0x04,0x80,0x39,
-0x04,0x81,0x5d,0x20,0x80,0x0f,0x0c,0x43,0xe4,0xd3,0x36,0x81,0x04,0x3e,0x30,0x0c,
-0x91,0x4f,0x5b,0x05,0x12,0xf8,0xc0,0x30,0x44,0x7e,0x7d,0x00,0x05,0xd1,0x4c,0x11,
-0x66,0x12,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0xc3,0x0d,0xce,0x43,0x4c,0x37,0x3c,0x8e,0x34,0xdc,0x30,0x41,
+0xc2,0x74,0x03,0x34,0x51,0xc3,0x0d,0x4d,0x44,0x4c,0x37,0x44,0x8d,0x35,0x56,0x01,
+0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0xd6,0x10,0x00,0xe6,
+0x10,0x04,0x76,0x81,0x00,0x3e,0x30,0x0c,0x91,0x4f,0x1b,0x05,0x21,0x30,0x8f,0x6d,
+0x13,0x48,0xe0,0x03,0xc3,0x10,0xf9,0xb4,0x55,0x20,0x81,0x0f,0x0c,0x43,0xe4,0xd7,
+0x66,0x41,0x08,0xcc,0xa3,0x1f,0x40,0x41,0x34,0x53,0x84,0x99,0xc4,0x20,0x30,0x8f,
+0x61,0x10,0x02,0xb0,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x27,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
 0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,0x55,0xd6,0x1c,0x84,
 0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,0x41,0xfc,0x03,0x00,
-0x63,0x08,0x0d,0x34,0xc9,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
+0x63,0x08,0x0d,0x34,0xdc,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
 0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,0x6d,0x73,0x0c,0x19,
-0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x13,0x00,0x00,0x00,0x17,0x60,0x20,0xc5,
-0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,0x14,0x13,0xf3,0xd4,0xb8,0x69,
-0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,0xba,0x35,0x0c,0x13,0xf3,0x9c,
-0x80,0xe4,0x36,0x48,0x81,0x10,0xc3,0x4a,0x4c,0x54,0xd4,0x6c,0x8b,0x23,0x28,0x76,
-0x41,0x4c,0xcc,0xa3,0x1b,0x07,0x21,0x00,0xcb,0x72,0x00,0x05,0xd1,0x4c,0x11,0x66,
-0x18,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,0x11,0x00,0x00,0x00,
-0x63,0x08,0x4d,0x64,0x16,0xc1,0x49,0x86,0xab,0x22,0x66,0x19,0x02,0x01,0x1b,0x43,
-0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,0x0a,0x20,0x0b,0x34,
-0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x24,0x83,0x57,0x11,0xb3,
-0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,0x48,0xb3,0x04,0xc6,
-0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,0x63,0x08,0xcd,0x64,
-0x64,0x40,0x70,0x92,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,0x60,0x0c,0xc1,0x99,
-0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,0x33,0x38,0xd0,0x00,
-0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,0xe0,0x24,0x03,0x1b,
-0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,0x30,0x63,0x08,0x0f,
-0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,0xa0,0x06,0x70,0x00,
-0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x76,0x52,0x4c,0xcc,
-0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,0xc6,0x50,0x8a,0x89,
-0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,0x79,0x68,0x73,0x20,
-0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,0x9e,0xdb,0x32,0x88,
-0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,0xa7,0xb7,0x95,0x62,
-0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,0x34,0x35,0x56,0x62,
-0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,0x2c,0x82,0xd3,0x0c,
-0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,0x24,0x01,0x63,0xec,
-0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,0xfc,0xc4,0xd0,0x90,
-0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,0xc1,0x71,0x7b,0x29,
-0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,0x32,0xf6,0xe6,0x46,
-0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,0x4e,0x33,0x58,0x47,
-0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,0x33,0xe1,0xbc,0xa5,
-0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,0x3a,0x40,0xc6,0xde,
-0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,0xed,0x82,0x10,0x9c,
-0xa6,0xba,0x81,0x44,0x70,0x9a,0xc1,0x17,0x9c,0x66,0x32,0x93,0x42,0x60,0x1e,0x7b,
-0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,0xa7,0x6d,0xa4,0x98,
-0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,0x9c,0x66,0xc0,0x7b,
-0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,0x8b,0xe0,0x34,0x83,
-0x2f,0x38,0xcd,0x64,0xd3,0xe6,0x61,0x08,0x4e,0x53,0xd5,0xf6,0x01,0x14,0x44,0x33,
-0x45,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
-0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,0x1e,0xe1,0x19,0xc6,
-0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,0x15,0xc1,0x31,0x84,
-0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,0x71,0x6c,0x23,0x38,
-0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,0x35,0xc7,0x20,0x79,
-0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,0x04,0x10,0x20,0xd5,
-0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,0x30,0x06,0x64,0xe0,
-0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,0x63,0x21,0x40,0x70,
-0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,0x25,0x00,0x00,0x00,
-0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,0x05,0x31,0x31,0xcf,
-0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,0x41,0x08,0xc0,0xb2,
-0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,0x9b,0x8a,0x21,0x00,
-0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,0x82,0xd3,0x54,0xb7,
-0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,0x9d,0x9b,0x87,0x21,
-0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,0x10,0x82,0xd3,0x54,
-0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,0xb4,0x8d,0x14,0x13,
-0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,0x00,0x2c,0x8b,0xcd,
-0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,0x00,0x05,0xd1,0x4c,
-0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0x4a,0x60,0x04,
-0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0x48,
-0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,0x44,0x39,0x58,0x85,
-0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x16,0x41,0x4c,0xcc,0x63,0xdb,0x04,0x31,
-0x31,0x4f,0x6e,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x56,0x41,0x4c,
-0xcc,0xd3,0x1b,0x45,0x21,0x00,0xcb,0xb2,0x9b,0x04,0x21,0x00,0xcb,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,
-0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,0x34,0xc7,0x20,0x51,
-0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,0x94,0x83,0x58,0x38,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x76,0x51,
-0x4c,0xcc,0x53,0xdb,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,0x31,0x8f,0x6e,0x0d,
-0x43,0x05,0x2c,0x66,0x41,0x4c,0xcc,0xd3,0x1f,0x40,0x41,0x34,0x53,0x84,0x19,0x05,
-0x21,0x00,0xcb,0x02,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0xa0,0x04,
-0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,0xc9,0x30,0x49,0xc4,
-0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xc9,0x50,0x49,0xc4,0x2c,0x03,0x21,0x58,
-0x63,0x08,0x4d,0x34,0xc9,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,0x63,0x08,0x8d,0x33,
-0xc9,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,0x1a,0x00,0x00,0x00,
-0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,0x04,0x31,0x31,0x4f,
-0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x47,0x20,0xb9,0x0d,0x52,
-0x20,0xc4,0xb0,0x12,0x13,0x15,0x35,0xdb,0xe2,0x08,0x8a,0x5d,0x10,0x13,0xf3,0xec,
-0x37,0x90,0x2c,0x4e,0xf4,0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,
-0x74,0x02,0xc8,0xe2,0x44,0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,
-0x18,0x11,0x31,0x55,0xc0,0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,
-0x46,0x31,0x08,0xcc,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,
-0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,0x23,0x59,0xc2,0x20,0x09,0x92,0x1d,0x18,
-0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,
-0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,
-0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,0x84,0x34,0x85,0x31,0x10,0x0a,0xb2,0x3c,
-0x56,0x30,0x08,0xcc,0x63,0x0b,0x44,0x25,0x21,0x0d,0x00,0x00,0x00,0x00,0x00,0x00};
+0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x10,0x00,0x00,0x00,0x27,0x50,0x20,0x05,
+0xd1,0x0c,0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,
+0x14,0x13,0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,
+0xba,0x35,0x0c,0x13,0xf3,0xd8,0x05,0x31,0x31,0x8f,0x6e,0x1c,0x84,0x00,0x2c,0xcb,
+0x01,0x14,0x44,0x33,0x45,0x98,0x61,0x0c,0x02,0xf3,0x00,0x00,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,
+0x11,0x00,0x00,0x00,0x63,0x08,0x4d,0x64,0x16,0xc1,0xe1,0x86,0xab,0x22,0x66,0x19,
+0x02,0x01,0x1b,0x43,0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,
+0x0a,0x20,0x0b,0x34,0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x70,
+0x83,0x57,0x11,0xb3,0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,
+0x48,0xb3,0x04,0xc6,0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,
+0x63,0x08,0xcd,0x64,0x64,0x40,0x70,0xb8,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,
+0x60,0x0c,0xc1,0x99,0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,
+0x33,0x38,0xd0,0x00,0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,
+0xe0,0x70,0x03,0x1b,0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,
+0x30,0x63,0x08,0x0f,0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,
+0xa0,0x06,0x70,0x00,0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x76,0x52,0x4c,0xcc,0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,
+0xc6,0x50,0x8a,0x89,0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,
+0x79,0x68,0x73,0x20,0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,
+0x9e,0xdb,0x32,0x88,0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,
+0xa7,0xb7,0x95,0x62,0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,
+0x34,0x35,0x56,0x62,0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,
+0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,
+0x24,0x01,0x63,0xec,0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,
+0xfc,0xc4,0xd0,0x90,0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,
+0xc1,0x71,0x7b,0x29,0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,
+0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,
+0x4e,0x33,0x58,0x47,0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,
+0x33,0xe1,0xbc,0xa5,0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,
+0x3a,0x40,0xc6,0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,
+0x6f,0x20,0x11,0x9c,0x66,0xf0,0x05,0xa7,0x99,0xec,0x82,0x10,0x9c,0xa6,0x32,0x93,
+0x42,0x60,0x1e,0x7b,0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,
+0xa7,0x6d,0xa4,0x98,0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,
+0x9c,0x66,0xc0,0x7b,0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,
+0x8b,0xe0,0x34,0x83,0x2f,0x38,0xcd,0x64,0xd3,0x07,0x50,0x10,0xcd,0x14,0x61,0xe6,
+0x61,0x08,0x4e,0x53,0xd5,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
+0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,
+0x1e,0xe1,0x19,0xc6,0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,
+0x15,0xc1,0x31,0x84,0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,
+0x71,0x6c,0x23,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,
+0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,
+0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,
+0x30,0x06,0x64,0xe0,0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,
+0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,
+0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,
+0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,
+0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,
+0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,
+0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,
+0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,
+0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,
+0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,
+0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,
+0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,
+0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,
+0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x26,0x41,0x08,0xc0,
+0xb2,0x18,0x45,0x21,0x00,0xcb,0xb2,0x5b,0x04,0x31,0x31,0x8f,0x6d,0x13,0xc4,0xc4,
+0x3c,0xb9,0x35,0x0c,0x15,0xb0,0x58,0x05,0x31,0x31,0x4f,0x7f,0x00,0x05,0xd1,0x4c,
+0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,
+0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
+0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,
+0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,
+0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x76,0x51,0x4c,0xcc,
+0x53,0xdb,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,
+0x31,0x8f,0x6e,0x16,0xc4,0xc4,0x3c,0xbd,0x51,0x10,0x02,0xb0,0x2c,0xd6,0x30,0x54,
+0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x2c,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,
+0xdc,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xdc,0x50,0x49,0xc4,
+0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xdc,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,
+0x63,0x08,0x8d,0x33,0xdc,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,
+0x17,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,
+0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x76,
+0x41,0x4c,0xcc,0xb3,0x1f,0x81,0x11,0x11,0x13,0x15,0x35,0x37,0x90,0x2c,0x4e,0xf4,
+0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44,
+0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0,
+0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03,
+0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,
+0x23,0x19,0xc3,0xa0,0x20,0x8b,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,
+0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,
+0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,
+0x84,0x34,0x85,0x25,0x0c,0x92,0x20,0x59,0xc1,0x20,0x30,0x8f,0x2d,0x10,0x95,0x84,
+0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index e64bfb1c6c..3a2f2878a3 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -46,6 +46,7 @@
 #include "tgsi/tgsi_dump.h"
 
 #include "util/u_memory.h"
+#include "util/u_math.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
@@ -157,8 +158,8 @@ void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog
    llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
    llvm::ExecutionEngine *ee = cpu->engine;
    assert(ee);
-   /*FIXME : remove */
-   ee->DisableLazyCompilation();
+   /*FIXME : why was this disabled ? we need it for pow/sqrt/... */
+   ee->DisableLazyCompilation(false);
    ee->addModuleProvider(mp);
 
    llvm::Function *func = func_for_shader(prog);
@@ -201,7 +202,6 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
    unsigned int i, j;
    unsigned slot;
    vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
-
    assert(runner);
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index a82dc30306..599975d5ad 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -83,6 +83,7 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
    m_llvmPow   = 0;
    m_llvmFloor = 0;
    m_llvmFlog  = 0;
+   m_llvmFexp  = 0;
    m_llvmLit  = 0;
    m_fmtPtr = 0;
 
@@ -92,194 +93,271 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
    m_mod = ParseBitcodeFile(buffer);
 }
 
+llvm::BasicBlock * Instructions::currentBlock() const
+{
+   return m_builder.GetInsertBlock(); // the block the builder is currently inserting into
+}
+
+// Per-component absolute value of a four-component vector.
+llvm::Value * Instructions::abs(llvm::Value *in)
+{
+   std::vector<llvm::Value*> comps = extractVector(in);
+   // Two declarators per line; the initializers still run in x, y, z, w order.
+   Value *ax = callFAbs(comps[0]), *ay = callFAbs(comps[1]);
+   Value *az = callFAbs(comps[2]), *aw = callFAbs(comps[3]);
+   return vectorFromVals(ax, ay, az, aw);
+}
+
 llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
 {
    return m_builder.CreateAdd(in1, in2, name("add"));
 }
 
-llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
+llvm::Value * Instructions::arl(llvm::Value *in)
 {
-   Value *mulRes = mul(in1, in2);
-   return add(mulRes, in3);
+   return floor(in);
 }
- 
-llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
+
+void Instructions::beginLoop()
 {
-   return m_builder.CreateMul(in1, in2, name("mul"));
+   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
+   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
+
+   m_builder.CreateBr(begin);
+   Loop loop;
+   loop.begin = begin;
+   loop.end   = end;
+   m_builder.SetInsertPoint(begin);
+   m_loopStack.push(loop);
 }
 
-const char * Instructions::name(const char *prefix)
+void Instructions::bgnSub(unsigned label)
 {
-   ++m_idx;
-   snprintf(m_name, 32, "%s%d", prefix, m_idx);
-   return m_name;
+   llvm::Function *func = findFunction(label);
+
+   Function::arg_iterator args = func->arg_begin();
+   Value *ptr_INPUT = args++;
+   ptr_INPUT->setName("INPUT");
+   m_storage->pushArguments(ptr_INPUT);
+
+   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
+
+   m_func = func;
+   m_builder.SetInsertPoint(entry);
 }
 
-llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+void Instructions::brk()
 {
-   Value *mulRes = mul(in1, in2);
-   Value *x = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(0),
-                                                          name("extractx"));
-   Value *y = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(1),
-                                                          name("extracty"));
-   Value *z = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(2),
-                                                          name("extractz"));
-   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
-   Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
-   return vectorFromVals(dot3, dot3, dot3, dot3);
+   assert(!m_loopStack.empty());
+   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
+   m_builder.CreateBr(m_loopStack.top().end);
+   m_builder.SetInsertPoint(unr);
 }
 
-llvm::Value *Instructions::callFSqrt(llvm::Value *val)
+void Instructions::cal(int label, llvm::Value *input)
 {
-   if (!m_llvmFSqrt) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fsqrtArgs;
-      fsqrtArgs.push_back(Type::FloatTy);
-      PAListPtr fsqrtPal;
-      FunctionType* fsqrtType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fsqrtArgs,
-         /*isVarArg=*/false);
-      m_llvmFSqrt = Function::Create(
-         /*Type=*/fsqrtType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.sqrt.f32", m_mod);
-      m_llvmFSqrt->setCallingConv(CallingConv::C);
-      m_llvmFSqrt->setParamAttrs(fsqrtPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
-                                         name("sqrt"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
+   std::vector<Value*> params;
+   params.push_back(input);
+   llvm::Function *func = findFunction(label);
+
+   m_builder.CreateCall(func, params.begin(), params.end());
 }
 
-llvm::Value * Instructions::rsq(llvm::Value *in1)
+llvm::Value * Instructions::ceil(llvm::Value *in)
 {
-   Value *x = m_builder.CreateExtractElement(in1,
-                                             m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *abs  = callFAbs(x);
-   Value *sqrt = callFSqrt(abs);
-
-   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                       sqrt,
-                                       name("rsqrt"));
-   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callCeil(vec[0]), callCeil(vec[1]),
+                         callCeil(vec[2]), callCeil(vec[3]));
 }
 
-llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
-                                           llvm::Value *z, llvm::Value *w)
+llvm::Value * Instructions::clamp(llvm::Value *in1)
 {
-   Constant *const_vec = Constant::getNullValue(m_floatVecType);
-   Value *res = m_builder.CreateInsertElement(const_vec, x,
-                                              m_storage->constantInt(0),
-                                              name("vecx"));
-   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
-                               name("vecxy"));
-   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
-                               name("vecxyz"));
-   if (w)
-      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
-                                          name("vecxyzw"));
-   return res;
+   llvm::Value *zero = constVector(0.0f, 0.0f, 0.0f, 0.0f);
+   llvm::Value *one = constVector(1.0f, 1.0f, 1.0f, 1.0f);
+   return min( max(zero, in1), one);
 }
 
-llvm::Value *Instructions::callFAbs(llvm::Value *val)
+llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   if (!m_llvmFAbs) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fabsArgs;
-      fabsArgs.push_back(Type::FloatTy);
-      PAListPtr fabsPal;
-      FunctionType* fabsType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fabsArgs,
-         /*isVarArg=*/false);
-      m_llvmFAbs = Function::Create(
-         /*Type=*/fabsType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"fabs", m_mod);
-      m_llvmFAbs->setCallingConv(CallingConv::C);
-      m_llvmFAbs->setParamAttrs(fabsPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
-                                         name("fabs"));
-   call->setCallingConv(CallingConv::C);
+   llvm::Function *func = m_mod->getFunction("cmp");
+   assert(func);
+
+   std::vector<Value*> params;
+   params.push_back(in1);
+   params.push_back(in2);
+   params.push_back(in3);
+   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
    call->setTailCall(false);
    return call;
 }
 
-llvm::Value * Instructions::lit(llvm::Value *in)
+llvm::Value * Instructions::cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   if (!m_llvmLit) {
-      m_llvmLit = m_mod->getFunction("lit");
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+   Constant *half = ConstantFP::get(APFloat(0.5f));
+
+   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], half, name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], half, name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], half, name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], half, name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   Value *res = m_builder.CreateSub(in1, in2, name("sub"));
-   return res;
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+   Constant *zero = Constant::getNullValue(Type::FloatTy);
+
+   Value *xcmp  = m_builder.CreateFCmpOGE(vec1[0], zero, name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOGE(vec1[1], zero, name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOGE(vec1[2], zero, name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOGE(vec1[3], zero, name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
+llvm::Value * Instructions::cos(llvm::Value *in)
 {
-   if (!m_llvmPow) {
-      // predeclare the intrinsic
-      std::vector<const Type*> powArgs;
-      powArgs.push_back(Type::FloatTy);
-      powArgs.push_back(Type::FloatTy);
-      PAListPtr powPal;
-      FunctionType* powType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/powArgs,
-         /*isVarArg=*/false);
-      m_llvmPow = Function::Create(
-         /*Type=*/powType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.pow.f32", m_mod);
-      m_llvmPow->setCallingConv(CallingConv::C);
-      m_llvmPow->setParamAttrs(powPal);
-   }
-   std::vector<Value*> params;
-   params.push_back(val1);
-   params.push_back(val2);
-   CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
-                                         name("pow"));
-   call->setCallingConv(CallingConv::C);
+#if 0
+   llvm::Function *func = m_mod->getFunction("vcos");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
    call->setTailCall(false);
    return call;
+#else
+   std::vector<llvm::Value*> elems = extractVector(in);
+   Function *func = m_mod->getFunction("cosf");
+   assert(func);
+   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
+   cos->setCallingConv(CallingConv::C);
+   cos->setTailCall(true);
+   return vectorFromVals(cos, cos, cos, cos);
+#endif
 }
 
-llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
 {
    Value *x1 = m_builder.CreateExtractElement(in1,
                                               m_storage->constantInt(0),
                                               name("x1"));
+   Value *y1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(1),
+                                              name("y1"));
+   Value *z1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(2),
+                                              name("z1"));
+
    Value *x2 = m_builder.CreateExtractElement(in2,
                                               m_storage->constantInt(0),
                                               name("x2"));
-   llvm::Value *val = callPow(x1, x2);
-   return vectorFromVals(val, val, val, val);
+   Value *y2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(1),
+                                              name("y2"));
+   Value *z2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(2),
+                                              name("z2"));
+   Value *y1z2 = mul(y1, z2);
+   Value *z1y2 = mul(z1, y2);
+
+   Value *z1x2 = mul(z1, x2);
+   Value *x1z2 = mul(x1, z2);
+
+   Value *x1y2 = mul(x1, y2);
+   Value *y1x2 = mul(y1, x2);
+
+   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
 }
 
-llvm::Value * Instructions::rcp(llvm::Value *in1)
+llvm::Value * Instructions::ddx(llvm::Value *in)
 {
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                     x1, name("rcp"));
-   return vectorFromVals(res, res, res, res);
+   assert(0); // FIXME: ddx is not implemented yet
+   return 0;  // defined (null) result when NDEBUG compiles the assert away
+}
+
+llvm::Value * Instructions::ddy(llvm::Value *in)
+{
+   assert(0); // FIXME: ddy is not implemented yet
+   return 0;  // defined (null) result when NDEBUG compiles the assert away
+}
+
+llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2)
+{
+   return m_builder.CreateFDiv(in1, in2, name("div")); // emits in1 / in2 as a floating-point divide
+}
+
+// DP2A: in1.x*in2.x + in1.y*in2.y plus a scalar taken from in3, with the
+// sum replicated to all four components of the result.
+llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes, m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes, m_storage->constantInt(1),
+                                             name("extracty"));
+   // TGSI defines the DP2A addend as src2.x, so read lane 0 of in3.
+   Value *addend = m_builder.CreateExtractElement(in3, m_storage->constantInt(0),
+                                                  name("addend"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   Value *dot2add = m_builder.CreateAdd(xy, addend, name("dot2add"));
+   return vectorFromVals(dot2add, dot2add, dot2add, dot2add);
+}
+
+// Two-component dot product of in1 and in2, replicated to every lane.
+// Only lanes 0 and 1 of the component-wise product contribute to the sum.
+llvm::Value * Instructions::dp2(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *prod = mul(in1, in2);
+   Value *px = m_builder.CreateExtractElement(prod, m_storage->constantInt(0),
+                                              name("extractx"));
+   Value *py = m_builder.CreateExtractElement(prod, m_storage->constantInt(1),
+                                              name("extracty"));
+   Value *sum = m_builder.CreateAdd(px, py, name("xy"));
+   return vectorFromVals(sum, sum, sum, sum);
+}
+
+// Three-component dot product of in1 and in2, replicated to every lane.
+// Lane 3 (w) of the component-wise product is not read.
+llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *prod = mul(in1, in2);
+   Value *px = m_builder.CreateExtractElement(prod, m_storage->constantInt(0),
+                                              name("extractx"));
+   Value *py = m_builder.CreateExtractElement(prod, m_storage->constantInt(1),
+                                              name("extracty"));
+   Value *pz = m_builder.CreateExtractElement(prod, m_storage->constantInt(2),
+                                              name("extractz"));
+   // Accumulate left to right: (x + y) + z.
+   Value *sumXY = m_builder.CreateAdd(px, py, name("xy"));
+   Value *sumXYZ = m_builder.CreateAdd(sumXY, pz, name("dot3"));
+   return vectorFromVals(sumXYZ, sumXYZ, sumXYZ, sumXYZ);
 }
 
 llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2)
@@ -321,6 +399,53 @@ llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2)
                          ry, z, w);
 }
 
+void Instructions::elseop()
+{
+   assert(!m_ifStack.empty()); // must be inside an open ifop()
+   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
+   m_builder.CreateBr(ifend); // finish the current arm by jumping to the new end block
+   m_builder.SetInsertPoint(m_ifStack.top()); // continue emitting in the block on top of the stack
+   currentBlock()->setName(name("ifelse")); // that block now serves as the else arm
+   m_ifStack.pop();
+   m_ifStack.push(ifend); // replace the stack top with the new end block
+}
+
+void Instructions::endif()
+{
+   assert(!m_ifStack.empty()); // requires a block pushed by ifop()/elseop()
+   m_builder.CreateBr(m_ifStack.top()); // branch from the current block to the pending end block
+   m_builder.SetInsertPoint(m_ifStack.top()); // subsequent instructions go into that block
+   m_ifStack.pop();
+}
+
+void Instructions::endLoop()
+{
+   assert(!m_loopStack.empty()); // requires a matching beginLoop()
+   Loop loop = m_loopStack.top();
+   m_builder.CreateBr(loop.begin); // back edge to the loop's begin block
+   loop.end->moveAfter(currentBlock()); // place the end block right after the current block
+   m_builder.SetInsertPoint(loop.end); // code following the loop is emitted into the end block
+   m_loopStack.pop();
+}
+
+void Instructions::end()
+{
+   m_builder.CreateRetVoid(); // emit a void return at the current insert point
+}
+
+void Instructions::endSub()
+{
+   m_func = 0; // no current function once the subroutine body is finished
+   m_builder.SetInsertPoint(0); // reset the builder's insert point to null
+}
+
+llvm::Value * Instructions::exp(llvm::Value *in) // callFExp applied to each of the four lanes
+{
+   std::vector<llvm::Value*> lanes = extractVector(in);
+   return vectorFromVals(callFExp(lanes[0]), callFExp(lanes[1]),
+                         callFExp(lanes[2]), callFExp(lanes[3]));
+}
+
 llvm::Value * Instructions::ex2(llvm::Value *in)
 {
    llvm::Value *val = callPow(ConstantFP::get(APFloat(2.f)),
@@ -330,31 +455,6 @@ llvm::Value * Instructions::ex2(llvm::Value *in)
    return vectorFromVals(val, val, val, val);
 }
 
-llvm::Value * Instructions::callFloor(llvm::Value *val)
-{
-   if (!m_llvmFloor) {
-      // predeclare the intrinsic
-      std::vector<const Type*> floorArgs;
-      floorArgs.push_back(Type::FloatTy);
-      PAListPtr floorPal;
-      FunctionType* floorType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/floorArgs,
-         /*isVarArg=*/false);
-      m_llvmFloor = Function::Create(
-         /*Type=*/floorType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"floorf", m_mod);
-      m_llvmFloor->setCallingConv(CallingConv::C);
-      m_llvmFloor->setParamAttrs(floorPal);
-   }
-   CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
-                                          name("floorf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
 llvm::Value * Instructions::floor(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
@@ -362,42 +462,52 @@ llvm::Value * Instructions::floor(llvm::Value *in)
                          callFloor(vec[2]), callFloor(vec[3]));
 }
 
-llvm::Value * Instructions::arl(llvm::Value *in)
-{
-   return floor(in);
-}
-
 llvm::Value * Instructions::frc(llvm::Value *in)
 {
    llvm::Value *flr = floor(in);
    return sub(in, flr);
 }
 
-llvm::Value * Instructions::callFLog(llvm::Value *val)
+void Instructions::ifop(llvm::Value *in)
 {
-   if (!m_llvmFlog) {
-      // predeclare the intrinsic
-      std::vector<const Type*> flogArgs;
-      flogArgs.push_back(Type::FloatTy);
-      PAListPtr flogPal;
-      FunctionType* flogType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/flogArgs,
-         /*isVarArg=*/false);
-      m_llvmFlog = Function::Create(
-         /*Type=*/flogType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"logf", m_mod);
-      m_llvmFlog->setCallingConv(CallingConv::C);
-      m_llvmFlog->setParamAttrs(flogPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
-                                         name("logf"));
-   call->setCallingConv(CallingConv::C);
+   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
+   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+
+   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
+   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
+   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+
+   Constant *float0 = Constant::getNullValue(Type::FloatTy);
+
+   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
+   m_builder.CreateCondBr(xcmp, ifthen, ifend);
+   //m_builder.SetInsertPoint(yblock);
+
+   m_builder.SetInsertPoint(ifthen);
+   m_ifStack.push(ifend);
+}
+
+llvm::Value * Instructions::kil(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("kil");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
    call->setTailCall(false);
    return call;
 }
 
+// Linear interpolation: in1 * in2 + (1 - in1) * in3, computed per component.
+llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
+{
+   llvm::Value *weighted = mul(in1, in2);
+   llvm::Value *ones = constVector(1.f, 1.f, 1.f, 1.f);
+   llvm::Value *inverse = sub(ones, in1);
+   return add(weighted, mul(inverse, in3));
+}
+
 llvm::Value * Instructions::lg2(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
@@ -407,142 +517,176 @@ llvm::Value * Instructions::lg2(llvm::Value *in)
                              callFLog(vec[2]), callFLog(vec[3])), const_vec);
 }
 
-llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::lit(llvm::Value *in)
+{
+   if (!m_llvmLit)  // resolve the module's "lit" helper once and cache it
+      m_llvmLit = m_mod->getFunction("lit");
+   assert(m_llvmLit);  // same guard kil()/scs() apply to their helpers
+   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::log(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   for (unsigned i = 0; i < 4; ++i) vec[i] = callFLog(vec[i]); // x,y,z,w order
+   return vectorFromVals(vec[0], vec[1], vec[2], vec[3]);
+}
+
+llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   // Multiply-add: (in1 * in2) + in3, with the product emitted first.
+   return add(mul(in1, in2), in3);
+}
+
+llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
 {
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
+                                          name("xcmp"));
    Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
                                         name("selx"));
 
-   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
+                                          name("ycmp"));
    Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
                                         name("sely"));
 
-   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
+                                          name("zcmp"));
    Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
                                         name("selz"));
 
-   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
+                                          name("wcmp"));
    Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
                                         name("selw"));
 
    return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
 {
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
-                                          name("xcmp"));
+   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
    Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
                                         name("selx"));
 
-   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
-                                          name("ycmp"));
+   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
    Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
                                         name("sely"));
 
-   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
-                                          name("zcmp"));
+   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
    Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
                                         name("selz"));
 
-   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
-                                          name("wcmp"));
+   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
    Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
                                         name("selw"));
 
    return vectorFromVals(selx, sely, selz, selw);
 }
 
-void Instructions::printVector(llvm::Value *val)
+llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
 {
-   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+   return m_builder.CreateMul(in1, in2, name("mul"));
+}
 
-   if (!m_fmtPtr) {
-      Constant *format = ConstantArray::get(frmt, true);
-      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
-      GlobalVariable* globalFormat = new GlobalVariable(
-         /*Type=*/arrayTy,
-         /*isConstant=*/true,
-         /*Linkage=*/GlobalValue::InternalLinkage,
-         /*Initializer=*/0, // has initializer, specified below
-         /*Name=*/name(".str"),
-         m_mod);
-      globalFormat->setInitializer(format);
+llvm::Value * Instructions::neg(llvm::Value *in)
+{
+   // Negate `in` through m_builder; name("neg") supplies the value's label.
+   return m_builder.CreateNeg(in, name("neg"));
+}
 
-      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
-      std::vector<Constant*> const_ptr_21_indices;
-      const_ptr_21_indices.push_back(const_int0);
-      const_ptr_21_indices.push_back(const_int0);
-      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
-                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
-   }
+llvm::Value * Instructions::nrm(llvm::Value *in)
+{
+   // NOTE(review): rsq() uses only in.x, not the vector length -- confirm.
+   return mul(rsq(in), in);
+}
 
-   Function *func_printf = m_mod->getFunction("printf");
-   if (!func_printf)
-      func_printf = declarePrintf();
-   assert(func_printf);
-   std::vector<llvm::Value*> vec = extractVector(val);
-   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
-   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
-   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
-   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
-   std::vector<Value*> params;
-   params.push_back(m_fmtPtr);
-   params.push_back(dx);
-   params.push_back(dy);
-   params.push_back(dz);
-   params.push_back(dw);
-   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
-                                         name("printf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(true);
+llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *x2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(0),
+                                              name("x2"));
+   llvm::Value *val = callPow(x1, x2);
+   return vectorFromVals(val, val, val, val);
 }
 
-llvm::Function * Instructions::declarePrintf()
+llvm::Value * Instructions::rcp(llvm::Value *in1)
 {
-   std::vector<const Type*> args;
-   PAListPtr params;
-   FunctionType* funcTy = FunctionType::get(
-      /*Result=*/IntegerType::get(32),
-      /*Params=*/args,
-      /*isVarArg=*/true);
-   Function* func_printf = Function::Create(
-      /*Type=*/funcTy,
-      /*Linkage=*/GlobalValue::ExternalLinkage,
-      /*Name=*/"printf", m_mod);
-   func_printf->setCallingConv(CallingConv::C);
-   func_printf->setParamAttrs(params);
-   return func_printf;
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+                                     x1, name("rcp"));
+   return vectorFromVals(res, res, res, res);
+}
+
+llvm::Value * Instructions::rsq(llvm::Value *in1)
+{
+   Value *x = m_builder.CreateExtractElement(in1,
+                                             m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *abs  = callFAbs(x);
+   Value *sqrt = callFSqrt(abs);
+
+   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+                                       sqrt,
+                                       name("rsqrt"));
+   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
 }
 
+llvm::Value * Instructions::scs(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("scs");
+   assert(func);
 
-llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
+   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::seq(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
    Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
-   Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp"));
+
+   Value *xcmp = m_builder.CreateFCmpOEQ(vec1[0], vec2[0], name("xcmp"));
    Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp = m_builder.CreateFCmpOEQ(vec1[1], vec2[1], name("ycmp"));
    Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp = m_builder.CreateFCmpOEQ(vec1[2], vec2[2], name("zcmp"));
    Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
 
-   Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp = m_builder.CreateFCmpOEQ(vec1[3], vec2[3], name("wcmp"));
    Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
 
    return vectorFromVals(x, y, z, w);
 }
+
+llvm::Value * Instructions::sfl(llvm::Value *in1, llvm::Value *in2)
+{
+   // Both operands are ignored: the result has 0.0f in all four lanes.
+   Constant *zero = Constant::getNullValue(Type::FloatTy);
+   return vectorFromVals(zero, zero, zero, zero);
+}
+
 llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
@@ -566,157 +710,118 @@ llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
    return vectorFromVals(x, y, z, w);
 }
 
-
-llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
    Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
-
-   Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp"));
    Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp"));
    Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp"));
    Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
 
-   Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp"));
    Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
 
    return vectorFromVals(x, y, z, w);
 }
 
-llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::sin(llvm::Value *in)
 {
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *y1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(1),
-                                              name("y1"));
-   Value *z1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(2),
-                                              name("z1"));
+   llvm::Function *func = m_mod->getFunction("vsin");
+   assert(func);
 
-   Value *x2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(0),
-                                              name("x2"));
-   Value *y2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(1),
-                                              name("y2"));
-   Value *z2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(2),
-                                              name("z2"));
-   Value *y1z2 = mul(y1, z2);
-   Value *z1y2 = mul(z1, y2);
+   CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
+   call->setTailCall(false);
+   return call;
+}
 
-   Value *z1x2 = mul(z1, x2);
-   Value *x1z2 = mul(x1, z2);
+llvm::Value * Instructions::sle(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
-   Value *x1y2 = mul(x1, y2);
-   Value *y1x2 = mul(y1, x2);
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
-}
+   Value *xcmp = m_builder.CreateFCmpOLE(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
+   Value *ycmp = m_builder.CreateFCmpOLE(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-llvm::Value * Instructions::abs(llvm::Value *in)
-{
-   std::vector<llvm::Value*> vec = extractVector(in);
-   Value *xabs  = callFAbs(vec[0]);
-   Value *yabs  = callFAbs(vec[1]);
-   Value *zabs  = callFAbs(vec[2]);
-   Value *wabs  = callFAbs(vec[3]);
-   return vectorFromVals(xabs, yabs, zabs, wabs);
+   Value *zcmp = m_builder.CreateFCmpOLE(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpOLE(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
 }
 
-void Instructions::ifop(llvm::Value *in)
+llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
 {
-   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
-   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
-   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
-   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
-   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Constant *float0 = Constant::getNullValue(Type::FloatTy);
+   Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
-   m_builder.CreateCondBr(xcmp, ifthen, ifend);
-   //m_builder.SetInsertPoint(yblock);
+   Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   m_builder.SetInsertPoint(ifthen);
-   m_ifStack.push(ifend);
-}
+   Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
 
-llvm::BasicBlock * Instructions::currentBlock() const
-{
-   return m_builder.GetInsertBlock();
-}
+   Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
 
-void Instructions::elseop()
-{
-   assert(!m_ifStack.empty());
-   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
-   m_builder.CreateBr(ifend);
-   m_builder.SetInsertPoint(m_ifStack.top());
-   currentBlock()->setName(name("ifelse"));
-   m_ifStack.pop();
-   m_ifStack.push(ifend);
+   return vectorFromVals(x, y, z, w);
 }
 
-void Instructions::endif()
+llvm::Value * Instructions::sne(llvm::Value *in1, llvm::Value *in2)
 {
-   assert(!m_ifStack.empty());
-   m_builder.CreateBr(m_ifStack.top());
-   m_builder.SetInsertPoint(m_ifStack.top());
-   m_ifStack.pop();
-}
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
-llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
-{
-   llvm::Value *m = mul(in1, in2);
-   llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
-   llvm::Value *s = sub(vec1, in1);
-   return add(m, mul(s, in3));
-}
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-void Instructions::beginLoop()
-{
-   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
-   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
+   Value *xcmp = m_builder.CreateFCmpONE(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   m_builder.CreateBr(begin);
-   Loop loop;
-   loop.begin = begin;
-   loop.end   = end;
-   m_builder.SetInsertPoint(begin);
-   m_loopStack.push(loop);
+   Value *ycmp = m_builder.CreateFCmpONE(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+   Value *zcmp = m_builder.CreateFCmpONE(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpONE(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
 }
 
-void Instructions::endLoop()
+llvm::Value * Instructions::str(llvm::Value *in1, llvm::Value *in2)
 {
-   assert(!m_loopStack.empty());
-   Loop loop = m_loopStack.top();
-   m_builder.CreateBr(loop.begin);
-   loop.end->moveAfter(currentBlock());
-   m_builder.SetInsertPoint(loop.end);
-   m_loopStack.pop();
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+
+   return vectorFromVals(const1f, const1f, const1f, const1f);
 }
 
-void Instructions::brk()
+llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
 {
-   assert(!m_loopStack.empty());
-   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
-   m_builder.CreateBr(m_loopStack.top().end);
-   m_builder.SetInsertPoint(unr);
+   Value *res = m_builder.CreateSub(in1, in2, name("sub"));
+   return res;
 }
 
 llvm::Value * Instructions::trunc(llvm::Value *in)
@@ -741,18 +846,298 @@ llvm::Value * Instructions::trunc(llvm::Value *in)
    return vectorFromVals(fx, fy, fz, fw);
 }
 
-void Instructions::end()
+llvm::Value * Instructions::x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   m_builder.CreateRetVoid();
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+
+   Value *x2x3 = m_builder.CreateMul( vec2[0], vec3[0], name("x2x3"));
+   Value *y2y3 = m_builder.CreateMul( vec2[1], vec3[1], name("y2y3"));
+   Value *x1px2x3 = m_builder.CreateAdd (vec1[0], x2x3, name("x1 + x2x3"));
+   Value *x1px2x3py2y3 = m_builder.CreateAdd (x1px2x3, y2y3, name("x1 + x2x3 + y2y3"));
+
+   Value *x2z3 = m_builder.CreateMul( vec2[0], vec3[2], name("x2z3"));
+   Value *y2w3 = m_builder.CreateMul( vec2[1], vec3[3], name("y2w3"));
+   Value *y1px2z3 = m_builder.CreateAdd (vec1[1], x2z3, name("y1 + x2z3"));
+   Value *y1px2z3py2w3 = m_builder.CreateAdd (y1px2z3, y2w3, name("y1 + x2z3 + y2w3"));
+
+   return vectorFromVals(x1px2x3py2y3, y1px2z3py2w3, x1px2x3py2y3, y1px2z3py2w3);
 }
 
-void Instructions::cal(int label, llvm::Value *input)
+void Instructions::printVector(llvm::Value *val)
 {
+   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+
+   if (!m_fmtPtr) {
+      Constant *format = ConstantArray::get(frmt, true);
+      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
+      GlobalVariable* globalFormat = new GlobalVariable(
+         /*Type=*/arrayTy,
+         /*isConstant=*/true,
+         /*Linkage=*/GlobalValue::InternalLinkage,
+         /*Initializer=*/0, // has initializer, specified below
+         /*Name=*/name(".str"),
+         m_mod);
+      globalFormat->setInitializer(format);
+
+      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
+      std::vector<Constant*> const_ptr_21_indices;
+      const_ptr_21_indices.push_back(const_int0);
+      const_ptr_21_indices.push_back(const_int0);
+      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
+                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
+   }
+
+   Function *func_printf = m_mod->getFunction("printf");
+   if (!func_printf)
+      func_printf = declarePrintf();
+   assert(func_printf);
+   std::vector<llvm::Value*> vec = extractVector(val);
+   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
+   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
+   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
+   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
    std::vector<Value*> params;
-   params.push_back(input);
-   llvm::Function *func = findFunction(label);
+   params.push_back(m_fmtPtr);
+   params.push_back(dx);
+   params.push_back(dy);
+   params.push_back(dz);
+   params.push_back(dw);
+   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
+                                         name("printf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(true);
+}
 
-   m_builder.CreateCall(func, params.begin(), params.end());
+const char * Instructions::name(const char *prefix)
+{
+   // "<prefix><counter>" in m_name (<= 31 chars); next call overwrites it.
+   snprintf(m_name, 32, "%s%d", prefix, ++m_idx);
+   return m_name;
+}
+
+llvm::Value * Instructions::callCeil(llvm::Value *val)
+{
+   // Declare the external float ceilf(float) in the module the first
+   // time it is needed; later calls reuse the cached m_llvmCeil.
+   if (!m_llvmCeil) {
+      std::vector<const Type*> argTypes(1, Type::FloatTy);
+      FunctionType *fnType = FunctionType::get(Type::FloatTy, argTypes,
+                                               /*isVarArg=*/false);
+
+      m_llvmCeil = Function::Create(fnType,
+                                    GlobalValue::ExternalLinkage,
+                                    "ceilf", m_mod);
+      m_llvmCeil->setCallingConv(CallingConv::C);
+      m_llvmCeil->setAttributes(AttrListPtr());
+   }
+
+   // Emit the call through m_builder and hand back its result.
+   CallInst *result = m_builder.CreateCall(m_llvmCeil, val,
+                                           name("ceilf"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+
+   return result;
+}
+
+llvm::Value *Instructions::callFAbs(llvm::Value *val)
+{
+   if (!m_llvmFAbs) {
+      // Declare float fabsf(float) once.  Not "fabs": that libm symbol is
+      // double(double) and would not match this float(float) prototype.
+      std::vector<const Type*> fabsArgs;
+      fabsArgs.push_back(Type::FloatTy);
+      FunctionType* fabsType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fabsArgs,
+         /*isVarArg=*/false);
+      m_llvmFAbs = Function::Create(
+         /*Type=*/fabsType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"fabsf", m_mod);
+      m_llvmFAbs->setCallingConv(CallingConv::C);
+      m_llvmFAbs->setAttributes(AttrListPtr());
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
+                                         name("fabs"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::callFExp(llvm::Value *val)
+{
+   // Declare the external float expf(float) in the module the first
+   // time it is needed; later calls reuse the cached m_llvmFexp.
+   if (!m_llvmFexp) {
+      std::vector<const Type*> argTypes(1, Type::FloatTy);
+      FunctionType *fnType = FunctionType::get(Type::FloatTy, argTypes,
+                                               /*isVarArg=*/false);
+
+      m_llvmFexp = Function::Create(fnType,
+                                    GlobalValue::ExternalLinkage,
+                                    "expf", m_mod);
+      m_llvmFexp->setCallingConv(CallingConv::C);
+      m_llvmFexp->setAttributes(AttrListPtr());
+   }
+
+   // Emit the call through m_builder and hand back its result.
+   CallInst *result = m_builder.CreateCall(m_llvmFexp, val,
+                                           name("expf"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+
+   return result;
+}
+
+llvm::Value * Instructions::callFLog(llvm::Value *val)
+{
+   // Declare the external float logf(float) in the module the first
+   // time it is needed; later calls reuse the cached m_llvmFlog.
+   if (!m_llvmFlog) {
+      std::vector<const Type*> argTypes(1, Type::FloatTy);
+      FunctionType *fnType = FunctionType::get(Type::FloatTy, argTypes,
+                                               /*isVarArg=*/false);
+
+      m_llvmFlog = Function::Create(fnType,
+                                    GlobalValue::ExternalLinkage,
+                                    "logf", m_mod);
+      m_llvmFlog->setCallingConv(CallingConv::C);
+      m_llvmFlog->setAttributes(AttrListPtr());
+   }
+
+   // Emit the call through m_builder and hand back its result.
+   CallInst *result = m_builder.CreateCall(m_llvmFlog, val,
+                                           name("logf"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+
+   return result;
+}
+
+llvm::Value * Instructions::callFloor(llvm::Value *val)
+{
+   // Declare the external float floorf(float) in the module the first
+   // time it is needed; later calls reuse the cached m_llvmFloor.
+   if (!m_llvmFloor) {
+      std::vector<const Type*> argTypes(1, Type::FloatTy);
+      FunctionType *fnType = FunctionType::get(Type::FloatTy, argTypes,
+                                               /*isVarArg=*/false);
+
+      m_llvmFloor = Function::Create(fnType,
+                                     GlobalValue::ExternalLinkage,
+                                     "floorf", m_mod);
+      m_llvmFloor->setCallingConv(CallingConv::C);
+      m_llvmFloor->setAttributes(AttrListPtr());
+   }
+
+   // Emit the call through m_builder and hand back its result.
+   CallInst *result = m_builder.CreateCall(m_llvmFloor, val,
+                                           name("floorf"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+
+   return result;
+}
+
+llvm::Value *Instructions::callFSqrt(llvm::Value *val)
+{
+   // Declare float llvm.sqrt.f32(float) in the module the first time
+   // it is needed; later calls reuse the cached m_llvmFSqrt.
+   if (!m_llvmFSqrt) {
+      std::vector<const Type*> argTypes(1, Type::FloatTy);
+      FunctionType *fnType = FunctionType::get(Type::FloatTy, argTypes,
+                                               /*isVarArg=*/false);
+
+      m_llvmFSqrt = Function::Create(fnType,
+                                     GlobalValue::ExternalLinkage,
+                                     "llvm.sqrt.f32", m_mod);
+      m_llvmFSqrt->setCallingConv(CallingConv::C);
+      m_llvmFSqrt->setAttributes(AttrListPtr());
+   }
+
+   // Emit the call through m_builder and hand back its result.
+   CallInst *result = m_builder.CreateCall(m_llvmFSqrt, val,
+                                           name("sqrt"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+
+   return result;
+}
+
+llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
+{
+   // Declare float llvm.pow.f32(float, float) in the module on first
+   // use; the declaration is cached in m_llvmPow afterwards.
+   if (!m_llvmPow) {
+      std::vector<const Type*> argTypes(2, Type::FloatTy);
+      FunctionType *fnType = FunctionType::get(Type::FloatTy, argTypes,
+                                               /*isVarArg=*/false);
+
+      m_llvmPow = Function::Create(fnType,
+                                   GlobalValue::ExternalLinkage,
+                                   "llvm.pow.f32", m_mod);
+      m_llvmPow->setCallingConv(CallingConv::C);
+      m_llvmPow->setAttributes(AttrListPtr());
+   }
+
+   // Pass val1 and val2 as the two call operands, in that order.
+   std::vector<Value*> operands(2);
+   operands[0] = val1;
+   operands[1] = val2;
+
+   CallInst *result = m_builder.CreateCall(m_llvmPow, operands.begin(),
+                                           operands.end(), name("pow"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+
+   return result;
+}
+
+llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
+                                           llvm::Value *z, llvm::Value *w)
+{
+   // Start from a zero vector; lane 3 stays zero when w is null.
+   Value *acc = Constant::getNullValue(m_floatVecType);
+   acc = m_builder.CreateInsertElement(acc, x, m_storage->constantInt(0),
+                                       name("vecx"));
+   acc = m_builder.CreateInsertElement(acc, y, m_storage->constantInt(1),
+                                       name("vecxy"));
+   acc = m_builder.CreateInsertElement(acc, z, m_storage->constantInt(2),
+                                       name("vecxyz"));
+   if (!w)
+      return acc;
+   return m_builder.CreateInsertElement(acc, w, m_storage->constantInt(3),
+                                        name("vecxyzw"));
+}
+
+llvm::Value * Instructions::constVector(float x, float y, float z, float w)
+{
+   std::vector<Constant*> lanes;  // filled in x, y, z, w order
+   lanes.push_back(ConstantFP::get(APFloat(x)));
+   lanes.push_back(ConstantFP::get(APFloat(y)));
+   lanes.push_back(ConstantFP::get(APFloat(z)));
+   lanes.push_back(ConstantFP::get(APFloat(w)));
+   return ConstantVector::get(m_floatVecType, lanes);
+}
+
+llvm::Function * Instructions::declarePrintf()
+{
+   std::vector<const Type*> args;
+   AttrListPtr params;
+   FunctionType* funcTy = FunctionType::get(
+      /*Result=*/IntegerType::get(32),
+      /*Params=*/args,
+      /*isVarArg=*/true);
+   Function* func_printf = Function::Create(
+      /*Type=*/funcTy,
+      /*Linkage=*/GlobalValue::ExternalLinkage,
+      /*Name=*/"printf", m_mod);
+   func_printf->setCallingConv(CallingConv::C);
+   func_printf->setAttributes(params);
+   return func_printf;
 }
 
 llvm::Function * Instructions::declareFunc(int label)
@@ -763,7 +1148,7 @@ llvm::Function * Instructions::declareFunc(int label)
    args.push_back(vecPtr);
    args.push_back(vecPtr);
    args.push_back(vecPtr);
-   PAListPtr params;
+   AttrListPtr params;
    FunctionType *funcType = FunctionType::get(
       /*Result=*/Type::VoidTy,
       /*Params=*/args,
@@ -774,31 +1159,10 @@ llvm::Function * Instructions::declareFunc(int label)
       /*Linkage=*/GlobalValue::ExternalLinkage,
       /*Name=*/name.c_str(), m_mod);
    func->setCallingConv(CallingConv::C);
-   func->setParamAttrs(params);
+   func->setAttributes(params);
    return func;
 }
 
-void Instructions::bgnSub(unsigned label)
-{
-   llvm::Function *func = findFunction(label);
-
-   Function::arg_iterator args = func->arg_begin();
-   Value *ptr_INPUT = args++;
-   ptr_INPUT->setName("INPUT");
-   m_storage->pushArguments(ptr_INPUT);
-
-   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
-
-   m_func = func;
-   m_builder.SetInsertPoint(entry);
-}
-
-void Instructions::endSub()
-{
-   m_func = 0;
-   m_builder.SetInsertPoint(0);
-}
-
 llvm::Function * Instructions::findFunction(int label)
 {
    llvm::Function *func = m_functions[label];
@@ -809,17 +1173,6 @@ llvm::Function * Instructions::findFunction(int label)
    return func;
 }
 
-llvm::Value * Instructions::constVector(float x, float y, float z, float w)
-{
-   std::vector<Constant*> vec(4);
-   vec[0] = ConstantFP::get(APFloat(x));
-   vec[1] = ConstantFP::get(APFloat(y));
-   vec[2] = ConstantFP::get(APFloat(z));
-   vec[3] = ConstantFP::get(APFloat(w));
-   return ConstantVector::get(m_floatVecType, vec);
-}
-
-
 std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
 {
    std::vector<llvm::Value*> elems(4);
@@ -834,69 +1187,7 @@ std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
    return elems;
 }
 
-llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
-   llvm::Function *func = m_mod->getFunction("cmp");
-   assert(func);
-
-   std::vector<Value*> params;
-   params.push_back(in1);
-   params.push_back(in2);
-   params.push_back(in3);
-   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::cos(llvm::Value *in)
-{
-#if 0
-   llvm::Function *func = m_mod->getFunction("vcos");
-   assert(func);
 
-   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
-   call->setTailCall(false);
-   return call;
-#else
-   std::vector<llvm::Value*> elems = extractVector(in);
-   Function *func = m_mod->getFunction("cosf");
-   assert(func);
-   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
-   cos->setCallingConv(CallingConv::C);
-   cos->setTailCall(true);
-   return vectorFromVals(cos, cos, cos, cos);
-#endif
-}
-
-llvm::Value * Instructions::scs(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("scs");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::kil(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("kil");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::sin(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("vsin");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
-   call->setTailCall(false);
-   return call;
-}
 #endif //MESA_LLVM
 
 
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index d286ce80c7..e18571251e 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -57,15 +57,24 @@ public:
    llvm::BasicBlock *currentBlock() const;
 
    llvm::Value *abs(llvm::Value *in1);
-   llvm::Value *arl(llvm::Value *in1);
    llvm::Value *add(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *arl(llvm::Value *in1);
    void         beginLoop();
    void         bgnSub(unsigned);
    void         brk();
    void         cal(int label, llvm::Value *input);
+   llvm::Value *ceil(llvm::Value *in);
+   llvm::Value *clamp(llvm::Value *in);
    llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
    llvm::Value *cos(llvm::Value *in);
    llvm::Value *cross(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *ddx(llvm::Value *in);
+   llvm::Value *ddy(llvm::Value *in);
+   llvm::Value *div(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *dp2(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dph(llvm::Value *in1, llvm::Value *in2);
@@ -75,6 +84,7 @@ public:
    void         endLoop();
    void         end();
    void         endSub();
+   llvm::Value *exp(llvm::Value *in);
    llvm::Value *ex2(llvm::Value *in);
    llvm::Value *floor(llvm::Value *in);
    llvm::Value *frc(llvm::Value *in);
@@ -82,32 +92,43 @@ public:
    llvm::Value *kil(llvm::Value *in);
    llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2,
                      llvm::Value *in3);
-   llvm::Value *lit(llvm::Value *in);
    llvm::Value *lg2(llvm::Value *in);
+   llvm::Value *lit(llvm::Value *in);
+   llvm::Value *log(llvm::Value *in);
    llvm::Value *madd(llvm::Value *in1, llvm::Value *in2,
                      llvm::Value *in3);
-   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *max(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *mul(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *neg(llvm::Value *in);
+   llvm::Value *nrm(llvm::Value *in);
    llvm::Value *pow(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *rcp(llvm::Value *in);
    llvm::Value *rsq(llvm::Value *in);
    llvm::Value *scs(llvm::Value *in);
+   llvm::Value *seq(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sfl(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sge(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sin(llvm::Value *in);
+   llvm::Value *sle(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *slt(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sne(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *str(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sub(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *trunc(llvm::Value *in);
+   llvm::Value *x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
 
    void printVector(llvm::Value *val);
 private:
    const char *name(const char *prefix);
 
+   llvm::Value *callCeil(llvm::Value *val);
    llvm::Value *callFAbs(llvm::Value *val);
+   llvm::Value *callFExp(llvm::Value *val);
+   llvm::Value *callFLog(llvm::Value *val);
    llvm::Value *callFloor(llvm::Value *val);
    llvm::Value *callFSqrt(llvm::Value *val);
-   llvm::Value *callFLog(llvm::Value *val);
    llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2);
 
    llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
@@ -125,16 +146,18 @@ private:
    llvm::Module             *m_mod;
    llvm::Function           *m_func;
    char                      m_name[32];
-   llvm::IRBuilder           m_builder;
+   llvm::IRBuilder<>         m_builder;
    int                       m_idx;
 
    llvm::VectorType *m_floatVecType;
 
+   llvm::Function   *m_llvmCeil;
    llvm::Function   *m_llvmFSqrt;
    llvm::Function   *m_llvmFAbs;
    llvm::Function   *m_llvmPow;
    llvm::Function   *m_llvmFloor;
    llvm::Function   *m_llvmFlog;
+   llvm::Function   *m_llvmFexp;
    llvm::Function   *m_llvmLit;
 
    llvm::Constant   *m_fmtPtr;
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index efddc04e81..d5600fd22d 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -90,68 +90,11 @@ llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y,
    return res;
 }
 
-std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
-{
-   std::vector<llvm::Value*> res(4);
-
-   //Extract x's
-   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
-                                                    m_storage->constantInt(0),
-                                                    name("extractX"));
-   //cast it to an unsigned int
-   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
-
-   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
-   //only x is valid. the others shouldn't be necessary
-   /*
-   res[1] = Constant::getNullValue(m_floatVecType);
-   res[2] = Constant::getNullValue(m_floatVecType);
-   res[3] = Constant::getNullValue(m_floatVecType);
-   */
-
-   return res;
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
-   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
-   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
-   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
-   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
-   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
-   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
-
-   return res;
-}
-
 void InstructionsSoa::end()
 {
    m_builder.CreateRetVoid();
 }
 
-std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
-                                                const std::vector<llvm::Value*> in2,
-                                                const std::vector<llvm::Value*> in3)
-{
-   std::vector<llvm::Value*> res = mul(in1, in2);
-   return add(res, in3);
-}
-
 std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
 {
    std::vector<llvm::Value*> res(4);
@@ -171,6 +114,11 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
    return res;
 }
 
+llvm::IRBuilder<>* InstructionsSoa::getIRBuilder()
+{
+   return &m_builder;
+}
+
 void InstructionsSoa::createFunctionMap()
 {
    m_functionsMap[TGSI_OPCODE_ABS]   = "abs";
@@ -181,6 +129,7 @@ void InstructionsSoa::createFunctionMap()
    m_functionsMap[TGSI_OPCODE_POWER] = "pow";
    m_functionsMap[TGSI_OPCODE_LIT]   = "lit";
    m_functionsMap[TGSI_OPCODE_RSQ]   = "rsq";
+   m_functionsMap[TGSI_OPCODE_SLT]   = "slt";
 }
 
 void InstructionsSoa::createDependencies()
@@ -273,6 +222,41 @@ std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
+   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
+   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
+   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
+
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
+{
+   std::vector<llvm::Value*> res(4);
+
+   //Extract x's
+   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
+                                                    m_storage->constantInt(0),
+                                                    name("extractX"));
+   //cast it to an unsigned int
+   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
+
+   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
+   //only x is valid. the others shouldn't be necessary
+   /*
+   res[1] = Constant::getNullValue(m_floatVecType);
+   res[2] = Constant::getNullValue(m_floatVecType);
+   res[3] = Constant::getNullValue(m_floatVecType);
+   */
+
+   return res;
+}
+
 std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1,
                                                const std::vector<llvm::Value*> in2)
 {
@@ -280,6 +264,98 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1, in2);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
+{
+   llvm::Function *func = function(TGSI_OPCODE_LIT);
+   return callBuiltin(func, in);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
+                                                const std::vector<llvm::Value*> in2,
+                                                const std::vector<llvm::Value*> in3)
+{
+   std::vector<llvm::Value*> res = mul(in1, in2);
+   return add(res, in3);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_MAX);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_MIN);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
+   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
+   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
+   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
+
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_POWER);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
+{
+   llvm::Function *func = function(TGSI_OPCODE_RSQ);
+   return callBuiltin(func, in);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_SLT);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
+   res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
+   res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
+   res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
+
+   return res;
+}
+
+void checkFunction(Function *func)
+{
+   for (Function::const_iterator BI = func->begin(), BE = func->end();
+        BI != BE; ++BI) {
+      const BasicBlock &BB = *BI;
+      for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
+           II != IE; ++II) {
+         const Instruction &I = *II;
+         std::cout<< "Instr = "<<I;
+         for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
+            const Value *Op = I.getOperand(op);
+            std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
+            //I->setOperand(op, V);
+  }
+      }
+   }
+}
+
 llvm::Value * InstructionsSoa::allocaTemp()
 {
    VectorType *vector   = VectorType::get(Type::FloatTy, 4);
@@ -399,46 +475,6 @@ std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std
    return allocaToResult(allocaPtr);
 }
 
-std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_POWER);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MIN);
-   return callBuiltin(func, in1, in2);
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MAX);
-   return callBuiltin(func, in1, in2);
-}
-
-void checkFunction(Function *func)
-{
-   for (Function::const_iterator BI = func->begin(), BE = func->end();
-        BI != BE; ++BI) {
-      const BasicBlock &BB = *BI;
-      for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
-           II != IE; ++II) {
-         const Instruction &I = *II;
-         std::cout<< "Instr = "<<I;
-         for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
-            const Value *Op = I.getOperand(op);
-            std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
-            //I->setOperand(op, V);
-  }
-      }
-   }
-}
-
 void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
 {
    assert(originalFunc);
@@ -458,8 +494,8 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
       func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage,
                               originalFunc->getName(), currentModule());
       func->setCallingConv(CallingConv::C);
-      const PAListPtr pal;
-      func->setParamAttrs(pal);
+      const AttrListPtr pal;
+      func->setAttributes(pal);
       currentModule()->dump();
    } else {
       DenseMap<const Value*, Value *> val;
@@ -483,28 +519,4 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
    }
 }
 
-std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
-   res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
-   res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
-   res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_LIT);
-   return callBuiltin(func, in);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_RSQ);
-   return callBuiltin(func, in);
-}
 
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 3e20b652dd..d6831e0a6b 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -69,11 +69,14 @@ public:
    std::vector<llvm::Value*> pow(const std::vector<llvm::Value*> in1,
                                  const std::vector<llvm::Value*> in2);
    std::vector<llvm::Value*> rsq(const std::vector<llvm::Value*> in1);
+   std::vector<llvm::Value*> slt(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
    std::vector<llvm::Value*> sub(const std::vector<llvm::Value*> in1,
                                  const std::vector<llvm::Value*> in2);
    void         end();
 
    std::vector<llvm::Value*> extractVector(llvm::Value *vector);
+   llvm::IRBuilder<>*  getIRBuilder();
 private:
    const char * name(const char *prefix) const;
    llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
@@ -96,7 +99,7 @@ private:
                                          const std::vector<llvm::Value*> in3);
    void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST);
 private:
-   llvm::IRBuilder  m_builder;
+   llvm::IRBuilder<>  m_builder;
    StorageSoa *m_storage;
 
    std::map<int, std::string> m_functionsMap;
diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c
index 78f84510e2..cb85e1734e 100644
--- a/src/gallium/auxiliary/gallivm/soabuiltins.c
+++ b/src/gallium/auxiliary/gallivm/soabuiltins.c
@@ -36,6 +36,8 @@ typedef __attribute__(( ext_vector_type(4) )) float float4;
 
 extern float fabsf(float val);
 
+/* helpers */
+
 float4 absvec(float4 vec)
 {
    float4 res;
@@ -47,6 +49,58 @@ float4 absvec(float4 vec)
    return res;
 }
 
+float4 maxvec(float4 a, float4 b)
+{
+   return (float4){(a.x > b.x) ? a.x : b.x,
+         (a.y > b.y) ? a.y : b.y,
+         (a.z > b.z) ? a.z : b.z,
+         (a.w > b.w) ? a.w : b.w};
+}
+
+float4 minvec(float4 a, float4 b)
+{
+   return (float4){(a.x < b.x) ? a.x : b.x,
+         (a.y < b.y) ? a.y : b.y,
+         (a.z < b.z) ? a.z : b.z,
+         (a.w < b.w) ? a.w : b.w};
+}
+
+extern float powf(float num, float p);
+extern float sqrtf(float x);
+
+float4 powvec(float4 vec, float4 q)
+{
+   float4 p;
+   p.x = powf(vec.x, q.x);
+   p.y = powf(vec.y, q.y);
+   p.z = powf(vec.z, q.z);
+   p.w = powf(vec.w, q.w);
+   return p;
+}
+
+float4 sqrtvec(float4 vec)
+{
+   float4 p;
+   p.x = sqrtf(vec.x);
+   p.y = sqrtf(vec.y);
+   p.z = sqrtf(vec.z);
+   p.w = sqrtf(vec.w);
+   return p;
+}
+
+float4 sltvec(float4 v1, float4 v2)
+{
+   float4 p;
+   p.x = (v1.x < v2.x) ? 1.0 : 0.0;
+   p.y = (v1.y < v2.y) ? 1.0 : 0.0;
+   p.z = (v1.z < v2.z) ? 1.0 : 0.0;
+   p.w = (v1.w < v2.w) ? 1.0 : 0.0;
+   return p;
+}
+
+
+/* instructions */
+
 void abs(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
 {
@@ -69,7 +123,6 @@ void dp3(float4 *res,
    res[3] = dot;
 }
 
-
 void dp4(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
          float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -83,35 +136,25 @@ void dp4(float4 *res,
    res[3] = dot;
 }
 
-extern float powf(float num, float p);
-extern float sqrtf(float x);
-
-float4 powvec(float4 vec, float4 q)
-{
-   float4 p;
-   p.x = powf(vec.x, q.x);
-   p.y = powf(vec.y, q.y);
-   p.z = powf(vec.z, q.z);
-   p.w = powf(vec.w, q.w);
-   return p;
-}
-
-void pow(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+void lit(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
 {
-   res[0] = powvec(tmp0x, tmp1x);
-   res[1] = res[0];
-   res[2] = res[0];
-   res[3] = res[0];
-}
+   const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
+   const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
+   const float4 plus128 = (float4) {128.f,  128.f,  128.f,  128.f};
 
-float4 minvec(float4 a, float4 b)
-{
-   return (float4){(a.x < b.x) ? a.x : b.x,
-         (a.y < b.y) ? a.y : b.y,
-         (a.z < b.z) ? a.z : b.z,
-         (a.w < b.w) ? a.w : b.w};
+   res[0] = (float4){1.0, 1.0, 1.0, 1.0};
+   if (tmp0x.x > 0) {
+      float4 tmpy = maxvec(tmp0y, zerovec);
+      float4 tmpw = minvec(tmp0w, plus128);
+      tmpw = maxvec(tmpw, min128);
+      res[1] = tmp0x;
+      res[2] = powvec(tmpy, tmpw);
+   } else {
+      res[1] = zerovec;
+      res[2] = zerovec;
+   }
+   res[3] = (float4){1.0, 1.0, 1.0, 1.0};
 }
 
 void min(float4 *res,
@@ -125,14 +168,6 @@ void min(float4 *res,
 }
 
 
-float4 maxvec(float4 a, float4 b)
-{
-   return (float4){(a.x > b.x) ? a.x : b.x,
-         (a.y > b.y) ? a.y : b.y,
-         (a.z > b.z) ? a.z : b.z,
-         (a.w > b.w) ? a.w : b.w};
-}
-
 void max(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
          float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -143,37 +178,14 @@ void max(float4 *res,
    res[3] = maxvec(tmp0w, tmp1w);
 }
 
-
-void lit(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
-{
-   const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
-   const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
-   const float4 plus128 = (float4) {128.f,  128.f,  128.f,  128.f};
-
-   res[0] = (float4){1.0, 1.0, 1.0, 1.0};
-   if (tmp0x.x > 0) {
-      float4 tmpy = maxvec(tmp0y, zerovec);
-      float4 tmpw = minvec(tmp0w, plus128);
-      tmpw = maxvec(tmpw, min128);
-      res[1] = tmp0x;
-      res[2] = powvec(tmpy, tmpw);
-   } else {
-      res[1] = zerovec;
-      res[2] = zerovec;
-   }
-   res[3] = (float4){1.0, 1.0, 1.0, 1.0};
-}
-
-
-float4 sqrtvec(float4 vec)
+void pow(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
 {
-   float4 p;
-   p.x = sqrtf(vec.x);
-   p.y = sqrtf(vec.y);
-   p.z = sqrtf(vec.z);
-   p.w = sqrtf(vec.w);
-   return p;
+   res[0] = powvec(tmp0x, tmp1x);
+   res[1] = res[0];
+   res[2] = res[0];
+   res[3] = res[0];
 }
 
 void rsq(float4 *res,
@@ -185,3 +197,14 @@ void rsq(float4 *res,
    res[2] = onevec/sqrtvec(absvec(tmp0z));
    res[3] = onevec/sqrtvec(absvec(tmp0w));
 }
+
+void slt(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+{
+   res[0] = sltvec(tmp0x, tmp1x);
+   res[1] = sltvec(tmp0y, tmp1y);
+   res[2] = sltvec(tmp0z, tmp1z);
+   res[3] = sltvec(tmp0w, tmp1w);
+}
+
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 78d754371f..4fc075cf6d 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -93,7 +93,7 @@ void StorageSoa::declareImmediates()
       std::vector<float> vals(4);
       std::vector<Constant*> channelArray;
 
-      vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0];
+      vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
       llvm::Constant *xChannel = createConstGlobalVector(vals);
 
       vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
@@ -144,22 +144,43 @@ std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx)
    return res;
 }
 
-std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx)
+llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc)
 {
-   std::vector<llvm::Value*> res(4);
+   std::vector<llvm::Value*> x(4);
+   x[0] = m_builder->CreateExtractElement(vector,
+                                           constantInt(cc),
+                                           name("x"));
+
+   VectorType  *vectorType = VectorType::get(Type::FloatTy, 4);
+   Constant *constVector = Constant::getNullValue(vectorType);
+   Value *res = m_builder->CreateInsertElement(constVector, x[0],
+                                              constantInt(0),
+                                              name("vecx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(1),
+                               name("vecxx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(2),
+                               name("vecxxx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(3),
+                               name("vecxxxx"));
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx)
+{
+   llvm::Value* res;
+   std::vector<llvm::Value*> res2(4);
    llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
 
    xChannel = elementPointer(m_consts, idx, 0);
-   yChannel = elementPointer(m_consts, idx, 1);
-   zChannel = elementPointer(m_consts, idx, 2);
-   wChannel = elementPointer(m_consts, idx, 3);
 
-   res[0] = alignedArrayLoad(xChannel);
-   res[1] = alignedArrayLoad(yChannel);
-   res[2] = alignedArrayLoad(zChannel);
-   res[3] = alignedArrayLoad(wChannel);
+   res = alignedArrayLoad(xChannel);
 
-   return res;
+   res2[0]=unpackConstElement(m_builder, res,0);
+   res2[1]=unpackConstElement(m_builder, res,1);
+   res2[2]=unpackConstElement(m_builder, res,2);
+   res2[3]=unpackConstElement(m_builder, res,3);
+
+   return res2;
 }
 
 std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
@@ -260,6 +281,12 @@ llvm::Module * StorageSoa::currentModule() const
     return m_block->getParent()->getParent();
 }
 
+llvm::Constant * StorageSoa::createConstGlobalFloat(const float val)
+{
+   Constant*c = ConstantFP::get(APFloat(val));
+   return c;
+}
+
 llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec)
 {
    VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -278,7 +305,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v
 }
 
 std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle,
-                                           llvm::Value *indIdx)
+                                           llvm::IRBuilder<>* m_builder,llvm::Value *indIdx)
 {
    std::vector<llvm::Value*> val(4);
 
@@ -302,7 +329,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
       val = tempElement(realIndex);
       break;
    case TGSI_FILE_CONSTANT:
-      val = constElement(realIndex);
+      val = constElement(m_builder, realIndex);
       break;
    case TGSI_FILE_IMMEDIATE:
       val = immediateElement(realIndex);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index ae2fc7c6ae..f21ca6ec43 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -29,6 +29,7 @@
 #define STORAGESOA_H
 
 #include <pipe/p_shader_tokens.h>
+#include <llvm/Support/IRBuilder.h>
 
 #include <vector>
 #include <list>
@@ -56,7 +57,7 @@ public:
 
 
    std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, 
-                                  llvm::Value *indIdx =0);
+                                  llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
    void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
               int mask);
 
@@ -76,10 +77,12 @@ private:
    const char *name(const char *prefix) const;
    llvm::Value  *alignedArrayLoad(llvm::Value *val);
    llvm::Module *currentModule() const;
+   llvm::Constant  *createConstGlobalFloat(const float val);
    llvm::Constant  *createConstGlobalVector(const std::vector<float> &vec);
 
    std::vector<llvm::Value*> inputElement(llvm::Value *indIdx);
-   std::vector<llvm::Value*> constElement(llvm::Value *indIdx);
+   llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
+   std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
    std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
    std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
    std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index cc1516a45e..1191a6cae9 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -52,7 +52,7 @@ static inline FunctionType *vertexShaderFunctionType()
    // pass are castable to the following:
    // [4 x <4 x float>] inputs,
    // [4 x <4 x float>] output,
-   // [4 x [4 x float]] consts,
+   // [1 x [4 x float]] consts,
    // [4 x <4 x float>] temps
 
    std::vector<const Type*> funcArgs;
@@ -61,7 +61,7 @@ static inline FunctionType *vertexShaderFunctionType()
    PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0);
 
    ArrayType   *floatArray     = ArrayType::get(Type::FloatTy, 4);
-   ArrayType   *constsArray    = ArrayType::get(floatArray, 4);
+   ArrayType   *constsArray    = ArrayType::get(floatArray, 1);
    PointerType *constsArrayPtr = PointerType::get(constsArray, 0);
 
    funcArgs.push_back(vectorArrayPtr);//inputs
@@ -246,6 +246,7 @@ translate_instruction(llvm::Module *module,
          val = storage->constElement(src->SrcRegister.Index, indIdx);
       } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
          val = storage->inputElement(src->SrcRegister.Index, indIdx);
+      // FIXME we should not be generating elements for temporaries, this creates useless memory writes
       } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
          val = storage->tempElement(src->SrcRegister.Index);
       } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
@@ -286,9 +287,13 @@ translate_instruction(llvm::Module *module,
       out = instr->rsq(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_EXP: {
+      out = instr->exp(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_LOG: {
+      out = instr->log(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_MUL: {
       out = instr->mul(inputs[0], inputs[1]);
@@ -338,21 +343,31 @@ translate_instruction(llvm::Module *module,
       out = instr->lerp(inputs[0], inputs[1], inputs[2]);
    }
       break;
-   case TGSI_OPCODE_CND:
+   case TGSI_OPCODE_CND: {
+      out = instr->cnd(inputs[0], inputs[1], inputs[2]);
+   }
       break;
-   case TGSI_OPCODE_CND0:
+   case TGSI_OPCODE_CND0: {
+      out = instr->cnd0(inputs[0], inputs[1], inputs[2]);
+   }
       break;
-   case TGSI_OPCODE_DOT2ADD:
+   case TGSI_OPCODE_DOT2ADD: {
+      out = instr->dot2add(inputs[0], inputs[1], inputs[2]);
+   }
       break;
    case TGSI_OPCODE_INDEX:
       break;
-   case TGSI_OPCODE_NEGATE:
+   case TGSI_OPCODE_NEGATE: {
+      out = instr->neg(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_FRAC: {
       out = instr->frc(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_CLAMP:
+   case TGSI_OPCODE_CLAMP: {
+      out = instr->clamp(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_FLOOR: {
       out = instr->floor(inputs[0]);
@@ -392,9 +407,13 @@ translate_instruction(llvm::Module *module,
       out = instr->cos(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDX: {
+      out = instr->ddx(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_DDY:
+   case TGSI_OPCODE_DDY: {
+      out = instr->ddy(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_KILP:
       break;
@@ -408,9 +427,13 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_RFL:
       break;
-   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SEQ: {
+      out = instr->seq(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_SFL:
+   case TGSI_OPCODE_SFL: {
+      out = instr->sfl(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_SGT: {
       out = instr->sgt(inputs[0], inputs[1]);
@@ -420,11 +443,17 @@ translate_instruction(llvm::Module *module,
       out = instr->sin(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SLE: {
+      out = instr->sle(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SNE: {
+      out = instr->sne(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_STR:
+   case TGSI_OPCODE_STR: {
+      out = instr->str(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_TEX:
       break;
@@ -438,7 +467,9 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_UP4UB:
       break;
-   case TGSI_OPCODE_X2D:
+   case TGSI_OPCODE_X2D: {
+      out = instr->x2d(inputs[0], inputs[1], inputs[2]);
+   }
       break;
    case TGSI_OPCODE_ARA:
       break;
@@ -468,11 +499,18 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_TXB:
       break;
-   case TGSI_OPCODE_NRM:
+   case TGSI_OPCODE_NRM4:
+   case TGSI_OPCODE_NRM: {
+      out = instr->nrm(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_DIV:
+   case TGSI_OPCODE_DIV: {
+      out = instr->div(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_DP2:
+   case TGSI_OPCODE_DP2: {
+      out = instr->dp2(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_TXL:
       break;
@@ -590,8 +628,6 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_M3X2:
       break;
-   case TGSI_OPCODE_NRM4:
-      break;
    case TGSI_OPCODE_CALLNZ:
       break;
    case TGSI_OPCODE_IFC:
@@ -641,6 +677,7 @@ translate_instruction(llvm::Module *module,
 
       if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
          storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      // FIXME we should not be generating elements for temporaries, this creates useless memory writes
       } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
          storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
       } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
@@ -672,9 +709,8 @@ translate_instructionir(llvm::Module *module,
       if (src->SrcRegister.Indirect) {
          indIdx = storage->addrElement(src->SrcRegisterInd.Index);
       }
-
       val = storage->load((enum tgsi_file_type)src->SrcRegister.File,
-                          src->SrcRegister.Index, swizzle, indIdx);
+                          src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx);
 
       inputs[i] = val;
    }
@@ -732,6 +768,7 @@ translate_instructionir(llvm::Module *module,
    }
       break;
    case TGSI_OPCODE_SLT: {
+      out = instr->slt(inputs[0], inputs[1]);
    }
       break;
    case TGSI_OPCODE_SGE: {
@@ -989,7 +1026,6 @@ translate_instructionir(llvm::Module *module,
    /* store results  */
    for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
       struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-
       storage->store((enum tgsi_file_type)dst->DstRegister.File,
                      dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
    }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
index 20fc87b39d..1bf22a2ec0 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
@@ -129,7 +129,7 @@ pb_malloc_buffer_create(size_t size,
 
 
 static struct pb_buffer *
-pb_malloc_buffer_create_buffer(struct pb_manager *mgr, 
+pb_malloc_bufmgr_create_buffer(struct pb_manager *mgr, 
                                size_t size,
                                const struct pb_desc *desc) 
 {
@@ -138,6 +138,13 @@ pb_malloc_buffer_create_buffer(struct pb_manager *mgr,
 
 
 static void
+pb_malloc_bufmgr_flush(struct pb_manager *mgr) 
+{
+   /* No-op */
+}
+
+
+static void
 pb_malloc_bufmgr_destroy(struct pb_manager *mgr) 
 {
    /* No-op */
@@ -146,8 +153,9 @@ pb_malloc_bufmgr_destroy(struct pb_manager *mgr)
 
 static struct pb_manager 
 pb_malloc_bufmgr = {
-   pb_malloc_buffer_create_buffer,
-   pb_malloc_bufmgr_destroy
+   pb_malloc_bufmgr_destroy,
+   pb_malloc_bufmgr_create_buffer,
+   pb_malloc_bufmgr_flush
 };
 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
index 32867029ee..cafbee045a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -69,13 +69,22 @@ struct pipe_winsys;
  */
 struct pb_manager
 {
+   void
+   (*destroy)( struct pb_manager *mgr );
+
    struct pb_buffer *
    (*create_buffer)( struct pb_manager *mgr, 
 	             size_t size,
 	             const struct pb_desc *desc);
 
+   /**
+    * Flush all temporary-held buffers.
+    * 
+    * Used mostly to aid debugging memory issues or to clean up resources when 
+    * the drivers are long lived.
+    */
    void
-   (*destroy)( struct pb_manager *mgr );
+   (*flush)( struct pb_manager *mgr );
 };
 
 
@@ -153,9 +162,6 @@ struct pb_manager *
 pb_cache_manager_create(struct pb_manager *provider, 
                      	unsigned usecs); 
 
-void
-pb_cache_flush(struct pb_manager *mgr);
-
 
 /** 
  * Fenced buffer manager.
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
index 2afaeafa1a..c956924cc7 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
@@ -76,6 +76,21 @@ pb_alt_manager_create_buffer(struct pb_manager *_mgr,
 
 
 static void
+pb_alt_manager_flush(struct pb_manager *_mgr)
+{
+   struct pb_alt_manager *mgr = pb_alt_manager(_mgr);
+   
+   assert(mgr->provider1->flush);
+   if(mgr->provider1->flush)
+      mgr->provider1->flush(mgr->provider1);
+   
+   assert(mgr->provider2->flush);
+   if(mgr->provider2->flush)
+      mgr->provider2->flush(mgr->provider2);
+}
+
+
+static void
 pb_alt_manager_destroy(struct pb_manager *mgr)
 {
    FREE(mgr);
@@ -97,6 +112,7 @@ pb_alt_manager_create(struct pb_manager *provider1,
 
    mgr->base.destroy = pb_alt_manager_destroy;
    mgr->base.create_buffer = pb_alt_manager_create_buffer;
+   mgr->base.flush = pb_alt_manager_flush;
    mgr->provider1 = provider1;
    mgr->provider2 = provider2;
       
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 1ec422fb19..8f118874ec 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -306,8 +306,8 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
 }
 
 
-void
-pb_cache_flush(struct pb_manager *_mgr)
+static void
+pb_cache_manager_flush(struct pb_manager *_mgr)
 {
    struct pb_cache_manager *mgr = pb_cache_manager(_mgr);
    struct list_head *curr, *next;
@@ -323,13 +323,17 @@ pb_cache_flush(struct pb_manager *_mgr)
       next = curr->next;
    }
    pipe_mutex_unlock(mgr->mutex);
+   
+   assert(mgr->provider->flush);
+   if(mgr->provider->flush)
+      mgr->provider->flush(mgr->provider);
 }
 
 
 static void
 pb_cache_manager_destroy(struct pb_manager *mgr)
 {
-   pb_cache_flush(mgr);
+   pb_cache_manager_flush(mgr);
    FREE(mgr);
 }
 
@@ -349,6 +353,7 @@ pb_cache_manager_create(struct pb_manager *provider,
 
    mgr->base.destroy = pb_cache_manager_destroy;
    mgr->base.create_buffer = pb_cache_manager_create_buffer;
+   mgr->base.flush = pb_cache_manager_flush;
    mgr->provider = provider;
    mgr->usecs = usecs;
    LIST_INITHEAD(&mgr->delayed);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 5f1ed3e5a8..1675e6e182 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -314,6 +314,16 @@ pb_debug_manager_create_buffer(struct pb_manager *_mgr,
 
 
 static void
+pb_debug_manager_flush(struct pb_manager *_mgr)
+{
+   struct pb_debug_manager *mgr = pb_debug_manager(_mgr);
+   assert(mgr->provider->flush);
+   if(mgr->provider->flush)
+      mgr->provider->flush(mgr->provider);
+}
+
+
+static void
 pb_debug_manager_destroy(struct pb_manager *_mgr)
 {
    struct pb_debug_manager *mgr = pb_debug_manager(_mgr);
@@ -336,6 +346,7 @@ pb_debug_manager_create(struct pb_manager *provider, size_t band_size)
 
    mgr->base.destroy = pb_debug_manager_destroy;
    mgr->base.create_buffer = pb_debug_manager_create_buffer;
+   mgr->base.flush = pb_debug_manager_flush;
    mgr->provider = provider;
    mgr->band_size = band_size;
       
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index 8fc63ce648..633ee70a75 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -95,6 +95,19 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
 
 
 static void
+fenced_bufmgr_flush(struct pb_manager *mgr)
+{
+   struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr);
+
+   fenced_buffer_list_check_free(fenced_mgr->fenced_list, TRUE);
+
+   assert(fenced_mgr->provider->flush);
+   if(fenced_mgr->provider->flush)
+      fenced_mgr->provider->flush(fenced_mgr->provider);
+}
+
+
+static void
 fenced_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr);
@@ -123,6 +136,7 @@ fenced_bufmgr_create(struct pb_manager *provider,
 
    fenced_mgr->base.destroy = fenced_bufmgr_destroy;
    fenced_mgr->base.create_buffer = fenced_bufmgr_create_buffer;
+   fenced_mgr->base.flush = fenced_bufmgr_flush;
 
    fenced_mgr->provider = provider;
    fenced_mgr->fenced_list = fenced_buffer_list_create(winsys);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index e8c7f8e1f8..fe80ca30ee 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -200,6 +200,13 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
 
 
 static void
+mm_bufmgr_flush(struct pb_manager *mgr)
+{
+   /* No-op */
+}
+
+
+static void
 mm_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct mm_pb_manager *mm = mm_pb_manager(mgr);
@@ -230,8 +237,9 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    if (!mm)
       return NULL;
 
-   mm->base.create_buffer = mm_bufmgr_create_buffer;
    mm->base.destroy = mm_bufmgr_destroy;
+   mm->base.create_buffer = mm_bufmgr_create_buffer;
+   mm->base.flush = mm_bufmgr_flush;
 
    mm->size = size;
    mm->align2 = align2; /* 64-byte alignment */
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index 3ef72c5bbb..61ac291ed7 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -203,6 +203,13 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
 
 
 static void
+pool_bufmgr_flush(struct pb_manager *mgr)
+{
+   /* No-op */
+}
+
+
+static void
 pool_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct pool_pb_manager *pool = pool_pb_manager(mgr);
@@ -238,6 +245,7 @@ pool_bufmgr_create(struct pb_manager *provider,
 
    pool->base.destroy = pool_bufmgr_destroy;
    pool->base.create_buffer = pool_bufmgr_create_buffer;
+   pool->base.flush = pool_bufmgr_flush;
 
    LIST_INITHEAD(&pool->free);
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index 8698c4cff6..2a80154920 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -407,6 +407,17 @@ pb_slab_manager_create_buffer(struct pb_manager *_mgr,
 
 
 static void
+pb_slab_manager_flush(struct pb_manager *_mgr)
+{
+   struct pb_slab_manager *mgr = pb_slab_manager(_mgr);
+
+   assert(mgr->provider->flush);
+   if(mgr->provider->flush)
+      mgr->provider->flush(mgr->provider);
+}
+
+
+static void
 pb_slab_manager_destroy(struct pb_manager *_mgr)
 {
    struct pb_slab_manager *mgr = pb_slab_manager(_mgr);
@@ -430,6 +441,7 @@ pb_slab_manager_create(struct pb_manager *provider,
 
    mgr->base.destroy = pb_slab_manager_destroy;
    mgr->base.create_buffer = pb_slab_manager_create_buffer;
+   mgr->base.flush = pb_slab_manager_flush;
 
    mgr->provider = provider;
    mgr->bufSize = bufSize;
@@ -466,6 +478,19 @@ pb_slab_range_manager_create_buffer(struct pb_manager *_mgr,
 
 
 static void
+pb_slab_range_manager_flush(struct pb_manager *_mgr)
+{
+   struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr);
+
+   /* Individual slabs don't hold any temporary buffers so no need to call them */
+   
+   assert(mgr->provider->flush);
+   if(mgr->provider->flush)
+      mgr->provider->flush(mgr->provider);
+}
+
+
+static void
 pb_slab_range_manager_destroy(struct pb_manager *_mgr)
 {
    struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr);
@@ -499,6 +524,7 @@ pb_slab_range_manager_create(struct pb_manager *provider,
 
    mgr->base.destroy = pb_slab_range_manager_destroy;
    mgr->base.create_buffer = pb_slab_range_manager_create_buffer;
+   mgr->base.flush = pb_slab_range_manager_flush;
 
    mgr->provider = provider;
    mgr->minBufSize = minBufSize;
diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index 39b8a4dbd7..252dc5274a 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -7,6 +7,7 @@ C_SOURCES = \
 	rtasm_cpu.c \
 	rtasm_execmem.c \
 	rtasm_x86sse.c \
+	rtasm_ppc.c \
 	rtasm_ppc_spe.c
 
 include ../../Makefile.template
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index 8ea25922aa..eb48368acc 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -6,6 +6,7 @@ rtasm = env.ConvenienceLibrary(
 		'rtasm_cpu.c',
 		'rtasm_execmem.c',
 		'rtasm_x86sse.c',
+		'rtasm_ppc.c',
 		'rtasm_ppc_spe.c',
 	])
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
new file mode 100644
index 0000000000..7dd8263749
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -0,0 +1,924 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * For reference, see http://www.power.org/resources/reading/PowerISA_V2.05.pdf
+ * ABI info: http://www.cs.utsa.edu/~whaley/teach/cs6463FHPO/LEC/lec12_ho.pdf
+ *
+ * Other PPC refs:
+ * http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF778525699600719DF2
+ * http://www.ibm.com/developerworks/eserver/library/es-archguide-v2.html
+ * http://www.freescale.com/files/product/doc/MPCFPE32B.pdf
+ *
+ * \author Brian Paul
+ */
+
+
+#include <stdio.h>
+#include "util/u_memory.h"
+#include "pipe/p_debug.h"
+#include "rtasm_ppc.h"
+
+
+void
+ppc_init_func(struct ppc_function *p, unsigned max_inst)
+{
+   uint i;
+
+   p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
+   p->num_inst = 0;
+   p->max_inst = max_inst;
+   p->reg_used = 0x0;
+   p->fp_used = 0x0;
+   p->vec_used = 0x0;
+
+   /* only allow using gp registers 3..11 for now (0..2 and 12 and up are reserved below) */
+   for (i = 0; i < 3; i++)
+      ppc_reserve_register(p, i);
+   for (i = 12; i < PPC_NUM_REGS; i++)
+      ppc_reserve_register(p, i);
+}
+
+
+void
+ppc_release_func(struct ppc_function *p)
+{
+   assert(p->num_inst <= p->max_inst);
+   if (p->store != NULL) {
+      align_free(p->store);
+   }
+   p->store = NULL;
+}
+
+
+void (*ppc_get_func(struct ppc_function *p))(void)
+{
+#if 0
+   DUMP_END();
+   if (DISASSEM && p->store)
+      debug_printf("disassemble %p %p\n", p->store, p->csr);
+
+   if (p->store == p->error_overflow)
+      return (void (*)(void)) NULL;
+   else
+#endif
+      return (void (*)(void)) p->store;
+}
+
+
+void
+ppc_dump_func(const struct ppc_function *p)
+{
+   uint i;
+   for (i = 0; i < p->num_inst; i++) {
+      debug_printf("%3u: 0x%08x\n", i, p->store[i]);
+   }
+}
+
+
+/**
+ * Mark a general purpose register as being unavailable; returns \p reg unchanged.
+ */
+int
+ppc_reserve_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_REGS);
+   p->reg_used |= ((uint64_t) 1 << reg);   /* widen first: 1 << 31 overflows int */
+   return reg;
+}
+
+
+/**
+ * Allocate a general purpose register: claims the lowest-numbered one whose bit is clear in reg_used.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_REGS; i++) {
+      const uint64_t mask = (uint64_t) 1 << i;   /* shift in 64 bits, not in int */
+      if ((p->reg_used & mask) == 0) {
+         p->reg_used |= mask;
+         return i;
+      }
+   }
+   return -1;
+}
+
+
+/**
+ * Mark the given general purpose register as "unallocated" (it must currently be marked used).
+ */
+void
+ppc_release_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_REGS);
+   assert(p->reg_used & ((uint64_t) 1 << reg));
+   p->reg_used &= ~((uint64_t) 1 << reg);   /* 64-bit mask keeps every other bit intact */
+}
+
+
+/**
+ * Allocate a floating point register: claims the lowest-numbered one whose bit is clear in fp_used.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_fp_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_FP_REGS; i++) {
+      const uint64_t mask = (uint64_t) 1 << i;   /* shift in 64 bits, not in int */
+      if ((p->fp_used & mask) == 0) {
+         p->fp_used |= mask;
+         return i;
+      }
+   }
+   return -1;
+}
+
+
+/**
+ * Mark the given floating point register as "unallocated" (it must currently be marked used).
+ */
+void
+ppc_release_fp_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_FP_REGS);
+   assert(p->fp_used & ((uint64_t) 1 << reg));
+   p->fp_used &= ~((uint64_t) 1 << reg);   /* 64-bit mask keeps every other bit intact */
+}
+
+
+/**
+ * Allocate a vector register: claims the lowest-numbered one whose bit is clear in vec_used.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
+      const uint64_t mask = (uint64_t) 1 << i;   /* shift in 64 bits, not in int */
+      if ((p->vec_used & mask) == 0) {
+         p->vec_used |= mask;
+         return i;
+      }
+   }
+   return -1;
+}
+
+
+/**
+ * Mark the given vector register as "unallocated" (it must currently be marked used).
+ */
+void
+ppc_release_vec_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_VEC_REGS);
+   assert(p->vec_used & ((uint64_t) 1 << reg));
+   p->vec_used &= ~((uint64_t) 1 << reg);   /* 64-bit mask keeps every other bit intact */
+}
+
+
+
+union vx_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned op2:11;
+   } inst;
+};
+
+static INLINE void   /* append one union vx_inst word (primary opcode 4) to p->store */
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vx_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.op2 = op2;
+   assert(p->num_inst < p->max_inst);   /* verify there is room before writing */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+
+union vxr_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned rC:1;
+      unsigned op2:10;
+   } inst;
+};
+
+static INLINE void   /* append one union vxr_inst word (primary opcode 4, rC = 0) to p->store */
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vxr_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.rC = 0;
+   inst.inst.op2 = op2;
+   assert(p->num_inst < p->max_inst);   /* verify there is room before writing */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+
+union va_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned vC:5;
+      unsigned op2:6;
+   } inst;
+};
+
+static INLINE void   /* append one union va_inst word (primary opcode 4) to p->store */
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+{
+   union va_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.vC = vC;
+   inst.inst.op2 = op2;
+   assert(p->num_inst < p->max_inst);   /* verify there is room before writing */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+
+union i_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned li:24;
+      unsigned aa:1;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void   /* append one union i_inst word to p->store */
+emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
+{
+   union i_inst inst;
+   inst.inst.op = op;
+   inst.inst.li = li;
+   inst.inst.aa = aa;
+   inst.inst.lk = lk;
+   assert(p->num_inst < p->max_inst);   /* verify there is room before writing */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+
+union xl_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned bo:5;
+      unsigned bi:5;
+      unsigned unused:3;
+      unsigned bh:2;
+      unsigned op2:10;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void   /* append one union xl_inst word to p->store */
+emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
+        uint op2, uint lk)
+{
+   union xl_inst inst;
+   inst.inst.op = op;
+   inst.inst.bo = bo;
+   inst.inst.bi = bi;
+   inst.inst.unused = 0x0;
+   inst.inst.bh = bh;
+   inst.inst.op2 = op2;
+   inst.inst.lk = lk;
+   assert(p->num_inst < p->max_inst);   /* verify there is room before writing */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+static INLINE void
+dump_xl(const char *name, uint inst)
+{
+   union xl_inst i;
+
+   i.bits = inst;
+   debug_printf("%s = 0x%08x\n", name, inst);
+   debug_printf(" op: %d 0x%x\n", i.inst.op, i.inst.op);
+   debug_printf(" bo: %d 0x%x\n", i.inst.bo, i.inst.bo);
+   debug_printf(" bi: %d 0x%x\n", i.inst.bi, i.inst.bi);
+   debug_printf(" unused: %d 0x%x\n", i.inst.unused, i.inst.unused);
+   debug_printf(" bh: %d 0x%x\n", i.inst.bh, i.inst.bh);
+   debug_printf(" op2: %d 0x%x\n", i.inst.op2, i.inst.op2);
+   debug_printf(" lk: %d 0x%x\n", i.inst.lk, i.inst.lk);
+}
+
+
+union x_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vrs:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned op2:10;
+      unsigned unused:1;
+   } inst;
+};
+
+static INLINE void   /* append one union x_inst word to p->store */
+emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
+{
+   union x_inst inst;
+   inst.inst.op = op;
+   inst.inst.vrs = vrs;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
+   inst.inst.op2 = op2;
+   inst.inst.unused = 0x0;
+   assert(p->num_inst < p->max_inst);   /* verify there is room before writing */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+
+union d_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned si:16;
+   } inst;
+};
+
+static INLINE void   /* append one union d_inst word to p->store; si must fit in 16 signed bits */
+emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
+{
+   union d_inst inst;
+   assert(si >= -32768);
+   assert(si <= 32767);
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.si = (unsigned) (si & 0xffff);
+   assert(p->num_inst < p->max_inst);   /* verify there is room before writing */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+
+union a_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned frt:5;
+      unsigned fra:5;
+      unsigned frb:5;
+      unsigned unused:5;
+      unsigned op2:5;
+      unsigned rc:1;
+   } inst;
+};
+
+static INLINE void   /* append one union a_inst word to p->store */
+emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
+       uint rc)
+{
+   union a_inst inst;
+   inst.inst.op = op;
+   inst.inst.frt = frt;
+   inst.inst.fra = fra;
+   inst.inst.frb = frb;
+   inst.inst.unused = 0x0;
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   assert(p->num_inst < p->max_inst);   /* verify there is room before writing */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+
+union xo_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned oe:1;
+      unsigned op2:9;
+      unsigned rc:1;
+   } inst;
+};
+
+static INLINE void   /* append one union xo_inst word to p->store */
+emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
+        uint op2, uint rc)
+{
+   union xo_inst inst;
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
+   inst.inst.oe = oe;
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   assert(p->num_inst < p->max_inst);   /* verify there is room before writing */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+void
+ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 10, vD, vA, vB);
+}
+
+/** vector float subtract */
+void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 74, vD, vA, vB);
+}
+
+/** vector float min */
+void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1098, vD, vA, vB);
+}
+
+/** vector float max */
+void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1034, vD, vA, vB);
+}
+
+/** vector float mult add: vD = vA * vB + vC */
+void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
+}
+
+/** vector float compare greater than */
+void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 710, vD, vA, vB);
+}
+
+/** vector float compare greater than or equal to */
+void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 454, vD, vA, vB);
+}
+
+/** vector float compare equal */
+void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 198, vD, vA, vB);
+}
+
+/** vector float 2^x */
+void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 394, vD, 0, vB);
+}
+
+/** vector float log2(x) */
+void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 458, vD, 0, vB);
+}
+
+/** vector float reciprocal */
+void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 266, vD, 0, vB);
+}
+
+/** vector float reciprocal sqrt estimate */
+void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 330, vD, 0, vB);
+}
+
+/** vector float round to negative infinity */
+void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 714, vD, 0, vB);
+}
+
+/** vector float round to positive infinity */
+void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 650, vD, 0, vB);
+}
+
+/** vector float round to nearest int */
+void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 522, vD, 0, vB);
+}
+
+/** vector float round to int toward zero */
+void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 586, vD, 0, vB);
+}
+
+/** vector store: store vR at mem[vA+vB] */
+void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 231);
+}
+
+/** vector load: vR = mem[vA+vB] */
+void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 103);
+}
+
+/** load vector element word: vR = mem_word[ra+rb] */
+void
+ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
+{
+   emit_x(p, 31, vr, ra, rb, 71);
+}
+
+
+
+
+/**
+ ** vector bitwise operations
+ **/
+
+/** vector and */
+void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1028, vD, vA, vB);
+}
+
+/** vector and complement */
+void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1092, vD, vA, vB);
+}
+
+/** vector or */
+void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1156, vD, vA, vB);
+}
+
+/** vector nor */
+void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1284, vD, vA, vB);
+}
+
+/** vector xor */
+void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1220, vD, vA, vB);
+}
+
+/** Pseudo-instruction: vector move */
+void
+ppc_vmove(struct ppc_function *p, uint vD, uint vA)
+{
+   ppc_vor(p, vD, vA, vA);
+}
+
+/** Set vector register to {0,0,0,0} */
+void
+ppc_vzero(struct ppc_function *p, uint vr)
+{
+   ppc_vxor(p, vr, vr, vr);
+}
+
+
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 43, vD, vA, vB, vC);
+}
+
+/** vector select */
+void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 42, vD, vA, vB, vC);
+}
+
+/** vector splat byte; imm is placed in the vA slot of the VX word */
+void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 524, vD, imm, vB);   /* vspltb is VX 524; 42 is vsel's VA-form opcode */
+}
+
+/** vsplth: splat half word 'imm' of vB into every half word of vD */
+void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 588, vD, imm, vB);
+}
+
+/** vspltw: splat word 'imm' of vB into every word of vD */
+void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 652, vD, imm, vB);
+}
+
+/** vector splat signed immediate word: every word of vD = imm (-16..15) */
+void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm)
+{
+   assert(imm >= -16);
+   assert(imm <= 15);   /* SIMM is a 5-bit signed field, so 15 is valid */
+   emit_vx(p, 908, vD, imm, 0);
+}
+
+/** vslw, per 32-bit word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 388, vD, vA, vB);
+}
+
+
+
+
+/**
+ ** integer arithmetic
+ **/
+
+/** addi: rt = ra + imm; ppc_li passes ra = 0 to get rt = imm */
+void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 14, rt, ra, imm);
+}
+
+/** addis: rt = ra + (imm << 16); ppc_lis passes ra = 0 for rt = imm << 16 */
+void
+ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 15, rt, ra, imm);
+}
+
+/** add: rt = ra + rb (register-register form, emitted via emit_xo) */
+void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_xo(p, 31, rt, ra, rb, 0, 266, 0);
+}
+
+/** rt = ra AND rb */
+void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 28);  /* note: source ra is passed before dest rt */
+}
+
+/** rt = ra AND imm; NOTE(review): opcode 28 is "andi." (sets CR0) - confirm */
+void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 28, ra, rt, imm);  /* note: source ra is passed before dest rt */
+}
+
+/** rt = ra OR rb (also used by ppc_mr) */
+void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 444);  /* note: source ra is passed before dest rt */
+}
+
+/** ori: rt = ra OR imm (used by ppc_load_int for the low half) */
+void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 24, ra, rt, imm);  /* note: source ra is passed before dest rt */
+}
+
+/** rt = ra XOR rb */
+void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 316);  /* note: source ra is passed before dest rt */
+}
+
+/** xori: rt = ra XOR imm */
+void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 26, ra, rt, imm);  /* note: source ra is passed before dest rt */
+}
+
+/** pseudo instruction: move rt = ra, emitted as or rt, ra, ra */
+void
+ppc_mr(struct ppc_function *p, uint rt, uint ra)
+{
+   ppc_or(p, rt, ra, ra);
+}
+
+/** pseudo instruction: load immediate rt = imm, emitted as addi rt, 0, imm */
+void
+ppc_li(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addi(p, rt, 0, imm);
+}
+
+/** pseudo instruction: rt = imm << 16, emitted as addis rt, 0, imm */
+void
+ppc_lis(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addis(p, rt, 0, imm);
+}
+
+/** rt = imm, emitted as lis with imm >> 16, then ori with imm & 0xffff */
+void
+ppc_load_int(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_lis(p, rt, (imm >> 16));          /* rt = (imm >> 16) << 16 */
+   ppc_ori(p, rt, rt, (imm & 0xffff));   /* rt = rt | (imm & 0xffff) */
+}
+
+
+
+
+/**
+ ** integer load/store
+ **/
+
+/** stwu (store word with update): store rs at memory[(ra)+d],
+ * then update ra = (ra)+d
+ */
+void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 37, rs, ra, d);
+}
+
+/** stw: store word rs at memory[(ra)+d] (no update of ra, cf. ppc_stwu) */
+void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 36, rs, ra, d);
+}
+
+/** Load rt = mem[(ra)+d], then set the high 32 bits of rt to zero. */
+void
+ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d)
+{
+   emit_d(p, 32, rt, ra, d);
+}
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+/** fadd: frt = fra + frb (scalar floating-point registers) */
+void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 21, 0);
+}
+
+/** fsub: frt = fra - frb (scalar floating-point registers) */
+void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 20, 0);
+}
+
+/** convert to int: rt = (int) fra; NOTE(review): rt is presumably an FPR */
+void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint fra)
+{
+   emit_x(p, 63, rt, 0, fra, 15);
+}
+
+/** stfs: store float frs at mem[(ra)+offset] */
+void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset)
+{
+   emit_d(p, 52, frs, ra, offset);
+}
+
+/** stfiwx (store float as integer word indexed): frs to mem[(ra)+(rb)] */
+void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb)
+{
+   emit_x(p, 31, frs, ra, rb, 983);
+}
+
+/** lfs: load float frt = mem[(ra)+offset] */
+void
+ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset)
+{
+   emit_d(p, 48, frt, ra, offset);
+}
+
+
+
+
+
+/**
+ ** branch instructions
+ **/
+
+/** BLR: Branch to link register (p. 35): unconditional bclr, as ppc_return */
+void
+ppc_blr(struct ppc_function *p)
+{
+   ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0);
+}
+
+/** bclr: Branch Conditional to Link Register (p. 36) */
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg)
+{
+   emit_xl(p, 19, condOp, condReg, branchHint, 16, 0);  /* condReg before hint */
+}
+
+/** Pseudo instruction: return from subroutine (bclr with BRANCH_COND_ALWAYS) */
+void
+ppc_return(struct ppc_function *p)
+{
+   ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0);
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
new file mode 100644
index 0000000000..f938d8d759
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -0,0 +1,324 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#ifndef RTASM_PPC_H
+#define RTASM_PPC_H
+
+
+#include "pipe/p_compiler.h"
+
+
+#define PPC_INST_SIZE 4  /**< 4 bytes / instruction */
+
+#define PPC_NUM_REGS 32
+#define PPC_NUM_FP_REGS 32
+#define PPC_NUM_VEC_REGS 32
+
+/** Stack pointer register */
+#define PPC_REG_SP 1
+
+/** Branch conditions */
+#define BRANCH_COND_ALWAYS       0x14  /* binary 1z1zz (z=ignored) */
+
+/** Branch hints */
+#define BRANCH_HINT_SUB_RETURN   0x0   /* binary 00 */
+
+
+struct ppc_function
+{
+   uint32_t *store;  /**< instruction buffer */
+   uint num_inst;    /**< presumably: instructions emitted so far - confirm */
+   uint max_inst;    /**< presumably: capacity of store, in instructions */
+   uint32_t reg_used;   /**< used/free general-purpose registers bitmask */
+   uint32_t fp_used;   /**< used/free floating point registers bitmask */
+   uint32_t vec_used;   /**< used/free vector registers bitmask */
+};
+
+
+
+extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
+extern void ppc_release_func(struct ppc_function *p);
+extern void (*ppc_get_func( struct ppc_function *p ))( void );
+extern void ppc_dump_func(const struct ppc_function *p);
+
+extern int ppc_reserve_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_register(struct ppc_function *p);
+extern void ppc_release_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_fp_register(struct ppc_function *p);
+extern void ppc_release_fp_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_vec_register(struct ppc_function *p);
+extern void ppc_release_vec_register(struct ppc_function *p, int reg);
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+extern void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB);
+
+/** vector float subtract */
+extern void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float min */
+extern void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float max */
+extern void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float mult add */
+extern void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector float compare greater than */
+extern void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare greater than or equal to */
+extern void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare equal */
+extern void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float 2^x */
+extern void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float log2(x) */
+extern void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocal */
+extern void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocal sqrt estimate */
+extern void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to negative infinity */
+extern void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to positive infinity */
+extern void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to nearest int */
+extern void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to int toward zero */
+extern void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
+
+
+/** vector store: store vR at mem[vA+vB] */
+extern void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+/** vector load: vR = mem[vA+vB] */
+extern void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+/** load vector element word: vR = mem_word[vA+vB] */
+extern void
+ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+
+
+/**
+ ** vector bitwise operations
+ **/
+
+
+/** vector and */
+extern void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector and complement */
+extern void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector or */
+extern void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector nor */
+extern void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector xor */
+extern void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** Pseudo-instruction: vector move */
+extern void
+ppc_vmove(struct ppc_function *p, uint vD, uint vA);
+
+/** Set vector register to {0,0,0,0} */
+extern void
+ppc_vzero(struct ppc_function *p, uint vr);
+
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+extern void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector select */
+extern void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector splat byte */
+extern void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat half word */
+extern void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat word */
+extern void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat signed immediate word */
+extern void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm);
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+extern void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+
+/**
+ ** scalar arithmetic
+ **/
+
+extern void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_mr(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_li(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_lis(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_load_int(struct ppc_function *p, uint rt, int imm);
+
+
+
+/**
+ ** scalar load/store
+ **/
+
+extern void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_lwz(struct ppc_function *p, uint rs, uint ra, int d);
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+extern void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset);
+
+extern void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb);
+
+
+
+/**
+ ** branch instructions
+ **/
+
+extern void
+ppc_blr(struct ppc_function *p);
+
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg);
+
+extern void
+ppc_return(struct ppc_function *p);
+
+
+#endif /* RTASM_PPC_H */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a04cc6c4ff..dea1aed032 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -27,12 +27,16 @@
  * Real-time assembly generation interface for Cell B.E. SPEs.
  *
  * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
+
+#include <stdio.h>
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
 #include "rtasm_ppc_spe.h"
 
+
 #ifdef GALLIUM_CELL
 /**
  * SPE instruction types
@@ -143,8 +147,46 @@ union spe_inst_RI18 {
 /*@}*/
 
 
+/** Write p->indent spaces to stdout (prefix for a printed instruction). */
+static void indent(const struct spe_function *p)
+{
+   int remaining = p->indent;
+   while (remaining-- > 0) {
+      putchar(' ');
+   }
+}
+
+
+/** Skip the first four characters (the "spe_" of an emitter's name). */
+static const char *rem_prefix(const char *longname)
+{
+   return &longname[4];
+}
+
+
+/** Printable name for register 'reg': "$sp", "$lr", or "$<number>". */
+static const char *reg_name(int reg)
+{
+   switch (reg) {
+   case SPE_REG_SP:
+      return "$sp";
+   case SPE_REG_RA:
+      return "$lr";
+   default:
+      {
+         /* four rotating buffers: up to four names per printf stay valid */
+         static char buf[4][16];  /* "$%d" of any int fits in 16 bytes */
+         static int b = 0;
+         b = (b + 1) % 4;
+         snprintf(buf[b], sizeof(buf[b]), "$%d", reg);
+         return buf[b];
+      }
+   }
+}
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB)
+		    unsigned rA, unsigned rB, const char *name)
 {
     union spe_inst_RR inst;
     inst.inst.op = op;
@@ -153,11 +195,16 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, %s\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
+    }
 }
 
 
 static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB, unsigned rC)
+                     unsigned rA, unsigned rB, unsigned rC, const char *name)
 {
     union spe_inst_RRR inst;
     inst.inst.op = op;
@@ -167,11 +214,16 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rC = rC;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
+              reg_name(rA), reg_name(rB), reg_name(rC));
+    }
 }
 
 
 static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI7 inst;
     inst.inst.op = op;
@@ -180,12 +232,17 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+    }
 }
 
 
 
 static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI8 inst;
     inst.inst.op = op;
@@ -194,12 +251,17 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+    }
 }
 
 
 
 static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
-		      unsigned rA, int imm)
+		      unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI10 inst;
     inst.inst.op = op;
@@ -208,11 +270,26 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+    }
+}
+
+
+/** Like emit_RI10(), but first assert that imm fits the signed field. */
+static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
+                       unsigned rA, int imm, const char *name)
+{
+    /* a signed 10-bit immediate spans [-512, 511] */
+    assert(-512 <= imm && imm <= 511);
+    emit_RI10(p, op, rT, rA, imm, name);
 }
 
 
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI16 inst;
     inst.inst.op = op;
@@ -220,11 +297,15 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
+    }
 }
 
 
 static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI18 inst;
     inst.inst.op = op;
@@ -232,6 +313,10 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
+    }
 }
 
 
@@ -240,80 +325,97 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
 { \
-    emit_RR(p, _op, rT, 0, 0); \
+   emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
 }
 
 #define EMIT_R(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA) \
 { \
-    emit_RR(p, _op, rT, rA, 0); \
+   emit_RR(p, _op, rT, rA, 0, __FUNCTION__);                 \
 }
 
 #define EMIT_RR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
 { \
-    emit_RR(p, _op, rT, rA, rB); \
+   emit_RR(p, _op, rT, rA, rB, __FUNCTION__);                \
 }
 
 #define EMIT_RRR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
 { \
-    emit_RRR(p, _op, rT, rA, rB, rC); \
+   emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__);           \
 }
 
 #define EMIT_RI7(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI7(p, _op, rT, rA, imm); \
+   emit_RI7(p, _op, rT, rA, imm, __FUNCTION__);              \
 }
 
 #define EMIT_RI8(_name, _op, bias) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI8(p, _op, rT, rA, bias - imm); \
+   emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__);       \
 }
 
 #define EMIT_RI10(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI10(p, _op, rT, rA, imm); \
+   emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
+}
+
+#define EMIT_RI10s(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+   emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI16(p, _op, rT, imm); \
+   emit_RI16(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_RI18(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI18(p, _op, rT, imm); \
+   emit_RI18(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_I16(_name, _op) \
 void _name (struct spe_function *p, int imm) \
 { \
-    emit_RI16(p, _op, 0, imm); \
+   emit_RI16(p, _op, 0, imm, __FUNCTION__);                  \
 }
 
 #include "rtasm_ppc_spe.h"
 
 
+
 /**
  * Initialize an spe_function.
  * \param code_size  size of instruction buffer to allocate, in bytes.
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
+    unsigned int i;
+
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
 
+    p->set_count = 0;
+    memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
-    p->regs[0] = ~7;
-    p->regs[1] = (1U << (80 - 64)) - 1;
+    p->regs[0] = p->regs[1] = p->regs[2] = 1;
+    for (i = 80; i <= 127; i++) {
+      p->regs[i] = 1;
+    }
+
+    p->print = false;
+    p->indent = 0;
 }
 
 
@@ -327,20 +429,23 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+/** Bytes of code emitted so far: instruction count times SPE_INST_SIZE. */
+unsigned spe_code_size(const struct spe_function *p)
+{
+   return SPE_INST_SIZE * p->num_inst;
+}
+
+
 /**
- * Alloate a SPE register.
+ * Allocate a SPE register.
  * \return register index or -1 if none left.
  */
 int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      const uint64_t mask = (1ULL << (i % 64));
-      const unsigned idx = i / 64;
-
-      assert(idx < 2);
-      if ((p->regs[idx] & mask) != 0) {
-         p->regs[idx] &= ~mask;
+      if (p->regs[i] == 0) {
+         p->regs[i] = 1;
          return i;
       }
    }
@@ -354,31 +459,160 @@ int spe_allocate_available_register(struct spe_function *p)
  */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
-
    assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) != 0);
-
-   p->regs[idx] &= ~(1ULL << bit);
+   assert(p->regs[reg] == 0);
+   p->regs[reg] = 1;
    return reg;
 }
 
 
 /**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated".  Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
+   assert(reg < SPE_NUM_REGS);
+   assert(p->regs[reg] == 1);
 
-   assert(idx < 2);
+   p->regs[reg] = 0;
+}
+
+/**
+ * Open a new (nested) register set.  Registers allocated from now until
+ * the matching spe_release_register_set() call are all freed by that call,
+ * so callers need not track them individually during a code generation
+ * sequence.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+   unsigned int reg;
+
+   /* One more set is now open.  The counter must never wrap back
+    * to zero; the assert below catches that.
+    */
+   p->set_count += 1;
+   assert(p->set_count > 0);
+
+   /* Bump the count of every register already in use.  Registers
+    * allocated inside this set will then be the only ones whose
+    * count is exactly 1, so releasing the set drops them to 0.
+    */
+   reg = 0;
+   while (reg < SPE_NUM_REGS) {
+      p->regs[reg] += (p->regs[reg] > 0) ? 1 : 0;
+      reg++;
+   }
+}
+
+/** Close the innermost register set, freeing registers allocated in it. */
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int reg;
+
+   /* There must be an open set to close. */
+   assert(p->set_count > 0);
+   p->set_count -= 1;
+
+   /* Lower every in-use count by one; registers allocated within
+    * this set sit at 1 and so fall to 0, i.e. become free again. */
+   for (reg = 0; reg < SPE_NUM_REGS; reg++) {
+      if (!(p->regs[reg] > 0))
+         continue;
+      p->regs[reg]--;
+   }
+}
+
+
+/** Store the indices of in-use registers 2..79 in used[]; return how many. */
+unsigned spe_get_registers_used(const struct spe_function *p, ubyte used[])
+{
+   unsigned count = 0;
+   unsigned reg;
+   /* only look at the register range available to callers */
+   for (reg = 2; reg < 80; reg++) {
+      if (p->regs[reg] != 0)
+         used[count++] = reg;
+   }
+   return count;
+}
+
+
+/** Turn printing of each emitted instruction on (TRUE) or off (FALSE). */
+void spe_print_code(struct spe_function *p, boolean enable)
+{
+   p->print = enable;
+}
+
+
+/** Adjust the indentation of printed code by 'spaces' columns. */
+void spe_indent(struct spe_function *p, int spaces)
+{
+   p->indent = p->indent + spaces;
+}
+
+
+/** If printing is on, print "# s", indented rel_indent past the current level. */
+void spe_comment(struct spe_function *p, int rel_indent, const char *s)
+{
+   if (!p->print)
+      return;
+   p->indent += rel_indent;   /* temporary extra indentation */
+   indent(p);
+   p->indent -= rel_indent;
+   printf("# %s\n", s);
+}
 
-   assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) == 0);
 
-   p->regs[idx] |= (1ULL << bit);
+/**
+ * Load quad word.
+ * NOTE: offset is in bytes and must be a multiple of 16.
+ */
+void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean print_saved = p->print;
+   const int qword_offset = offset >> 4;  /* must fit signed 10 bits */
+
+   /* low four bits of the byte offset must be clear */
+   assert(offset % 16 == 0);
+   assert(qword_offset <= 511);
+   assert(qword_offset >= -512);
+
+   /* emit silently, then print in "offset(reg)" form ourselves */
+   p->print = FALSE;
+   emit_RI10(p, 0x034, rT, rA, qword_offset, "spe_lqd");
+   p->print = print_saved;
+   if (print_saved) {
+      indent(p);
+      printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
+/**
+ * Store quad word.
+ * NOTE: offset is in bytes and must be a multiple of 16.
+ */
+void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean print_saved = p->print;
+   const int qword_offset = offset >> 4;  /* must fit signed 10 bits */
+
+   /* low four bits of the byte offset must be clear */
+   assert(offset % 16 == 0);
+   assert(qword_offset <= 511);
+   assert(qword_offset >= -512);
+
+   /* emit silently, then print in "offset(reg)" form ourselves */
+   p->print = FALSE;
+   emit_RI10(p, 0x024, rT, rA, qword_offset, "spe_stqd");
+   p->print = print_saved;
+   if (print_saved) {
+      indent(p);
+      printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
 }
 
 
@@ -392,51 +626,51 @@ void spe_release_register(struct spe_function *p, int reg)
 /** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Interupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link on external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
 void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 
@@ -505,30 +739,223 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
    else {
       spe_ilhu(p, rT, i >> 16);
-      spe_iohl(p, rT, i & 0xffff);
+      if (i & 0xffff)
+         spe_iohl(p, rT, i & 0xffff);
+   }
+}
+
+void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+{
+   /* Load ui into each word of rT using as few instructions as possible:
+    * - value fits in the lower 18 bits: ila, which doesn't sign-extend;
+    * - both halfwords identical: ilh;
+    * - every byte is 0x00 or 0xff: Form Select Mask for Bytes Immediate
+    *   (fsmbi), a single instruction;
+    * - general case: ilhu followed by iohl.
+    */
+   if ((ui & 0x0003ffff) == ui) {
+      spe_ila(p, rT, ui);
+   }
+   else if ((ui >> 16) == (ui & 0xffff)) {
+      spe_ilh(p, rT, ui & 0xffff);
+   }
+   else if (
+      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+   ) {
+      unsigned int mask = 0;
+      /* fsmbi duplicates each bit in the given mask eight times,
+       * using a 16-bit value to initialize a 16-byte quadword.
+       * Each 4-bit nybble of the mask corresponds to a full word
+       * of the result; look at the value and figure out the mask
+       * (replicated for each word in the quadword), and then
+       * form the "select mask" to get the value.
+       */
+      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+      spe_fsmbi(p, rT, mask);
+   }
+   else {
+      /* The general case: this usually uses two instructions, but
+       * may use only one if the low-order 16 bits of each word are 0.
+       */
+      spe_ilhu(p, rT, ui >> 16);
+      if (ui & 0xffff)
+         spe_iohl(p, rT, ui & 0xffff);
    }
 }
 
+/**
+ * Emit rT = rA & ui, with a single immediate instruction when ui allows it.
+ * Constructed identically to spe_xor_uint() below; keep the two in sync.
+ */
+void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension will work
+    * and we can use And Word Immediate, passing the signed (-512..511) value.
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_andi(p, rT, rA, (int) (ui & 0x1ff) - (int) (ui & 0x200));
+      return;
+   }
+
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate (again with the signed value)
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, (int) (ui & 0x1ff) - (int) (ui & 0x200));
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+
+/**
+ * Emit rT = rA ^ ui, with a single immediate instruction when ui allows it.
+ * Constructed identically to spe_and_uint() above; keep the two in sync.
+ */
+void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte
+    * Immediate (which uses the same constant across each byte), Exclusive
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to
+    * 16 bits and uses that across each halfword), or Exclusive Or Word
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension will work and
+    * we can use Exclusive Or Word Immediate with the signed (-512..511) value.
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_xori(p, rT, rA, (int) (ui & 0x1ff) - (int) (ui & 0x200));
+      return;
+   }
+
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we can
+    * use Exclusive Or Halfword Immediate (again with the signed value)
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, (int) (ui & 0x1ff) - (int) (ui & 0x200));
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the Exclusive Or Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* Compare rA with ui for equality, result in rT.  A value of 9 bits or
+    * less fits the sign-extended 10-bit immediate of Compare Equal Word
+    * Immediate (ceqi) as a non-negative number. */
+   if ((ui & 0x000001ff) == ui) {
+      spe_ceqi(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_ceq(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
+
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* Unsigned compare rA > ui, result in rT.  Compare Logical Greater Than
+    * Word Immediate (clgti) sign-extends its 10-bit immediate, so only
+    * values of 9 bits or less can be encoded directly. */
+   if ((ui & 0x000001ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ila(p, rT, 66051);
+   /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
+   spe_ila(p, rT, 0x00010203);
    spe_shufb(p, rT, rA, rA, rT);
 }
 
 
 void
-spe_complement(struct spe_function *p, unsigned rT)
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_nor(p, rT, rT, rT);
+   spe_nor(p, rT, rA, rA);
 }
 
 
 void
 spe_move(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ori(p, rT, rA, 0);
+   /* Use different instructions depending on the instruction address
+    * to take advantage of the dual pipelines.
+    */
+   if (p->num_inst & 1)
+      spe_shlqbyi(p, rT, rA, 0);  /* odd pipe */
+   else
+      spe_ori(p, rT, rA, 0);  /* even pipe */
 }
 
 
@@ -539,4 +966,70 @@ spe_zero(struct spe_function *p, unsigned rT)
 }
 
 
+void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+{
+   /* Replicate word 'word' (0..3) of rA across all four words of rT.
+    * The shuffle pattern lives in its own register so rT may alias rA.
+    */
+   int pattern_reg = spe_allocate_available_register(p);
+
+   assert(word >= 0);
+   assert(word <= 3);
+
+   spe_ila(p, pattern_reg, 0x00010203);  /* select bytes 0..3 = word 0 */
+   if (word == 0) {
+      spe_shufb(p, rT, rA, rA, pattern_reg);
+   }
+   else {
+      /* XXX review this, we may not need the rotqbyi instruction */
+      spe_rotqbyi(p, rT, rA, 4 * word);  /* bring the wanted word to slot 0 */
+      spe_shufb(p, rT, rT, rT, pattern_reg);
+   }
+
+   spe_release_register(p, pattern_reg);
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the smaller of the
+ * two, compositing them into the rT register.
+ *
+ * The Float Compare Greater Than (fcgt) instruction will put 1s into
+ * compare_reg where rA > rB, and 0s where rA <= rB.
+ *
+ * Then the Select Bits (selb) instruction will take bits from rA where
+ * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
+ * where rA <= rB and from rB where rA > rB, which is exactly the
+ * "min" operation.
+ *
+ * The compare_reg could in many cases be the same as rT, unless
+ * rT == rA || rT == rB.  But since this is common in constructions
+ * like "x = min(x, a)", we always allocate a new register to be safe.
+ */
+void 
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rA, rB, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the greater of the
+ * two, compositing them into the rT register.
+ *
+ * The logic is similar to that of spe_float_min() above; the only
+ * difference is that the rA and rB operands of spe_selb() are swapped,
+ * so that rT takes rA where rA > rB and rB elsewhere.
+ */
+void 
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rB, rA, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d95e5aace3..d6a3c02f20 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -28,6 +28,7 @@
  * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
 #ifndef RTASM_PPC_SPE_H
@@ -39,10 +40,10 @@
 /** number of general-purpose SIMD registers */
 #define SPE_NUM_REGS  128
 
-/** Return Address register */
+/** Return Address register (aka $lr / Link Register) */
 #define SPE_REG_RA  0
 
-/** Stack Pointer register */
+/** Stack Pointer register (aka $sp) */
 #define SPE_REG_SP  1
 
 
@@ -52,25 +53,49 @@ struct spe_function
    uint num_inst;
    uint max_inst;
 
-    /**
-     * Mask of used / unused registers
-     *
-     * Each set bit corresponds to an available register.  Each cleared bit
-     * corresponds to an allocated register.
+   /**
+    * The "set count" reflects the number of nested register sets
+    * that are allowed.  In the unlikely case that we exceed the set count,
+    * register allocation will start to be confused, which is critical
+    * enough that we check for it.
+    */
+   unsigned char set_count;
+
+   /** 
+    * Flags for used and unused registers.  Each byte corresponds to a
+    * register; a 0 in that byte means that the register is available.
+    * A value of 1 means that the register was allocated in the current
+    * register set.  Any other value N means that the register was allocated
+    * N register sets ago.
      *
      * \sa
      * spe_allocate_register, spe_allocate_available_register,
-     * spe_release_register
+     * spe_allocate_register_set, spe_release_register_set, spe_release_register, 
      */
-    uint64_t regs[SPE_NUM_REGS / 64];
+    unsigned char regs[SPE_NUM_REGS];
+
+    boolean print; /**< print/dump instructions as they're emitted? */
+    int indent;    /**< number of spaces to indent */
 };
 
+
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
+extern unsigned spe_code_size(const struct spe_function *p);
 
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
+
+extern unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[]);
+
+extern void spe_print_code(struct spe_function *p, boolean enable);
+extern void spe_indent(struct spe_function *p, int spaces);
+extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
+
 
 #endif /* RTASM_PPC_SPE_H */
 
@@ -94,6 +119,9 @@ extern void spe_release_register(struct spe_function *p, int reg);
 #define EMIT_RI10(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
 			   int imm)
+#define EMIT_RI10s(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
 #define EMIT_RI16(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, int imm)
 #define EMIT_RI18(_name, _op) \
@@ -106,11 +134,9 @@ extern void spe_release_register(struct spe_function *p, int reg);
 
 /* Memory load / store instructions
  */
-EMIT_RI10(spe_lqd,  0x034);
 EMIT_RR  (spe_lqx,  0x1c4);
 EMIT_RI16(spe_lqa,  0x061);
 EMIT_RI16(spe_lqr,  0x067);
-EMIT_RI10(spe_stqd, 0x024);
 EMIT_RR  (spe_stqx, 0x144);
 EMIT_RI16(spe_stqa, 0x041);
 EMIT_RI16(spe_stqr, 0x047);
@@ -140,7 +166,7 @@ EMIT_RI16(spe_fsmbi, 0x065);
 EMIT_RR  (spe_ah,      0x0c8);
 EMIT_RI10(spe_ahi,     0x01d);
 EMIT_RR  (spe_a,       0x0c0);
-EMIT_RI10(spe_ai,      0x01c);
+EMIT_RI10s(spe_ai,      0x01c);
 EMIT_RR  (spe_sfh,     0x048);
 EMIT_RI10(spe_sfhi,    0x00d);
 EMIT_RR  (spe_sf,      0x040);
@@ -178,19 +204,19 @@ EMIT_R   (spe_xshw,    0x2ae);
 EMIT_R   (spe_xswd,    0x2a6);
 EMIT_RR  (spe_and,     0x0c1);
 EMIT_RR  (spe_andc,    0x2c1);
-EMIT_RI10(spe_andbi,   0x016);
-EMIT_RI10(spe_andhi,   0x015);
-EMIT_RI10(spe_andi,    0x014);
+EMIT_RI10s(spe_andbi,   0x016);
+EMIT_RI10s(spe_andhi,   0x015);
+EMIT_RI10s(spe_andi,    0x014);
 EMIT_RR  (spe_or,      0x041);
 EMIT_RR  (spe_orc,     0x2c9);
-EMIT_RI10(spe_orbi,    0x006);
-EMIT_RI10(spe_orhi,    0x005);
-EMIT_RI10(spe_ori,     0x004);
+EMIT_RI10s(spe_orbi,    0x006);
+EMIT_RI10s(spe_orhi,    0x005);
+EMIT_RI10s(spe_ori,     0x004);
 EMIT_R   (spe_orx,     0x1f0);
 EMIT_RR  (spe_xor,     0x241);
-EMIT_RI10(spe_xorbi,   0x026);
-EMIT_RI10(spe_xorhi,   0x025);
-EMIT_RI10(spe_xori,    0x024);
+EMIT_RI10s(spe_xorbi,   0x026);
+EMIT_RI10s(spe_xorhi,   0x025);
+EMIT_RI10s(spe_xori,    0x024);
 EMIT_RR  (spe_nand,    0x0c9);
 EMIT_RR  (spe_nor,     0x049);
 EMIT_RR  (spe_eqv,     0x249);
@@ -268,6 +294,12 @@ EMIT_RI16(spe_brz,       0x040);
 EMIT_RI16(spe_brhnz,     0x046);
 EMIT_RI16(spe_brhz,      0x044);
 
+extern void
+spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
+extern void
+spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
 extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
@@ -292,13 +324,33 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Load/splat immediate unsigned int into rT. */
+extern void
+spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
 
-/** Complement/invert all bits in rT. */
+/** rT = complement_all_bits(rA). */
 extern void
-spe_complement(struct spe_function *p, unsigned rT);
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
 
 /** rT = rA. */
 extern void
@@ -308,6 +360,18 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA);
 extern void
 spe_zero(struct spe_function *p, unsigned rT);
 
+/** rT = splat(rA, word) */
+extern void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+
+/** rT = float min(rA, rB) */
+extern void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
+/** rT = float max(rA, rB) */
+extern void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
 
 /* Floating-point instructions
  */
@@ -361,6 +425,7 @@ EMIT_R   (spe_wrch,       0x10d);
 #undef EMIT_RI7
 #undef EMIT_RI8
 #undef EMIT_RI10
+#undef EMIT_RI10s
 #undef EMIT_RI16
 #undef EMIT_RI18
 #undef EMIT_I16
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 6d4c081e04..99ee74cf14 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -240,7 +240,8 @@ static void emit_modrm( struct x86_function *p,
    /* Oh-oh we've stumbled into the SIB thing.
     */
    if (regmem.file == file_REG32 &&
-       regmem.idx == reg_SP) {
+       regmem.idx == reg_SP &&
+       regmem.mod != mod_REG) {
       emit_1ub(p, 0x24);		/* simplistic! */
    }
 
@@ -370,7 +371,11 @@ void x86_jcc( struct x86_function *p,
    DUMP_I(cc);
    
    if (offset < 0) {
-      assert(p->csr - p->store > -offset);
+      /*assert(p->csr - p->store > -offset);*/
+      if (p->csr - p->store <= -offset) {
+         /* probably out of memory (using the error_overflow buffer) */
+         return;
+      }
    }
 
    if (offset <= 127 && offset >= -128) {
@@ -435,25 +440,70 @@ void x86_call( struct x86_function *p, struct x86_reg reg)
 }
 
 
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 {
    DUMP_RI( dst, imm );
+   assert(dst.file == file_REG32);
    assert(dst.mod == mod_REG);
    emit_1ub(p, 0xb8 + dst.idx);
    emit_1i(p, imm);
 }
 
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm )
+/**
+ * Immediate group 1 instructions.
+ */
+static INLINE void 
+x86_group1_imm( struct x86_function *p, 
+                unsigned op, struct x86_reg dst, int imm )
 {
-   DUMP_RI( dst, imm );
+   assert(dst.file == file_REG32);
    assert(dst.mod == mod_REG);
-   emit_1ub(p, 0x80);
-   emit_modrm_noreg(p, 0, dst);
-   emit_1ub(p, imm);
+   if(-0x80 <= imm && imm < 0x80) {
+      emit_1ub(p, 0x83);
+      emit_modrm_noreg(p, op, dst);
+      emit_1b(p, (char)imm);
+   }
+   else {
+      emit_1ub(p, 0x81);
+      emit_modrm_noreg(p, op, dst);
+      emit_1i(p, imm);
+   }
+}
+
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 0, dst, imm);
+}
+
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 1, dst, imm);
+}
+
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 4, dst, imm);
+}
+
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 5, dst, imm);
+}
+
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 6, dst, imm);
+}
+
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 7, dst, imm);
 }
 
 
@@ -629,6 +679,44 @@ void x86_and( struct x86_function *p,
  * SSE instructions
  */
 
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 0, ptr);
+}
+
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 1, ptr);
+}
+
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 2, ptr);
+}
+
+void sse_movntps( struct x86_function *p, 
+                  struct x86_reg dst,
+                  struct x86_reg src)
+{
+   DUMP_RR( dst, src );
+
+   assert(dst.mod != mod_REG);
+   assert(src.mod == mod_REG);
+   emit_2ub(p, 0x0f, 0x2b);
+   emit_modrm(p, src, dst);
+}
+
+
+
 
 void sse_movss( struct x86_function *p,
 		struct x86_reg dst,
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index af94577aab..1b5eaaca85 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -152,12 +152,13 @@ void x86_jmp( struct x86_function *p, int label );
 /* void x86_call( struct x86_function *p, void (*label)() ); */
 void x86_call( struct x86_function *p, struct x86_reg reg);
 
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm );
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm );
 
 
 /* Macro for sse_shufps() and sse2_pshufd():
@@ -184,6 +185,13 @@ void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg ar
 void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr);
+
+void sse_movntps( struct x86_function *p, struct x86_reg dst, struct x86_reg src);
+
 void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
index c5d2082087..d7df9490cf 100644
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -11,8 +11,10 @@ C_SOURCES = \
 	tgsi_info.c \
 	tgsi_iterate.c \
 	tgsi_parse.c \
+	tgsi_ppc.c \
 	tgsi_scan.c \
 	tgsi_sse2.c \
+	tgsi_text.c \
 	tgsi_transform.c \
 	tgsi_util.c
 
diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript
index 45bf3f6d57..8200cce42f 100644
--- a/src/gallium/auxiliary/tgsi/SConscript
+++ b/src/gallium/auxiliary/tgsi/SConscript
@@ -12,6 +12,7 @@ tgsi = env.ConvenienceLibrary(
 		'tgsi_parse.c',
 		'tgsi_sanity.c',
 		'tgsi_scan.c',
+		'tgsi_ppc.c',
 		'tgsi_sse2.c',
 		'tgsi_text.c',
 		'tgsi_transform.c',
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 74614d3688..38fcaf8829 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -793,10 +793,14 @@ tgsi_default_instruction_ext_nv( void )
    return instruction_ext_nv;
 }
 
-union token_u32
+
+/** test for inequality of 32-bit values pointed to by a and b */
+static INLINE boolean
+compare32(const void *a, const void *b)
 {
-   unsigned u32;
-};
+   return *((uint32_t *) a) != *((uint32_t *) b);
+}
+
 
 unsigned
 tgsi_compare_instruction_ext_nv(
@@ -805,7 +809,7 @@ tgsi_compare_instruction_ext_nv(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_nv
@@ -864,7 +868,7 @@ tgsi_compare_instruction_ext_label(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_label
@@ -905,7 +909,7 @@ tgsi_compare_instruction_ext_texture(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_texture
@@ -1027,7 +1031,7 @@ tgsi_compare_src_register_ext_swz(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_swz
@@ -1095,7 +1099,7 @@ tgsi_compare_src_register_ext_mod(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_mod
@@ -1241,7 +1245,7 @@ tgsi_compare_dst_register_ext_concode(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_concode
@@ -1299,7 +1303,7 @@ tgsi_compare_dst_register_ext_modulate(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_modulate
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index afc8ffa553..3177f54952 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -68,6 +68,7 @@ dump_enum(
 #define CHR(C)          ctx->printf( ctx, "%c", C )
 #define UIX(I)          ctx->printf( ctx, "0x%x", I )
 #define UID(I)          ctx->printf( ctx, "%u", I )
+#define INSTID(I)          ctx->printf( ctx, "%3u", I )
 #define SID(I)          ctx->printf( ctx, "%d", I )
 #define FLT(F)          ctx->printf( ctx, "%10.4f", F )
 #define ENM(E,ENUMS)    dump_enum( ctx, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
@@ -315,8 +316,8 @@ iter_instruction(
    uint i;
    boolean first_reg = TRUE;
 
-   UID( instno );
-   CHR( ':' );
+   INSTID( instno );
+   TXT( ": " );
    TXT( tgsi_get_opcode_info( inst->Instruction.Opcode )->mnemonic );
 
    switch (inst->Instruction.Saturate) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index df002939c6..1a5294eabc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1674,6 +1674,7 @@ exec_declaration(
             break;
 
          default:
+            eval = NULL;
             assert( 0 );
          }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 3757486ba9..2cd56e413a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -88,16 +88,33 @@ tgsi_parse_end_of_tokens(
       1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
 }
 
+
+/**
+ * This function is used to avoid and work-around type punning/aliasing
+ * warnings.  The warnings seem harmless on x86 but on PPC they cause
+ * real failures.
+ */
+static INLINE void
+copy_token(void *dst, const void *src)
+{
+   memcpy(dst, src, 4);
+}
+
+
+/**
+ * Get next 4-byte token, return it at address specified by 'token'
+ */
 static void
 next_token(
    struct tgsi_parse_context *ctx,
    void *token )
 {
    assert( !tgsi_parse_end_of_tokens( ctx ) );
-
-   *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
+   copy_token(token, &ctx->Tokens[ctx->Position]);
+   ctx->Position++;
 }
 
+
 void
 tgsi_parse_token(
    struct tgsi_parse_context *ctx )
@@ -116,7 +133,7 @@ tgsi_parse_token(
       struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
 
       *decl = tgsi_default_full_declaration();
-      decl->Declaration = *(struct tgsi_declaration *) &token;
+      copy_token(&decl->Declaration, &token);
 
       next_token( ctx, &decl->DeclarationRange );
 
@@ -132,8 +149,7 @@ tgsi_parse_token(
       struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
 
       *imm = tgsi_default_full_immediate();
-      imm->Immediate = *(struct tgsi_immediate *) &token;
-
+      copy_token(&imm->Immediate, &token);
       assert( !imm->Immediate.Extended );
 
       switch (imm->Immediate.DataType) {
@@ -158,8 +174,7 @@ tgsi_parse_token(
       unsigned extended;
 
       *inst = tgsi_default_full_instruction();
-      inst->Instruction = *(struct tgsi_instruction *) &token;
-
+      copy_token(&inst->Instruction, &token);
       extended = inst->Instruction.Extended;
 
       while( extended ) {
@@ -169,18 +184,15 @@ tgsi_parse_token(
 
          switch( token.Type ) {
          case TGSI_INSTRUCTION_EXT_TYPE_NV:
-            inst->InstructionExtNv =
-               *(struct tgsi_instruction_ext_nv *) &token;
+            copy_token(&inst->InstructionExtNv, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
-            inst->InstructionExtLabel =
-               *(struct tgsi_instruction_ext_label *) &token;
+            copy_token(&inst->InstructionExtLabel, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
-            inst->InstructionExtTexture =
-               *(struct tgsi_instruction_ext_texture *) &token;
+            copy_token(&inst->InstructionExtTexture, &token);
             break;
 
          default:
@@ -212,13 +224,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
-               inst->FullDstRegisters[i].DstRegisterExtConcode =
-                  *(struct tgsi_dst_register_ext_concode *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode,
+                          &token);
                break;
 
             case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
-               inst->FullDstRegisters[i].DstRegisterExtModulate =
-                  *(struct tgsi_dst_register_ext_modulate *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate,
+                          &token);
                break;
 
             default:
@@ -245,13 +257,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
-               inst->FullSrcRegisters[i].SrcRegisterExtSwz =
-                  *(struct tgsi_src_register_ext_swz *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz,
+                          &token);
                break;
 
             case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
-               inst->FullSrcRegisters[i].SrcRegisterExtMod =
-                  *(struct tgsi_src_register_ext_mod *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod,
+                          &token);
                break;
 
             default:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
new file mode 100644
index 0000000000..9ad7ecd7cf
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -0,0 +1,910 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI to PowerPC code generation.
+ */
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_debug.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_sse.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi_exec.h"
+#include "tgsi_ppc.h"
+#include "rtasm/rtasm_ppc.h"
+
+
+/**
+ * Table of float constants that generated code loads from memory (see
+ * load_constant_vec()), since PPC vector immediates are impractical to form.
+ */
+const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+   1.0f, -128.0f, 128.0, 0.0
+};
+
+
+#define FOR_EACH_CHANNEL( CHAN )\
+   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)   /* loop header only; caller supplies the body */
+
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))   /* nonzero if CHAN is in dst[0]'s writemask */
+
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )   /* following statement runs once per enabled channel */
+
+#define CHAN_X 0   /* channel indices, as passed to FETCH()/STORE() */
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I   /* short aliases for TGSI_EXEC_TEMP_* names */
+#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C
+
+#define TEMP_R0   TGSI_EXEC_TEMP_R0
+#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+
+
+/**
+ * Context/state used during code gen.
+ */
+struct gen_context
+{
+   struct ppc_function *f;  /**< function that instructions are emitted into */
+   int inputs_reg;    /**< GP register pointing to input params */
+   int outputs_reg;   /**< GP register pointing to output params */
+   int temps_reg;     /**< GP register pointing to temporary "registers" */
+   int immed_reg;     /**< GP register pointing to immediates buffer */
+   int const_reg;     /**< GP register pointing to constants buffer */
+   int builtins_reg;  /**< GP register pointing to built-in constants */
+
+   int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0}; < 0 until allocated */
+   int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31}; < 0 until allocated */
+};
+
+
+/**
+ * Load the given vector register with {value, value, value, value}.
+ * The value must be in the ppc_builtin_constants[] array.
+ * We wouldn't need this if there was a simple way to load PPC vector
+ * registers with immediate values!
+ */
+static void
+load_constant_vec(struct gen_context *gen, int dst_vec, float value)
+{
+   uint pos;
+   for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
+      if (ppc_builtin_constants[pos] == value) {   /* exact float match against the table */
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = pos * 4;   /* byte offset of entry 'pos' in the table */
+
+         ppc_li(gen->f, offset_reg, offset);
+         /* Load 4-byte word into vector register.
+          * The vector slot depends on the effective address we load from.
+          * We know that our builtins start at a 16-byte boundary so we
+          * know that 'pos % 4' tells us which vector slot will have the
+          * loaded word.  The other vector slots will be undefined.
+          */
+         ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
+         /* splat word[pos % 4] across the vector reg */
+         ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
+         ppc_release_register(gen->f, offset_reg);
+         return;
+      }
+   }
+   /* only reached when 'value' is not in the table; no code was emitted */
+   assert(0 && "Need to add new constant to ppc_builtin_constants array");
+}
+
+
+/**
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ */
+static int
+gen_one_vec(struct gen_context *gen)
+{
+   if (gen->one_vec < 0) {   /* not set up yet: allocate and load it once */
+      gen->one_vec = ppc_allocate_vec_register(gen->f);
+      load_constant_vec(gen, gen->one_vec, 1.0f);
+   }
+   return gen->one_vec;      /* later calls reuse the same register */
+}
+
+/**
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
+ */
+static int
+gen_get_bit31_vec(struct gen_context *gen)
+{
+   if (gen->bit31_vec < 0) {   /* not set up yet: allocate and build it once */
+      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vspltisw(gen->f, gen->bit31_vec, -1);   /* vec = {-1, -1, -1, -1} */
+      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);   /* vec = {1<<31, ...} */
+   }
+   return gen->bit31_vec;      /* later calls reuse the same register */
+}
+
+/**
+ * Register fetch: emit code that loads channel 'chan_index' of source
+ * register 'reg' into 'dst_vec', then applies the register's sign mode.
+ */
+static void
+emit_fetch(struct gen_context *gen,
+           unsigned dst_vec,
+           const struct tgsi_full_src_register *reg,
+           const unsigned chan_index)
+{
+   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;   /* 16 bytes per channel */
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;   /* 16 bytes per channel */
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;   /* 16 bytes per channel */
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;   /* 4 bytes per channel here */
+            ppc_li(gen->f, offset_reg, offset);
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our constants start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
+            ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      default:
+         assert( 0 );   /* other register files are not handled */
+      }
+      break;
+   case TGSI_EXTSWIZZLE_ZERO:
+      ppc_vzero(gen->f, dst_vec);
+      break;
+   case TGSI_EXTSWIZZLE_ONE:
+      {
+         int one_vec = gen_one_vec(gen);
+         ppc_vmove(gen->f, dst_vec, one_vec);
+      }
+      break;
+   default:
+      assert( 0 );
+   }
+
+   {
+      /* apply abs/negate-style modifiers by manipulating the sign bit (bit 31) */
+      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+         int bit31_vec = gen_get_bit31_vec(gen);
+
+         switch (sign_op) {
+         case TGSI_UTIL_SIGN_CLEAR:
+            /* vec = vec & ~bit31 */
+            ppc_vandc(gen->f, dst_vec, dst_vec, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_SET:
+            /* vec = vec | bit31 */
+            ppc_vor(gen->f, dst_vec, dst_vec, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_TOGGLE:
+            /* vec = vec ^ bit31 */
+            ppc_vxor(gen->f, dst_vec, dst_vec, bit31_vec);
+            break;
+         default:
+            assert(0);
+         }
+      }
+   }
+}
+
+#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \
+   emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN )   /* SRC_REG is the source operand index */
+
+
+
+/**
+ * Register store.  Store 'src_vec' to channel 'chan_index' of 'reg'.
+ */
+static void
+emit_store(struct gen_context *gen,
+           unsigned src_vec,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           unsigned chan_index)
+{
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;   /* 16 bytes per channel */
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+   case TGSI_FILE_TEMPORARY:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;   /* 16 bytes per channel */
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+#if 0   /* disabled: address-register destinations are not handled */
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+#endif
+   default:
+      assert( 0 );
+   }
+
+#if 0   /* disabled: the instruction's Saturate mode is currently ignored, so 'inst' is unused */
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+#endif
+}
+
+
+#define STORE( GEN, INST, XMM, INDEX, CHAN )\
+   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )   /* XMM is the source vector register */
+
+
+/** RSQ/RCP: evaluate once on src[0].x, store to all enabled dst channels. */
+static void
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);   /* operand */
+   int v1 = ppc_allocate_vec_register(gen->f);   /* result */
+   uint chan_index;
+
+   FETCH(gen, *inst, v0, 0, CHAN_X);   /* v0 = src[0].x */
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_RSQ:
+      /* v1 = 1.0 / sqrt(v0) */
+      ppc_vrsqrtefp(gen->f, v1, v0);
+      break;
+   case TGSI_OPCODE_RCP:
+      /* v1 = 1.0 / v0 */
+      ppc_vrefp(gen->f, v1, v0);
+      break;
+   default:
+      assert(0);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE(gen, *inst, v1, 0, chan_index);   /* same result for every channel */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+}
+
+/** Per-channel unary ops: dst.c = op(src[0].c) for each enabled channel c. */
+static void
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);   /* operand and result */
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] (must target v0, not register 0) */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ABS:
+         /* turn off the most significant bit of each vector float word */
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
+            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
+            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_FLOOR:
+         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
+         break;
+      case TGSI_OPCODE_FRAC:
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
+            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_EXPBASE2:
+         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
+         break;
+      case TGSI_OPCODE_LOGBASE2:
+         /* XXX this may be broken! */
+         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
+         break;
+      case TGSI_OPCODE_MOV:
+         /* nothing */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+}
+
+/** Per-channel binary ops: dst.c = src[0].c OP src[1].c for each enabled channel c. */
+static void
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);   /* first operand */
+   int v1 = ppc_allocate_vec_register(gen->f);   /* second operand */
+   int v2 = ppc_allocate_vec_register(gen->f);   /* result */
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         ppc_vaddfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_SUB:
+         ppc_vsubfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MUL:
+         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
+         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_MIN:
+         ppc_vminfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MAX:
+         ppc_vmaxfp(gen->f, v2, v0, v1);
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+
+/**
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
+ */
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   boolean complement = FALSE;   /* TRUE for SNE/SGE/SLE: invert the SEQ/SLT/SGT mask */
+   int one_vec = gen_one_vec(gen);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SNE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SEQ:
+         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SGE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SLT:
+         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SLE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SGT:
+         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+         break;
+      default:
+         assert(0);
+      }
+
+      /* each word of v2 is now 0 or ~0; turn that mask into 0.0 or 1.0 */
+
+      if (complement)
+         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
+      else
+         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
+
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+/** DP3/DP4/DPH: accumulate the dot product in v2, store it to enabled channels. */
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);   /* running sum */
+   uint chan_index;
+
+   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
+
+   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
+   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
+   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
+   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
+      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+   }
+   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+/** Per-channel three-operand ops (MAD, LRP) for each enabled dst channel. */
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v3 = ppc_allocate_vec_register(gen->f);   /* result */
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:
+         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_LRP:
+         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
+         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * (v1 - v2) + v2 */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v3);
+}
+
+
+
+/** Approximation for vr = pow(va, vb), emitted into function 'f' */
+static void
+ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
+{
+   /* pow(a,b) ~= exp2(log2(a) * b) */
+   int t_vec = ppc_allocate_vec_register(f);      /* scratch for the intermediate */
+   int zero_vec = ppc_allocate_vec_register(f);   /* zero addend for the multiply-add */
+
+   ppc_vzero(f, zero_vec);
+
+   ppc_vlogefp(f, t_vec, va);                   /* t = log2(va) */
+   ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec);  /* t = t * vb + 0 */
+   ppc_vexptefp(f, vr, t_vec);                  /* vr = 2^t */
+
+   ppc_release_vec_register(f, t_vec);
+   ppc_release_vec_register(f, zero_vec);
+}
+
+/** Emit LIT: lighting coefficients computed from src[0].x, .y and .w. */
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int one_vec = gen_one_vec(gen);
+
+   /* Compute X: dst.x = 1.0 */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_X);
+   }
+   /* Compute Y, Z.  NOTE(review): dst X/Y are stored before src[0].y/.w are
+    * fetched below -- confirm dst never aliases src[0], or reorder. */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int x_vec = ppc_allocate_vec_register(gen->f);
+      int zero_vec = ppc_allocate_vec_register(gen->f);
+
+      FETCH(gen, *inst, x_vec, 0, CHAN_X);        /* x_vec = src[0].x */
+
+      ppc_vzero(gen->f, zero_vec);                /* zero = {0,0,0,0} */
+      ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+         STORE(gen, *inst, x_vec, 0, CHAN_Y);        /* store Y */
+      }
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+         int y_vec = ppc_allocate_vec_register(gen->f);
+         int z_vec = ppc_allocate_vec_register(gen->f);
+         int w_vec = ppc_allocate_vec_register(gen->f);
+         int pow_vec = ppc_allocate_vec_register(gen->f);
+         int pos_vec = ppc_allocate_vec_register(gen->f);
+         int p128_vec = ppc_allocate_vec_register(gen->f);
+         int n128_vec = ppc_allocate_vec_register(gen->f);
+
+         FETCH(gen, *inst, y_vec, 0, CHAN_Y);        /* y_vec = src[0].y */
+         ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
+
+         FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
+
+         /* clamp W (the exponent, not the base Y) to [-128, 128] */
+         load_constant_vec(gen, p128_vec, 128.0f);
+         load_constant_vec(gen, n128_vec, -128.0f);
+         ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */
+         ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */
+
+         /* if temp.x > 0
+          *    z = pow(tmp.y, tmp.w)
+          * else
+          *    z = 0.0
+          */
+         ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec);      /* pow = pow(y, w) */
+         ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
+         ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
+
+         STORE(gen, *inst, z_vec, 0, CHAN_Z);             /* store Z */
+
+         ppc_release_vec_register(gen->f, y_vec);
+         ppc_release_vec_register(gen->f, z_vec);
+         ppc_release_vec_register(gen->f, w_vec);
+         ppc_release_vec_register(gen->f, pow_vec);
+         ppc_release_vec_register(gen->f, pos_vec);
+         ppc_release_vec_register(gen->f, p128_vec);
+         ppc_release_vec_register(gen->f, n128_vec);
+      }
+
+      ppc_release_vec_register(gen->f, x_vec);
+      ppc_release_vec_register(gen->f, zero_vec);
+   }
+
+   /* Compute W: dst.w = 1.0 */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_W);
+   }
+}
+
+/** Dispatch one instruction by opcode; returns 1 if handled, 0 if unsupported. */
+static int
+emit_instruction(struct gen_context *gen,
+                 struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_FLOOR:
+   case TGSI_OPCODE_FRAC:
+   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_LOGBASE2:
+      emit_unaryop(gen, inst);
+      break;
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_RCP:
+      emit_scalar_unaryop(gen, inst);
+      break;
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      emit_binop(gen, inst);
+      break;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SGE:
+      emit_inequality(gen, inst);
+      break;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_LRP:
+      emit_triop(gen, inst);
+      break;
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+      emit_dotprod(gen, inst);
+      break;
+   case TGSI_OPCODE_LIT:
+      emit_lit(gen, inst);
+      break;
+   case TGSI_OPCODE_END:
+      /* normal end */
+      return 1;
+   default:
+      return 0;   /* opcode has no PPC translation here */
+   }
+
+   
+   return 1;
+}
+
+static void
+emit_declaration(
+   struct ppc_function *func,
+   struct tgsi_full_declaration *decl )
+{
+   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+#if 0   /* disabled, so this function currently emits nothing for any declaration */
+      unsigned first, last, mask;
+      unsigned i, j;
+
+      first = decl->DeclarationRange.First;
+      last = decl->DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      for( i = first; i <= last; i++ ) {
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( mask & (1 << j) ) {
+               switch( decl->Declaration.Interpolate ) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  emit_coef_a0( func, 0, i, j );
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_LINEAR:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_coef_a0( func, 4, i, j );
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_PERSPECTIVE:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+                  emit_coef_a0( func, 5, i, j );
+                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               default:
+                  assert( 0 );
+		  break;
+               }
+            }
+         }
+      }
+#endif
+   }
+}
+
+
+/** Function prologue; currently a placeholder that emits no instructions. */
+static void
+emit_prologue(struct ppc_function *func)
+{
+   /* XXX set up stack frame */
+}
+
+/** Function epilogue; currently emits only the return. */
+static void
+emit_epilogue(struct ppc_function *func)
+{
+   ppc_return(func);
+   /* XXX restore prev stack frame */
+}
+
+
+/**
+ * Translate a TGSI vertex/fragment shader to PPC code.
+ *
+ * \param tokens  the TGSI input shader
+ * \param func  the output PPC code/function
+ * \param immediates  buffer to place immediates, later passed to PPC func
+ * \param do_swizzles  not referenced by this function's body
+ * \return TRUE for success, FALSE if translation failed or GALLIUM_NOPPC is set
+ */
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *func,
+              float (*immediates)[4],
+              boolean do_swizzles )
+{
+   static int use_ppc_asm = -1;   /* -1 = GALLIUM_NOPPC not queried yet */
+   struct tgsi_parse_context parse;
+   /*boolean instruction_phase = FALSE;*/
+   unsigned ok = 1;
+   uint num_immediates = 0;       /* float[4] slots written so far */
+   struct gen_context gen;
+
+   if (use_ppc_asm < 0) {
+      /* If GALLIUM_NOPPC is set, don't use PPC codegen */
+      use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE);
+   }
+   if (!use_ppc_asm)
+      return FALSE;
+
+   util_init_math();
+
+   gen.f = func;
+   gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
+   gen.outputs_reg = ppc_reserve_register(func, 4);  /* second function param */
+   gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
+   gen.immed_reg = ppc_reserve_register(func, 6);
+   gen.const_reg = ppc_reserve_register(func, 7);
+   gen.builtins_reg = ppc_reserve_register(func, 8);
+   gen.one_vec = -1;     /* allocated on demand by gen_one_vec() */
+   gen.bit31_vec = -1;   /* allocated on demand by gen_get_bit31_vec() */
+
+   emit_prologue(func);
+
+   tgsi_parse_init( &parse, tokens );
+
+   while (!tgsi_parse_end_of_tokens(&parse) && ok) {   /* stop at the first failure */
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            emit_declaration(func, &parse.FullToken.FullDeclaration );
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
+
+	 if (!ok) {
+	    debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
+	 }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* splat each immediate component into a float[4] vector for SoA */
+         {  /* NOTE(review): slots match emit_fetch()'s Index*4+swizzle only if size == 4? */
+            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            float *imm = (float *) immediates;
+            uint i;
+            assert(size <= 4);
+            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);   /* checked once, before the loop */
+            for (i = 0; i < size; i++) {
+               const float value =
+                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+               imm[num_immediates * 4 + 0] = 
+               imm[num_immediates * 4 + 1] = 
+               imm[num_immediates * 4 + 2] = 
+               imm[num_immediates * 4 + 3] = value;
+               num_immediates++;   /* advances per component, not per immediate */
+            }
+         }
+         break;
+
+      default:
+	 ok = 0;
+         assert( 0 );
+      }
+   }
+
+   emit_epilogue(func);   /* emitted even when translation failed */
+
+   tgsi_parse_free( &parse );
+
+   return ok;
+}
+
+#endif /* PIPE_ARCH_PPC */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
new file mode 100644
index 0000000000..829ec075e7
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
@@ -0,0 +1,51 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_PPC_H
+#define TGSI_PPC_H
+
+#include "pipe/p_compiler.h"   /* for boolean, so this header is self-contained */
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+struct tgsi_token;
+struct ppc_function;
+extern const float ppc_builtin_constants[];
+
+
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *function,
+              float (*immediates)[4],
+              boolean do_swizzles);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_PPC_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4681b29f52..f79170b9d6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,9 +25,14 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/u_sse.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
@@ -35,8 +40,6 @@
 
 #include "rtasm/rtasm_x86sse.h"
 
-#ifdef PIPE_ARCH_X86
-
 /* for 1/sqrt()
  *
  * This costs about 100fps (close to 10%) in gears:
@@ -480,10 +483,31 @@ emit_coef_dady(
  * Function call helpers.
  */
 
+/**
+ * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be 
+ * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
+ * that the stack pointer is 16 byte aligned, as expected.
+ */
 static void
-emit_push_gp(
-   struct x86_function *func )
+emit_func_call_dst(
+   struct x86_function *func,
+   unsigned xmm_save,
+   unsigned xmm_dst,
+   void (PIPE_CDECL *code)() )
 {
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   unsigned i, n, xmm;
+   unsigned xmm_mask;
+   
+   /* Bitmask of the xmm registers to save */
+   xmm_mask = (1 << xmm_save) - 1;
+   xmm_mask &= ~(1 << xmm_dst);
+
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 0 ),
+      make_xmm( xmm_dst ) );
+
    x86_push(
       func,
       x86_make_reg( file_REG32, reg_AX) );
@@ -493,12 +517,49 @@ emit_push_gp(
    x86_push(
       func,
       x86_make_reg( file_REG32, reg_DX) );
-}
+   
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i))
+         ++n;
+   /* reserve one 16-byte stack slot per xmm register selected by xmm_mask */
+   x86_sub_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+   /* spill xmm<i> for every set bit i (previously an uninitialized index was used) */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+            make_xmm( i ) );
+         ++n;
+      }
+   /* the callee receives the address of the TEMP_R0 scratch vector as its only argument */
+   x86_lea(
+      func,
+      ecx,
+      get_temp( TEMP_R0, 0 ) );
+   
+   x86_push( func, ecx );
+   x86_mov_reg_imm( func, ecx, (unsigned long) code );
+   x86_call( func, ecx );
+   x86_pop(func, ecx );
+   /* reload the spilled registers from the same slots, in the same order */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            make_xmm( i ),
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+         ++n;
+      }
+   /* release the spill area; n again equals the number of saved registers */
+   x86_add_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
 
-static void
-x86_pop_gp(
-   struct x86_function *func )
-{
    /* Restore GP registers in a reverse order.
     */
    x86_pop(
@@ -510,39 +571,6 @@ x86_pop_gp(
    x86_pop(
       func,
       x86_make_reg( file_REG32, reg_AX) );
-}
-
-static void
-emit_func_call_dst(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   void (PIPE_CDECL *code)() )
-{
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
-
-   emit_push_gp(
-      func );
-
-   {
-      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-      x86_lea(
-         func,
-         ecx,
-         get_temp( TEMP_R0, 0 ) );
-
-      x86_push( func, ecx );
-      x86_mov_reg_imm( func, ecx, (unsigned long) code );
-      x86_call( func, ecx );
-      x86_pop(func, ecx ); 
-   }
-
-
-   x86_pop_gp(
-      func );
 
    sse_movaps(
       func,
@@ -553,6 +581,7 @@ emit_func_call_dst(
 static void
 emit_func_call_dst_src(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst,
    unsigned xmm_src,
    void (PIPE_CDECL *code)() )
@@ -564,10 +593,111 @@ emit_func_call_dst_src(
 
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       code );
 }
 
+/* Fast SSE2 implementation of special math functions.
+ * POLYn(x, c0, ..., cn) expands to c0 + c1*x + ... + cn*x^n in Horner form, per lane.
+ */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3 /* exp2f4 provides coefficient sets for 2..5 */
+#define LOG_POLY_DEGREE 5 /* log2f4 provides coefficient sets for 3..6 */
+
+/**
+ * Approximate 2**x for four packed floats as 2**ipart * poly(fpart). See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+exp2f4(__m128 x)
+{
+   __m128i ipart;
+   __m128 fpart, expipart, expfpart;
+   /* clamp x so that ipart + 127 below stays within the 8-bit exponent field */
+   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+   /* ipart = int(x - 0.5); the conversion rounds per the current MXCSR mode (to nearest by default) */
+   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+   /* fpart = x - ipart, which lies in [0, 1] when the conversion rounds to nearest */
+   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+   /* expipart = 2**ipart, built by writing ipart + 127 into the float exponent bits */
+   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+   /* minimax polynomial fit of 2**x; NOTE(review): coefficients appear fitted for [0, 1], matching fpart, not [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+   return _mm_mul_ps(expipart, expfpart);
+}
+
+/**
+ * Approximate log2(x) for four packed floats as exponent(x) + poly(mantissa(x)). See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+log2f4(__m128 x)
+{
+   __m128i expmask = _mm_set1_epi32(0x7f800000);
+   __m128i mantmask = _mm_set1_epi32(0x007fffff);
+   __m128 one = _mm_set1_ps(1.0f);
+   /* neither mask covers bit 31, so the sign of x does not affect the result */
+   __m128i i = _mm_castps_si128(x);
+
+   /* exp = (float) (biased exponent field of x - 127) */
+   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+   /* mant = mantissa bits of x OR'ed with the bits of 1.0f, i.e. a float in [1, 2[ */
+   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+   __m128 logmant;
+
+   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
+    * These coefficients can be generated with 
+    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+    */
+#if LOG_POLY_DEGREE == 6
+   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+   /* Multiplying by (mant - 1) raises the effective polynomial degree by one (hence POLY(n-1) above) and ensures log2(1) == 0 */
+   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+   return _mm_add_ps(logmant, exp);
+}
+
+/** x**y per lane, computed as 2**(y * log2(x)); log2f4 masks off the sign bit of x. */
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+   return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
 /**
  * Low-level instruction translators.
  */
@@ -610,38 +740,35 @@ cos4f(
 static void
 emit_cos(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save, 
       xmm_dst,
       cos4f );
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
 ex24f(
    float *store )
 {
-#if FAST_MATH
-   store[0] = util_fast_exp2( store[0] );
-   store[1] = util_fast_exp2( store[1] );
-   store[2] = util_fast_exp2( store[2] );
-   store[3] = util_fast_exp2( store[3] );
-#else
-   store[0] = powf( 2.0f, store[0] );
-   store[1] = powf( 2.0f, store[1] );
-   store[2] = powf( 2.0f, store[2] );
-   store[3] = powf( 2.0f, store[3] );
-#endif
+   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
 }
 
 static void
 emit_ex2(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       ex24f );
 }
@@ -670,10 +797,12 @@ flr4f(
 static void
 emit_flr(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       flr4f );
 }
@@ -691,31 +820,35 @@ frc4f(
 static void
 emit_frc(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       frc4f );
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
 lg24f(
    float *store )
 {
-   store[0] = util_fast_log2( store[0] );
-   store[1] = util_fast_log2( store[1] );
-   store[2] = util_fast_log2( store[2] );
-   store[3] = util_fast_log2( store[3] );
+   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
 }
 
 static void
 emit_lg2(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       lg24f );
 }
@@ -757,14 +890,14 @@ emit_neg(
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
 pow4f(
    float *store )
 {
-#if FAST_MATH
-   store[0] = util_fast_pow( store[0], store[4] );
-   store[1] = util_fast_pow( store[1], store[5] );
-   store[2] = util_fast_pow( store[2], store[6] );
-   store[3] = util_fast_pow( store[3], store[7] );
+#if 1
+   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
 #else
    store[0] = powf( store[0], store[4] );
    store[1] = powf( store[1], store[5] );
@@ -776,11 +909,13 @@ pow4f(
 static void
 emit_pow(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst,
    unsigned xmm_src )
 {
    emit_func_call_dst_src(
       func,
+      xmm_save,
       xmm_dst,
       xmm_src,
       pow4f );
@@ -873,10 +1008,12 @@ sin4f(
 
 static void
 emit_sin (struct x86_function *func,
+          unsigned xmm_save, 
           unsigned xmm_dst)
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       sin4f );
 }
@@ -1296,7 +1433,7 @@ emit_instruction(
                get_temp(
                   TGSI_EXEC_TEMP_MINUS_128_I,
                   TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 1, 2 );
+            emit_pow( func, 3, 1, 2 );
             FETCH( func, *inst, 0, 0, CHAN_X );
             sse_xorps(
                func,
@@ -1342,11 +1479,11 @@ emit_instruction(
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
             emit_MOV( func, 1, 0 );
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
             /* dst.x = ex2(floor(src.x)) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                emit_MOV( func, 2, 1 );
-               emit_ex2( func, 2 );
+               emit_ex2( func, 3, 2 );
                STORE( func, *inst, 2, 0, CHAN_X );
             }
             /* dst.y = src.x - floor(src.x) */
@@ -1358,7 +1495,7 @@ emit_instruction(
          }
          /* dst.z = ex2(src.x) */
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 0 );
+            emit_ex2( func, 3, 0 );
             STORE( func, *inst, 0, 0, CHAN_Z );
          }
       }
@@ -1376,21 +1513,21 @@ emit_instruction(
          FETCH( func, *inst, 0, 0, CHAN_X );
          emit_abs( func, 0 );
          emit_MOV( func, 1, 0 );
-         emit_lg2( func, 1 );
+         emit_lg2( func, 2, 1 );
          /* dst.z = lg2(abs(src.x)) */
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
             STORE( func, *inst, 1, 0, CHAN_Z );
          }
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
             /* dst.x = floor(lg2(abs(src.x))) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                STORE( func, *inst, 1, 0, CHAN_X );
             }
             /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 1 );
+               emit_ex2( func, 2, 1 );
                emit_rcp( func, 1, 1 );
                emit_mul( func, 0, 1 );
                STORE( func, *inst, 0, 0, CHAN_Y );
@@ -1580,7 +1717,7 @@ emit_instruction(
    /* TGSI_OPCODE_FRC */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0 );
+         emit_frc( func, 0, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
       break;
@@ -1593,7 +1730,7 @@ emit_instruction(
    /* TGSI_OPCODE_FLR */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0 );
+         emit_flr( func, 0, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
       break;
@@ -1605,7 +1742,7 @@ emit_instruction(
    case TGSI_OPCODE_EXPBASE2:
    /* TGSI_OPCODE_EX2 */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0 );
+      emit_ex2( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1614,7 +1751,7 @@ emit_instruction(
    case TGSI_OPCODE_LOGBASE2:
    /* TGSI_OPCODE_LG2 */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0 );
+      emit_lg2( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1624,7 +1761,7 @@ emit_instruction(
    /* TGSI_OPCODE_POW */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 1 );
+      emit_pow( func, 0, 0, 1 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1715,7 +1852,7 @@ emit_instruction(
 
    case TGSI_OPCODE_COS:
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0 );
+      emit_cos( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1774,7 +1911,7 @@ emit_instruction(
 
    case TGSI_OPCODE_SIN:
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0 );
+      emit_sin( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1868,12 +2005,12 @@ emit_instruction(
    case TGSI_OPCODE_SCS:
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
          FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0 );
+         emit_cos( func, 0, 0 );
          STORE( func, *inst, 0, 0, CHAN_X );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
          FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0 );
+         emit_sin( func, 0, 0 );
          STORE( func, *inst, 0, 0, CHAN_Y );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index d3951e4e7d..b3d1045a8f 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
 	u_gen_mipmap.c \
 	u_handle_table.c \
 	u_hash_table.c \
+	u_keymap.c \
 	u_math.c \
 	u_mm.c \
 	u_rect.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index e65c17b1cc..8a04955a16 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -11,13 +11,14 @@ util = env.ConvenienceLibrary(
 		'u_gen_mipmap.c',
 		'u_handle_table.c',
 		'u_hash_table.c',
+		'u_keymap.c',
 		'u_math.c',
 		'u_mm.c',
 		'u_rect.c',
 		'u_simple_shaders.c',
 		'u_snprintf.c',
-        'u_stream_stdc.c',
-        'u_stream_wd.c',
+		'u_stream_stdc.c',
+		'u_stream_wd.c',
 		'u_tile.c',
 		'u_time.c',
 	])
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index b6cff281e6..3ed8bdfdf3 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -306,6 +306,13 @@ debug_get_flags_option(const char *name,
    str = _debug_get_option(name);
    if(!str)
       result = dfault;
+   else if (!util_strcmp(str, "help")) {
+      result = dfault;
+      while (flags->name) {
+         debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value);
+         flags++;
+      }
+   }
    else {
       result = 0;
       while( flags->name ) {
@@ -315,7 +322,12 @@ debug_get_flags_option(const char *name,
       }
    }
 
-   debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+   if (str) {
+      debug_printf("%s: %s = 0x%lx (%s)\n", __FUNCTION__, name, result, str);
+   }
+   else {
+      debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+   }
 
    return result;
 }
diff --git a/src/gallium/auxiliary/util/p_debug_mem.c b/src/gallium/auxiliary/util/p_debug_mem.c
index ed18c6540e..9511479cbb 100644
--- a/src/gallium/auxiliary/util/p_debug_mem.c
+++ b/src/gallium/auxiliary/util/p_debug_mem.c
@@ -122,8 +122,12 @@ debug_malloc(const char *file, unsigned line, const char *function,
    struct debug_memory_footer *ftr;
    
    hdr = real_malloc(sizeof(*hdr) + size + sizeof(*ftr));
-   if(!hdr)
+   if(!hdr) {
+      debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n",
+                   file, line, function,
+                   (long unsigned)size);
       return NULL;
+   }
  
    hdr->no = last_no++;
    hdr->file = file;
@@ -219,8 +223,12 @@ debug_realloc(const char *file, unsigned line, const char *function,
 
    /* alloc new */
    new_hdr = real_malloc(sizeof(*new_hdr) + new_size + sizeof(*new_ftr));
-   if(!new_hdr)
+   if(!new_hdr) {
+      debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n",
+                   file, line, function,
+                   (long unsigned)new_size);
       return NULL;
+   }
    new_hdr->no = old_hdr->no;
    new_hdr->file = old_hdr->file;
    new_hdr->line = old_hdr->line;
@@ -261,8 +269,19 @@ debug_memory_end(unsigned long start_no)
    for (; entry != &list; entry = entry->prev) {
       struct debug_memory_header *hdr;
       void *ptr;
+      struct debug_memory_footer *ftr;
+
       hdr = LIST_ENTRY(struct debug_memory_header, entry, head);
       ptr = data_from_header(hdr);
+      ftr = footer_from_header(hdr);
+
+      if(hdr->magic != DEBUG_MEMORY_MAGIC) {
+         debug_printf("%s:%u:%s: bad or corrupted memory %p\n",
+                      hdr->file, hdr->line, hdr->function,
+                      ptr);
+         debug_assert(0);
+      }
+
       if((start_no <= hdr->no && hdr->no < last_no) ||
 	 (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) {
 	 debug_printf("%s:%u:%s: %u bytes at %p not freed\n",
@@ -270,7 +289,15 @@ debug_memory_end(unsigned long start_no)
 		      hdr->size, ptr);
 	 total_size += hdr->size;
       }
+
+      if(ftr->magic != DEBUG_MEMORY_MAGIC) {
+         debug_printf("%s:%u:%s: buffer overflow %p\n",
+                      hdr->file, hdr->line, hdr->function,
+                      ptr);
+         debug_assert(0);
+      }
    }
+
    if(total_size) {
       debug_printf("Total of %u KB of system memory apparently leaked\n",
 		   (total_size + 1023)/1024);
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 9adf72944e..d28201ac8d 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -104,6 +104,7 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
    ctx->rasterizer.bypass_clipping = 1;
    /*ctx->rasterizer.bypass_vs = 1;*/
+   ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* samplers */
    memset(&ctx->sampler, 0, sizeof(ctx->sampler));
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index b19a649bbc..9d305ad763 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -725,6 +725,7 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
    ctx->rasterizer.bypass_clipping = 1;
    /*ctx->rasterizer.bypass_vs = 1;*/
+   ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* sampler state */
    memset(&ctx->sampler, 0, sizeof(ctx->sampler));
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
new file mode 100644
index 0000000000..01b17ddb1b
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -0,0 +1,309 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Key lookup/associative container.
+ *
+ * Like Jose's u_hash_table, based on CSO cache code for now.
+ *
+ * Author: Brian Paul
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_error.h"
+
+#include "cso_cache/cso_hash.h"
+
+#include "util/u_memory.h"
+#include "util/u_keymap.h"
+
+
+struct keymap
+{
+   struct cso_hash *cso;    /* underlying hash table; stores keymap_item pointers */
+   unsigned key_size;       /* size of every key, in bytes */
+   unsigned max_entries; /* XXX not obeyed yet */
+   unsigned num_entries;    /* incremented on insert, decremented on remove */
+   keymap_delete_func delete_func;  /* set at creation; default_delete_func if none was given */
+};
+
+/** One map entry: 'key' is the map's own copy of the key bytes, 'value' is the caller's data pointer. */
+struct keymap_item
+{
+   void *key, *value;
+};
+
+
+/**
+ * Fallback delete callback, installed when the client supplies none:
+ * the stored data pointer is simply released with FREE().
+ */
+static void
+default_delete_func(const struct keymap *map,
+                    const void *key, void *data, void *user)
+{
+   FREE(data);
+}
+
+/** Return the data stored at the given hash iterator position, viewed as a keymap_item. */
+static INLINE struct keymap_item *
+hash_table_item(struct cso_hash_iter iter)
+{
+   return (struct keymap_item *) cso_hash_iter_data(iter);
+}
+
+
+/**
+ * Return 4-byte hash key for a block of bytes.
+ * The key is read as whole unsigned words; any trailing keySize % 4 bytes
+ * do not contribute to the result.
+ */
+static unsigned
+hash(const void *key, unsigned keySize)
+{
+   const unsigned *words = (const unsigned *) key;
+   const unsigned num_words = keySize / 4;
+   unsigned result = 0;
+   unsigned w;
+
+   /* XOR together each word scaled by its 1-based position */
+   for (w = 0; w < num_words; w++)
+      result ^= (w + 1) * words[w];
+
+   return result;
+}
+
+
+/**
+ * Allocate and initialize an empty keymap.
+ * \param keySize  size of each key, in bytes
+ * \param maxEntries  max number of entries to allow (~0 = infinity);
+ *                    stored but not enforced yet
+ * \param deleteFunc  optional callback invoked when entries are
+ *                    deleted/replaced; NULL selects a default that FREEs the data
+ * \return the new map, or NULL if an allocation failed
+ */
+struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+                keymap_delete_func deleteFunc)
+{
+   struct keymap *map = MALLOC_STRUCT(keymap);
+
+   if (map) {
+      map->cso = cso_hash_create();
+      if (!map->cso) {
+         FREE(map);
+         return NULL;
+      }
+      map->key_size = keySize;
+      map->max_entries = maxEntries;
+      map->num_entries = 0;
+      map->delete_func = deleteFunc ? deleteFunc : default_delete_func;
+   }
+   return map;
+}
+
+
+/**
+ * Delete/free a keymap and all entries.  The deleteFunc that was given at
+ * create time will be called for each entry.  \p map must not be NULL.
+ * \param user  user-provided pointer passed through to the delete callback
+ */
+void
+util_delete_keymap(struct keymap *map, void *user)
+{
+   util_keymap_remove_all(map, user);
+   cso_hash_delete(map->cso);
+   FREE(map);
+}
+
+
+/* Starting at the first node for key_hash, advance until an entry's key bytes match. */
+static INLINE struct cso_hash_iter
+hash_table_find_iter(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   struct cso_hash_iter iter;
+
+   for (iter = cso_hash_find(map->cso, key_hash);
+        !cso_hash_iter_is_null(iter);
+        iter = cso_hash_iter_next(iter)) {
+      const struct keymap_item *item = hash_table_item(iter);
+      if (memcmp(item->key, key, map->key_size) == 0)
+         break;
+   }
+   /* a null iterator here means no entry matched */
+   return iter;
+}
+
+
+/**
+ * Like hash_table_find_iter(), but return the matching item itself,
+ * or NULL when no entry has the given key.
+ */
+static INLINE struct keymap_item *
+hash_table_find_item(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   struct cso_hash_iter iter = hash_table_find_iter(map, key, key_hash);
+
+   return cso_hash_iter_is_null(iter) ? NULL : hash_table_item(iter);
+}
+
+
+/**
+ * Insert a new key + data pointer into the table (the key is copied, the
+ * data is not).  If the key is already present, the delete callback is called
+ * on the previous entry and its value is replaced.
+ * XXX max_entries is not enforced yet, so no old entry is ever evicted.
+ * \return TRUE on success, FALSE if an allocation or the hash insert failed
+ */
+boolean
+util_keymap_insert(struct keymap *map, const void *key,
+                   const void *data, void *user)
+{
+   unsigned key_hash;
+   struct keymap_item *item;
+   struct cso_hash_iter iter;
+
+   assert(map);
+   key_hash = hash(key, map->key_size);
+   item = hash_table_find_item(map, key, key_hash);
+   if (item) {
+      /* call delete callback for old entry/item */
+      map->delete_func(map, item->key, item->value, user);
+      item->value = (void *) data;
+      return TRUE;
+   }
+
+   item = MALLOC_STRUCT(keymap_item);
+   if (!item)
+      return FALSE;
+
+   item->key = mem_dup(key, map->key_size);
+   if (!item->key) {
+      FREE(item);   /* a NULL key would be handed to memcmp() by later lookups */
+      return FALSE;
+   }
+   item->value = (void *) data;
+
+   iter = cso_hash_insert(map->cso, key_hash, item);
+   if (cso_hash_iter_is_null(iter)) {
+      FREE(item->key);   /* don't leak the key copy */
+      FREE(item);
+      return FALSE;
+   }
+   map->num_entries++;
+   return TRUE;
+}
+
+
+/**
+ * Find the entry whose key bytes equal \p key.
+ * \return the data pointer stored for that key, or NULL if the key is absent
+ *         (a stored NULL data pointer is indistinguishable from a miss)
+ */
+const void *
+util_keymap_lookup(const struct keymap *map, const void *key)
+{
+   const struct keymap_item *item;
+
+   assert(map);
+
+   item = hash_table_find_item(map, key, hash(key, map->key_size));
+   if (item) {
+      return item->value;
+   }
+
+   return NULL;
+}
+
+
+/**
+ * Remove an entry from the map.
+ * If the key is found, the delete callback is invoked for it, the map's
+ * copy of the key is freed and the entry count drops by one; otherwise
+ * nothing happens.
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove(struct keymap *map, const void *key, void *user)
+{
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+
+   assert(map);
+
+   iter = hash_table_find_iter(map, key, hash(key, map->key_size));
+   if (!cso_hash_iter_is_null(iter)) {
+      item = hash_table_item(iter);
+      assert(item);
+
+      /* hand the value to the client's callback before freeing our own storage */
+      map->delete_func(map, item->key, item->value, user);
+      FREE(item->key);
+      FREE(item);
+
+      map->num_entries--;
+
+      cso_hash_erase(map->cso, iter);
+   }
+}
+
+
+/**
+ * Remove all entries (calling the delete callback for each); the entry count resets to 0.
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove_all(struct keymap *map, void *user)
+{
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+
+   assert(map);
+
+   for (iter = cso_hash_first_node(map->cso);
+        !cso_hash_iter_is_null(iter);
+        iter = cso_hash_first_node(map->cso)) {
+      item = (struct keymap_item *) cso_hash_take(map->cso, cso_hash_iter_key(iter));
+      map->delete_func(map, item->key, item->value, user);
+      FREE(item->key);
+      FREE(item);
+   }
+   map->num_entries = 0;   /* keep the count in sync with the now-empty table */
+}
+
+/** Debug helper: print the map's address and its current and maximum entry counts. */
+extern void
+util_keymap_info(const struct keymap *map)
+{
+   debug_printf("Keymap %p: %u of max %u entries\n",
+                (void *) map, map->num_entries, map->max_entries);
+}
diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h
new file mode 100644
index 0000000000..8d60a76fc3
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.h
@@ -0,0 +1,68 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef U_KEYMAP_H
+#define U_KEYMAP_H
+
+#include "pipe/p_compiler.h"
+
+
+/** opaque keymap type */
+struct keymap;
+
+
+/** Delete callback: called as (map, key, data, user) for each entry that gets removed */
+typedef void (*keymap_delete_func)(const struct keymap *map,
+                                   const void *key, void *data,
+                                   void *user);
+
+/** Create a keymap.  NOTE(review): keySize is presumably the key length in bytes and maxEntries the capacity -- confirm in u_keymap.c */
+extern struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+                keymap_delete_func deleteFunc);
+/** Destroy a keymap.  NOTE(review): user is presumably forwarded to the delete callback -- confirm in u_keymap.c */
+extern void
+util_delete_keymap(struct keymap *map, void *user);
+/** Add a key/data pair.  NOTE(review): the return value and duplicate-key handling are defined in u_keymap.c -- confirm there */
+extern boolean
+util_keymap_insert(struct keymap *map, const void *key,
+                   const void *data, void *user);
+/** Look up the value stored for key; returns NULL when the key is not in the map. */
+extern const void *
+util_keymap_lookup(const struct keymap *map, const void *key);
+/** Remove the entry for key, if present, after calling the delete callback on it (user is its last argument). */
+extern void
+util_keymap_remove(struct keymap *map, const void *key, void *user);
+/** Remove every entry, calling the delete callback for each (user is its last argument). */
+extern void
+util_keymap_remove_all(struct keymap *map, void *user);
+/** Print the map's current and maximum entry counts with debug_printf(). */
+extern void
+util_keymap_info(const struct keymap *map);
+
+
+#endif /* U_KEYMAP_H */
diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c
index 0729114d6a..5b3cab4642 100644
--- a/src/gallium/auxiliary/util/u_math.c
+++ b/src/gallium/auxiliary/util/u_math.c
@@ -30,7 +30,7 @@
 #include "util/u_math.h"
 
 
-
+/** 2^x, for x in [-1.0, 1.0[; entry i holds 2^((i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE) */
 float pow2_table[POW2_TABLE_SIZE];
 
 
@@ -38,9 +38,21 @@ static void
 init_pow2_table(void)
 {
    int i;
-   for (i = 0; i < POW2_TABLE_SIZE; i++) {
-      pow2_table[i] = (float) pow(2.0, i / POW2_TABLE_SCALE);
-   }
+   for (i = 0; i < POW2_TABLE_SIZE; i++) /* entry i samples x = (i - OFFSET) / SCALE */
+      pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
+}
+
+
+/** Lookup table holding log2(x) for x in [1.0, 2.0[ */
+float log2_table[LOG2_TABLE_SIZE];
+
+/** Fill log2_table: entry n receives log2(1 + n / LOG2_TABLE_SIZE). */
+static void init_log2_table(void)
+{
+   const double step = 1.0 / LOG2_TABLE_SIZE;
+   unsigned n;
+   for (n = 0; n < LOG2_TABLE_SIZE; n++)
+      log2_table[n] = (float) log2(1.0 + n * step);
 }
 
 
@@ -53,6 +65,7 @@ util_init_math(void)
    static boolean initialized = FALSE;
    if (!initialized) {
       init_pow2_table();
+      init_log2_table();
       initialized = TRUE;
    }
 }
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 0b10622ee7..be7303e550 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -40,6 +40,7 @@
 
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
 
 
 #ifdef __cplusplus
@@ -173,8 +174,10 @@ static INLINE float logf( float f )
 
 
 
-#define POW2_TABLE_SIZE 256
-#define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1))
+#define POW2_TABLE_SIZE_LOG2 9 /* log2 of the table length */
+#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2)
+#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2) /* index of the entry for x == 0 */
+#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2)) /* table entries per unit of x */
 extern float pow2_table[POW2_TABLE_SIZE];
 
 
@@ -185,98 +188,78 @@ util_init_math(void);
 
 union fi {
    float f;
-   int i;
-   unsigned ui;
+   int32_t i;   /* bits of f viewed as a signed 32-bit integer */
+   uint32_t ui; /* bits of f viewed as an unsigned 32-bit integer */
 };
 
 
 /**
- * Fast approximation to exp(x).
- * Compute with base 2 exponents:  exp(x) = exp2(log2(e) * x)
- * Note: log2(e) is a constant, k = 1.44269
- * So, exp(x) = exp2(k * x);
+ * Fast version of 2^x (saturates to FLT_MAX for x >= 128, returns 0 for x < -126.99999)
  * Identity: exp2(a + b) = exp2(a) * exp2(b)
- * Let ipart = int(k*x)
- * Let fpart = k*x - ipart;
- * So, exp2(k*x) = exp2(ipart) * exp2(fpart)
+ * Let ipart = int(x)
+ * Let fpart = x - ipart;
+ * So, exp2(x) = exp2(ipart) * exp2(fpart)
  * Compute exp2(ipart) with i << ipart
  * Compute exp2(fpart) with lookup table.
  */
 static INLINE float
-util_fast_exp(float x)
+util_fast_exp2(float x)
 {
-   if (x >= 0.0f) {
-      float k = 1.44269f; /* = log2(e) */
-      float kx = k * x;
-      int ipart = (int) kx;
-      float fpart = kx - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return y;
-   }
-   else {
-      /* exp(-x) = 1.0 / exp(x) */
-      float k = -1.44269f;
-      float kx = k * x;
-      int ipart = (int) kx;
-      float fpart = kx - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return 1.0f / y;
-   }
+   int32_t ipart;
+   float fpart, mpart;
+   union fi epart;
+   /* 2^128 and above fit neither a float nor the 8-bit exponent field */
+   if(x >= 128.0f)
+      return 3.402823466e+38f;
+   /* below the smallest normal exponent (-126) the result is flushed to 0 */
+   if(x < -126.99999f)
+      return 0.0f;
+
+   ipart = (int32_t) x;          /* truncates toward zero */
+   fpart = x - (float) ipart;    /* in ]-1, 1[, same sign as x */
+
+   /* same as
+    *   epart.f = (float) (1 << ipart)
+    * but faster and without integer overflow for ipart > 31 */
+   epart.i = (ipart + 127 ) << 23;
+   /* fpart in ]-1, 1[ selects table entries 1 .. POW2_TABLE_SIZE-1 */
+   mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)];
+
+   return epart.f * mpart;
 }
 
 
 /**
- * Fast version of 2^x
- * XXX the above function could be implemented in terms of this one.
+ * Fast approximation to exp(x), computed as exp2(x * log2(e)).
  */
 static INLINE float
-util_fast_exp2(float x)
+util_fast_exp(float x)
 {
-   if (x >= 0.0f) {
-      int ipart = (int) x;
-      float fpart = x - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return y;
-   }
-   else {
-      /* exp(-x) = 1.0 / exp(x) */
-      int ipart = (int) -x;
-      float fpart = -x - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return 1.0f / y;
-   }
+   /* log2(e) ~= 1.44269, so e^x == 2^(1.44269 * x) */
+   return util_fast_exp2(1.44269f * x);
 }
 
 
-/**
- * Based on code from http://www.flipcode.com/totd/
- */
+#define LOG2_TABLE_SIZE_LOG2 8
+#define LOG2_TABLE_SIZE (1 << LOG2_TABLE_SIZE_LOG2)
+extern float log2_table[LOG2_TABLE_SIZE];
+
+/** Fast approximation of log2(x): unbiased exponent plus a table lookup on the top mantissa bits; no special-casing of x <= 0 or denormals. */
 static INLINE float
-util_fast_log2(float val)
+util_fast_log2(float x)
 {
    union fi num;
-   int log_2;
-   num.f = val;
-   log_2 = ((num.i >> 23) & 255) - 128;
-   num.i &= ~(255 << 23);
-   num.i += 127 << 23;
-   num.f = ((-1.0f/3) * num.f + 2) * num.f - 2.0f/3;
-   return num.f + log_2;
+   float epart, mpart;
+   num.f = x;
+   epart = (float)(((num.i & 0x7f800000) >> 23) - 127); /* exponent field minus the bias of 127 */
+   mpart = log2_table[(num.i & 0x007fffff) >> (23 - LOG2_TABLE_SIZE_LOG2)]; /* indexed by the top LOG2_TABLE_SIZE_LOG2 mantissa bits */
+   return epart + mpart;
 }
 
 
 static INLINE float
 util_fast_pow(float x, float y)
 {
-   /* XXX these tests may need adjustment */
-   if (y >= 3.0f && (-0.02f <= x && x <= 0.02f))
-      return 0.0f;
-   if (y >= 50.0f && (-0.9f <= x && x <= 0.9f))
-      return 0.0f;
    return util_fast_exp2(util_fast_log2(x) * y);
 }
 
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
new file mode 100644
index 0000000000..e2a8491e62
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -0,0 +1,77 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * SSE intrinsics portability header.
+ * 
+ * Although the SSE intrinsics are supported by all modern x86 and x86-64
+ * compilers, there are some intrinsics missing in some implementations
+ * (especially older MSVC versions). This header abstracts that away.
+ */
+
+#ifndef U_SSE_H_
+#define U_SSE_H_
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_SSE)
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+
+/* MSVC before VC9 (_MSC_VER < 1500) does not support the _mm_castxxx_yyy intrinsics */
+#if defined(_MSC_VER) && _MSC_VER < 1500
+
+union __declspec(align(16)) m128_types { /* lets the cast helpers reinterpret one vector type as another */
+   __m128 m128;
+   __m128i m128i;
+   __m128d m128d;
+};
+
+/** Bit-for-bit reinterpretation of an __m128i as an __m128 (no value conversion). */
+static __inline __m128 _mm_castsi128_ps(__m128i a)
+{
+   union m128_types pun;
+   pun.m128i = a;
+   return pun.m128;
+}
+
+/** Bit-for-bit reinterpretation of an __m128 as an __m128i (no value conversion). */
+static __inline __m128i _mm_castps_si128(__m128 a)
+{
+   union m128_types pun;
+   pun.m128 = a;
+   return pun.m128i;
+}
+
+#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
+
+#endif /* PIPE_ARCH_SSE */
+
+#endif /* U_SSE_H_ */