path: root/src/gallium
Diffstat (limited to 'src/gallium')
-rw-r--r--  src/gallium/auxiliary/draw/draw_pt.c              | 113
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_aos.c          |  17
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_exec.c         |   6
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_ppc.c          |  53
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_sse.c          |  16
-rw-r--r--  src/gallium/auxiliary/gallivm/gallivm_cpu.cpp     |   6
-rw-r--r--  src/gallium/auxiliary/gallivm/storagesoa.cpp      |  44
-rw-r--r--  src/gallium/auxiliary/gallivm/storagesoa.h        |  10
-rw-r--r--  src/gallium/auxiliary/gallivm/tgsitollvm.cpp      |  11
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_buffer.h      |  15
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c |   3
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c   |  12
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_execmem.c       |  19
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc.c           |  79
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc.h           |   9
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c       |  57
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_exec.c            | 104
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_exec.h            |  25
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_ppc.c             | 767
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_sanity.c          |  14
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_sse2.c            | 153
-rw-r--r--  src/gallium/auxiliary/util/p_debug.c              |  39
-rw-r--r--  src/gallium/auxiliary/util/u_math.h               |   2
-rw-r--r--  src/gallium/auxiliary/util/u_mm.c                 |  16
-rw-r--r--  src/gallium/auxiliary/util/u_mm.h                 |  12
-rw-r--r--  src/gallium/auxiliary/util/u_rect.c               |   3
-rw-r--r--  src/gallium/auxiliary/util/u_tile.c               |  35
-rw-r--r--  src/gallium/auxiliary/util/u_time.c               |   2
-rw-r--r--  src/gallium/drivers/cell/common.h                 |   2
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_batch.c         |  19
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_clear.c         |  13
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_context.h       |   2
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_fence.c         |  14
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_flush.c         |   2
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_gen_fp.c        | 103
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_gen_fragment.c  | 332
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_spu.h           |   1
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_texture.c       | 172
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_texture.h       |   1
-rw-r--r--  src/gallium/drivers/cell/spu/spu_command.c        |   5
-rw-r--r--  src/gallium/drivers/cell/spu/spu_funcs.c          |  28
-rw-r--r--  src/gallium/drivers/cell/spu/spu_main.h           |   3
-rw-r--r--  src/gallium/drivers/cell/spu/spu_render.c         |   4
-rw-r--r--  src/gallium/drivers/cell/spu/spu_tri.c            |   2
-rw-r--r--  src/gallium/drivers/softpipe/sp_fs_sse.c          |   5
-rw-r--r--  src/gallium/drivers/softpipe/sp_quad_output.c     |   8
-rw-r--r--  src/gallium/include/pipe/p_debug.h                |   2
-rw-r--r--  src/gallium/include/pipe/p_inlines.h              |  30
48 files changed, 1806 insertions, 584 deletions
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 87ec6ae20c..3c175f31d8 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -33,6 +33,8 @@
#include "draw/draw_context.h"
#include "draw/draw_private.h"
#include "draw/draw_pt.h"
+#include "draw/draw_vs.h"
+#include "tgsi/tgsi_dump.h"
static unsigned trim( unsigned count, unsigned first, unsigned incr )
{
@@ -176,6 +178,92 @@ void draw_pt_destroy( struct draw_context *draw )
}
+/**
+ * Debug: print the first 'count' vertices.
+ */
+static void
+draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
+{
+ uint i;
+
+ debug_printf("Draw arrays(prim = %u, start = %u, count = %u)\n",
+ prim, start, count);
+
+ for (i = 0; i < count; i++) {
+ uint ii, j;
+
+ if (draw->pt.user.elts) {
+ /* indexed arrays */
+ switch (draw->pt.user.eltSize) {
+ case 1:
+ {
+ const ubyte *elem = (const ubyte *) draw->pt.user.elts;
+ ii = elem[start + i];
+ }
+ break;
+ case 2:
+ {
+ const ushort *elem = (const ushort *) draw->pt.user.elts;
+ ii = elem[start + i];
+ }
+ break;
+ case 4:
+ {
+ const uint *elem = (const uint *) draw->pt.user.elts;
+ ii = elem[start + i];
+ }
+ break;
+ default:
+ assert(0);
+ }
+ debug_printf("Element[%u + %u] -> Vertex %u:\n", start, i, ii);
+ }
+ else {
+ /* non-indexed arrays */
+ ii = start + i;
+ debug_printf("Vertex %u:\n", ii);
+ }
+
+ for (j = 0; j < draw->pt.nr_vertex_elements; j++) {
+ uint buf = draw->pt.vertex_element[j].vertex_buffer_index;
+ ubyte *ptr = (ubyte *) draw->pt.user.vbuffer[buf];
+ ptr += draw->pt.vertex_buffer[buf].pitch * ii;
+ ptr += draw->pt.vertex_element[j].src_offset;
+
+ debug_printf(" Attr %u: ", j);
+ switch (draw->pt.vertex_element[j].src_format) {
+ case PIPE_FORMAT_R32_FLOAT:
+ {
+ float *v = (float *) ptr;
+ debug_printf("%f @ %p\n", v[0], (void *) v);
+ }
+ break;
+ case PIPE_FORMAT_R32G32_FLOAT:
+ {
+ float *v = (float *) ptr;
+ debug_printf("%f %f @ %p\n", v[0], v[1], (void *) v);
+ }
+ break;
+ case PIPE_FORMAT_R32G32B32_FLOAT:
+ {
+ float *v = (float *) ptr;
+ debug_printf("%f %f %f @ %p\n", v[0], v[1], v[2], (void *) v);
+ }
+ break;
+ case PIPE_FORMAT_R32G32B32A32_FLOAT:
+ {
+ float *v = (float *) ptr;
+ debug_printf("%f %f %f %f @ %p\n", v[0], v[1], v[2], v[3],
+ (void *) v);
+ }
+ break;
+ default:
+ debug_printf("other format (fix me)\n");
+ ;
+ }
+ }
+ }
+}
/**
@@ -195,6 +283,31 @@ draw_arrays(struct draw_context *draw, unsigned prim,
draw->reduced_prim = reduced_prim;
}
+ if (0)
+ draw_print_arrays(draw, prim, start, MIN2(count, 20));
+
+#if 0
+ {
+ int i;
+ debug_printf("draw_arrays(prim=%u start=%u count=%u):\n",
+ prim, start, count);
+ tgsi_dump(draw->vs.vertex_shader->state.tokens, 0);
+ debug_printf("Elements:\n");
+ for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
+ debug_printf(" format=%s comps=%u\n",
+ pf_name(draw->pt.vertex_element[i].src_format),
+ draw->pt.vertex_element[i].nr_components);
+ }
+ debug_printf("Buffers:\n");
+ for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+ debug_printf(" pitch=%u offset=%u ptr=%p\n",
+ draw->pt.vertex_buffer[i].pitch,
+ draw->pt.vertex_buffer[i].buffer_offset,
+ draw->pt.user.vbuffer[i]);
+ }
+ }
+#endif
+
/* drawing done here: */
draw_pt_arrays(draw, prim, start, count);
}
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 87232865e2..6141ba9cbf 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -1632,6 +1632,17 @@ static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_inst
return TRUE;
}
+static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+ struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+ struct x86_reg tmp0 = aos_get_xmm_reg(cp);
+
+ sse2_cvttps2dq(cp->func, tmp0, arg0);
+ sse2_cvtdq2ps(cp->func, tmp0, tmp0);
+
+ store_dest(cp, &op->FullDstRegisters[0], tmp0);
+ return TRUE;
+}
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
@@ -1770,6 +1781,9 @@ emit_instruction( struct aos_compilation *cp,
case TGSI_OPCODE_SIN:
return emit_SIN(cp, inst);
+ case TGSI_OPCODE_TRUNC:
+ return emit_TRUNC(cp, inst);
+
case TGSI_OPCODE_END:
return TRUE;
@@ -2176,7 +2190,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
if (!vaos->buffer)
goto fail;
- debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
+ if (0)
+ debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
#if 0
tgsi_dump(vs->state.tokens, 0);
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 13d4fcfdbf..80c3606657 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -123,6 +123,12 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
input = (const float (*)[4])((const char *)input + input_stride);
}
+ tgsi_set_exec_mask(machine,
+ 1,
+ max_vertices > 1,
+ max_vertices > 2,
+ max_vertices > 3);
+
/* run interpreter */
tgsi_exec_machine_run( machine );
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index 8eff6d4fda..8b75136144 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -54,31 +54,16 @@
typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
float (*outputs)[4][4],
float (*temps)[4][4],
- float (*immeds)[4][4],
+ float (*immeds)[4],
float (*consts)[4],
const float *builtins);
-#if 0
- const struct tgsi_exec_vector *input,
- struct tgsi_exec_vector *output,
- float (*constant)[4], /* 3 */
- struct tgsi_exec_vector *temporary, /* 4 */
- float (*immediates)[4], /* 5 */
- const float (*aos_input)[4], /* 6 */
- uint num_inputs, /* 7 */
- uint input_stride, /* 8 */
- float (*aos_output)[4], /* 9 */
- uint num_outputs, /* 10 */
- uint output_stride ); /* 11 */
-#endif
struct draw_ppc_vertex_shader {
struct draw_vertex_shader base;
struct ppc_function ppc_program;
codegen_function func;
-
- struct tgsi_exec_machine *machine;
};
@@ -86,11 +71,12 @@ static void
vs_ppc_prepare( struct draw_vertex_shader *base,
struct draw_context *draw )
{
+ /* nothing */
}
-
-/* Simplified vertex shader interface for the pt paths. Given the
+/**
+ * Simplified vertex shader interface for the pt paths. Given the
* complexity of code-generating all the above operations together,
* it's time to try doing all the other stuff separately.
*/
@@ -104,7 +90,6 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
unsigned output_stride )
{
struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
- struct tgsi_exec_machine *machine = shader->machine;
unsigned int i;
#define MAX_VERTICES 4
@@ -137,27 +122,11 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
/* run compiled shader
*/
-#if 0
- shader->func(machine->Inputs,
- machine->Outputs,
- (float (*)[4])constants,
- machine->Temps,
- (float (*)[4])shader->base.immediates,
- input,
- base->info.num_inputs,
- input_stride,
- output,
- base->info.num_outputs,
- output_stride );
-#else
shader->func(inputs_soa, outputs_soa, temps_soa,
- (float (*)[4][4]) shader->base.immediates,
+ (float (*)[4]) shader->base.immediates,
(float (*)[4]) constants,
ppc_builtin_constants);
- /*output[0][0] = input[0][0] * 0.5;*/
-#endif
-
/* convert (up to) four output verts from SoA back to AoS format */
for (attr = 0; attr < base->info.num_outputs; attr++) {
float *vOut = (float *) output;
@@ -183,8 +152,6 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
}
-
-
static void
vs_ppc_delete( struct draw_vertex_shader *base )
{
@@ -201,7 +168,7 @@ vs_ppc_delete( struct draw_vertex_shader *base )
struct draw_vertex_shader *
draw_create_vs_ppc(struct draw_context *draw,
- const struct pipe_shader_state *templ)
+ const struct pipe_shader_state *templ)
{
struct draw_ppc_vertex_shader *vs;
@@ -227,16 +194,14 @@ draw_create_vs_ppc(struct draw_context *draw,
vs->base.run_linear = vs_ppc_run_linear;
vs->base.delete = vs_ppc_delete;
- vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * 4 *
+ vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
sizeof(float), 16);
- vs->machine = &draw->vs.machine;
-
- ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */
+ ppc_init_func( &vs->ppc_program );
if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
&vs->ppc_program,
- (float (*)[4])vs->base.immediates,
+ (float (*)[4]) vs->base.immediates,
TRUE ))
goto fail;
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index b11ae31662..77ba5152f9 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -37,7 +37,7 @@
#include "draw_vs.h"
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
#include "pipe/p_shader_tokens.h"
@@ -99,9 +99,23 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
struct tgsi_exec_machine *machine = shader->machine;
unsigned int i;
+ /* By default, execute all channels. XXX move this inside the loop
+ * below when we support shader conditionals/loops.
+ */
+ tgsi_set_exec_mask(machine, 1, 1, 1, 1);
+
for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
+ if (max_vertices < 4) {
+ /* disable the unused execution channels */
+ tgsi_set_exec_mask(machine,
+ 1,
+ max_vertices > 1,
+ max_vertices > 2,
+ 0);
+ }
+
/* run compiled shader
*/
shader->func(machine->Inputs,
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index 3a2f2878a3..93a9748bdb 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -179,8 +179,7 @@ struct gallivm_cpu_engine * gallivm_global_cpu_engine()
typedef void (*vertex_shader_runner)(void *ainputs,
void *dests,
- float (*aconsts)[4],
- void *temps);
+ float (*aconsts)[4]);
#define MAX_TGSI_VERTICES 4
/*!
@@ -223,8 +222,7 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
/* run shader */
runner(machine->Inputs,
machine->Outputs,
- (float (*)[4]) constants,
- machine->Temps);
+ (float (*)[4]) constants);
/* Unswizzle all output results
*/
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 4fc075cf6d..e1e5cabcf5 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -48,13 +48,11 @@ using namespace llvm;
StorageSoa::StorageSoa(llvm::BasicBlock *block,
llvm::Value *input,
llvm::Value *output,
- llvm::Value *consts,
- llvm::Value *temps)
+ llvm::Value *consts)
: m_block(block),
m_input(input),
m_output(output),
m_consts(consts),
- m_temps(temps),
m_immediates(0),
m_idx(0)
{
@@ -169,7 +167,7 @@ std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder,
{
llvm::Value* res;
std::vector<llvm::Value*> res2(4);
- llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
+ llvm::Value *xChannel;
xChannel = elementPointer(m_consts, idx, 0);
@@ -195,14 +193,15 @@ std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
return res;
}
-std::vector<llvm::Value*> StorageSoa::tempElement(llvm::Value *idx)
+std::vector<llvm::Value*> StorageSoa::tempElement(llvm::IRBuilder<>* m_builder, int idx)
{
std::vector<llvm::Value*> res(4);
+ llvm::Value *temp = m_temps[idx];
- res[0] = element(m_temps, idx, 0);
- res[1] = element(m_temps, idx, 1);
- res[2] = element(m_temps, idx, 2);
- res[3] = element(m_temps, idx, 3);
+ res[0] = element(temp, constantInt(0), 0);
+ res[1] = element(temp, constantInt(0), 1);
+ res[2] = element(temp, constantInt(0), 2);
+ res[3] = element(temp, constantInt(0), 3);
return res;
}
@@ -326,7 +325,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
val = outputElement(realIndex);
break;
case TGSI_FILE_TEMPORARY:
- val = tempElement(realIndex);
+ val = tempElement(m_builder, idx);
break;
case TGSI_FILE_CONSTANT:
val = constElement(m_builder, realIndex);
@@ -355,19 +354,39 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
return res;
}
+llvm::Value * StorageSoa::allocaTemp(llvm::IRBuilder<>* m_builder)
+{
+ VectorType *vector = VectorType::get(Type::FloatTy, 4);
+ ArrayType *vecArray = ArrayType::get(vector, 4);
+ AllocaInst *alloca = new AllocaInst(vecArray, "temp",
+ m_builder->GetInsertBlock());
+
+ return alloca;
+}
+
+
void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
- int mask)
+ int mask, llvm::IRBuilder<>* m_builder)
{
llvm::Value *out = 0;
+ llvm::Value *realIndex = 0;
switch(type) {
case TGSI_FILE_OUTPUT:
out = m_output;
+ realIndex = constantInt(idx);
break;
case TGSI_FILE_TEMPORARY:
- out = m_temps;
+ // if that temp doesn't already exist, alloca it
+ if (m_temps.find(idx) == m_temps.end())
+ m_temps[idx] = allocaTemp(m_builder);
+
+ out = m_temps[idx];
+
+ realIndex = constantInt(0);
break;
case TGSI_FILE_INPUT:
out = m_input;
+ realIndex = constantInt(idx);
break;
case TGSI_FILE_ADDRESS: {
llvm::Value *addr = m_addresses[idx];
@@ -385,7 +404,6 @@ void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm
assert(0);
break;
}
- llvm::Value *realIndex = constantInt(idx);
if ((mask & TGSI_WRITEMASK_X)) {
llvm::Value *xChannel = elementPointer(out, realIndex, 0);
new StoreInst(val[0], xChannel, false, m_block);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index f21ca6ec43..56886f85e7 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -52,14 +52,13 @@ public:
StorageSoa(llvm::BasicBlock *block,
llvm::Value *input,
llvm::Value *output,
- llvm::Value *consts,
- llvm::Value *temps);
+ llvm::Value *consts);
std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle,
llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
- int mask);
+ int mask, llvm::IRBuilder<>* m_builder);
void addImmediate(float *vec);
void declareImmediates();
@@ -84,7 +83,7 @@ private:
llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
- std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
+ std::vector<llvm::Value*> tempElement(llvm::IRBuilder<>* m_builder, int idx);
std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
private:
llvm::BasicBlock *m_block;
@@ -92,12 +91,13 @@ private:
llvm::Value *m_input;
llvm::Value *m_output;
llvm::Value *m_consts;
- llvm::Value *m_temps;
+ std::map<int, llvm::Value*> m_temps;
llvm::GlobalVariable *m_immediates;
std::map<int, llvm::Value*> m_addresses;
std::vector<std::vector<float> > m_immediatesToFlush;
+ llvm::Value * allocaTemp(llvm::IRBuilder<>* m_builder);
mutable std::map<int, llvm::ConstantInt*> m_constInts;
mutable char m_name[32];
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 1191a6cae9..c11b88af9e 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -53,7 +53,6 @@ static inline FunctionType *vertexShaderFunctionType()
// [4 x <4 x float>] inputs,
// [4 x <4 x float>] output,
// [4 x [1 x float]] consts,
- // [4 x <4 x float>] temps
std::vector<const Type*> funcArgs;
VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -67,7 +66,6 @@ static inline FunctionType *vertexShaderFunctionType()
funcArgs.push_back(vectorArrayPtr);//inputs
funcArgs.push_back(vectorArrayPtr);//output
funcArgs.push_back(constsArrayPtr);//consts
- funcArgs.push_back(vectorArrayPtr);//temps
FunctionType *functionType = FunctionType::get(
/*Result=*/Type::VoidTy,
@@ -246,7 +244,6 @@ translate_instruction(llvm::Module *module,
val = storage->constElement(src->SrcRegister.Index, indIdx);
} else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
val = storage->inputElement(src->SrcRegister.Index, indIdx);
- // FIXME we should not be generating elements for temporaries, this creates useless memory writes
} else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
val = storage->tempElement(src->SrcRegister.Index);
} else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
@@ -677,7 +674,6 @@ translate_instruction(llvm::Module *module,
if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
- // FIXME we should not be generating elements for temporaries, this creates useless memory writes
} else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
} else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
@@ -1027,7 +1023,8 @@ translate_instructionir(llvm::Module *module,
for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
storage->store((enum tgsi_file_type)dst->DstRegister.File,
- dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+ dst->DstRegister.Index, out, dst->DstRegister.WriteMask,
+ instr->getIRBuilder() );
}
}
@@ -1122,8 +1119,6 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
output->setName("outputs");
Value *consts = args++;
consts->setName("consts");
- Value *temps = args++;
- temps->setName("temps");
BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0);
@@ -1132,7 +1127,7 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
fi = tgsi_default_full_instruction();
fd = tgsi_default_full_declaration();
- StorageSoa storage(label_entry, input, output, consts, temps);
+ StorageSoa storage(label_entry, input, output, consts);
InstructionsSoa instr(mod, shader, label_entry, &storage);
while(!tgsi_parse_end_of_tokens(&parse)) {
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 8505d333bd..19db8a6a91 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -177,12 +177,16 @@ pb_get_base_buffer( struct pb_buffer *buf,
}
+/**
+ * Don't call this directly. Use pb_reference instead.
+ */
static INLINE void
pb_destroy(struct pb_buffer *buf)
{
assert(buf);
if(!buf)
return;
+ assert(buf->base.refcount == 0);
buf->vtbl->destroy(buf);
}
@@ -193,11 +197,16 @@ static INLINE void
pb_reference(struct pb_buffer **dst,
struct pb_buffer *src)
{
- if (src)
+ if (src) {
+ assert(src->base.refcount);
src->base.refcount++;
+ }
- if (*dst && --(*dst)->base.refcount == 0)
- pb_destroy( *dst );
+ if (*dst) {
+ assert((*dst)->base.refcount);
+ if(--(*dst)->base.refcount == 0)
+ pb_destroy( *dst );
+ }
*dst = src;
}
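
For context, a minimal usage sketch (illustrative only, not part of the patch) of the reference-counting discipline these pb_buffer.h changes enforce: pb_destroy() is now asserted to be internal, so buffers are released exclusively through pb_reference(), which is exactly what the pb_bufmgr_fenced.c hunk below switches to. The mgr, size and desc names are placeholders.

   struct pb_buffer *buf = mgr->create_buffer(mgr, size, &desc);  /* refcount == 1 */
   struct pb_buffer *view = NULL;

   pb_reference(&view, buf);    /* share it: refcount == 2 */
   /* ... use buf / view ... */
   pb_reference(&view, NULL);   /* refcount == 1 */
   pb_reference(&buf, NULL);    /* refcount == 0 -> pb_destroy() is called */
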
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index 633ee70a75..e2594ea236 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -86,8 +86,7 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
fenced_buf = fenced_buffer_create(fenced_mgr->fenced_list, buf);
if(!fenced_buf) {
- assert(buf->base.refcount == 1);
- pb_destroy(buf);
+ pb_reference(&buf, NULL);
}
return fenced_buf;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index fe80ca30ee..a976d3041a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -100,7 +100,7 @@ mm_buffer_destroy(struct pb_buffer *buf)
assert(buf->base.refcount == 0);
pipe_mutex_lock(mm->mutex);
- mmFreeMem(mm_buf->block);
+ u_mmFreeMem(mm_buf->block);
FREE(buf);
pipe_mutex_unlock(mm->mutex);
}
@@ -175,14 +175,14 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
mm_buf->mgr = mm;
- mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+ mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
if(!mm_buf->block) {
debug_printf("warning: heap full\n");
#if 0
mmDumpMemInfo(mm->heap);
#endif
- mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+ mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
if(!mm_buf->block) {
FREE(mm_buf);
pipe_mutex_unlock(mm->mutex);
@@ -213,7 +213,7 @@ mm_bufmgr_destroy(struct pb_manager *mgr)
pipe_mutex_lock(mm->mutex);
- mmDestroy(mm->heap);
+ u_mmDestroy(mm->heap);
pb_unmap(mm->buffer);
pb_reference(&mm->buffer, NULL);
@@ -254,7 +254,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
if(!mm->map)
goto failure;
- mm->heap = mmInit(0, size);
+ mm->heap = u_mmInit(0, size);
if (!mm->heap)
goto failure;
@@ -262,7 +262,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
failure:
if(mm->heap)
- mmDestroy(mm->heap);
+ u_mmDestroy(mm->heap);
if(mm->map)
pb_unmap(mm->buffer);
if(mm)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 19087589a8..be7433baf8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -38,12 +38,13 @@
#include "rtasm_execmem.h"
-#if defined(__linux__)
+#if defined(PIPE_OS_LINUX)
+
/*
* Allocate a large block of memory which can hold code then dole it out
* in pieces by means of the generic memory manager code.
-*/
+ */
#include <unistd.h>
#include <sys/mman.h>
@@ -62,7 +63,7 @@ static void
init_heap(void)
{
if (!exec_heap)
- exec_heap = mmInit( 0, EXEC_HEAP_SIZE );
+ exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE );
if (!exec_mem)
exec_mem = (unsigned char *) mmap(0, EXEC_HEAP_SIZE,
@@ -82,8 +83,8 @@ rtasm_exec_malloc(size_t size)
init_heap();
if (exec_heap) {
- size = (size + 31) & ~31;
- block = mmAllocMem( exec_heap, size, 32, 0 );
+ size = (size + 31) & ~31; /* next multiple of 32 bytes */
+ block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
}
if (block)
@@ -103,17 +104,17 @@ rtasm_exec_free(void *addr)
pipe_mutex_lock(exec_mutex);
if (exec_heap) {
- struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
+ struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
if (block)
- mmFreeMem(block);
+ u_mmFreeMem(block);
}
pipe_mutex_unlock(exec_mutex);
}
-#else
+#else /* PIPE_OS_LINUX */
/*
* Just use regular memory.
@@ -133,4 +134,4 @@ rtasm_exec_free(void *addr)
}
-#endif
+#endif /* PIPE_OS_LINUX */
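
A side note on the renaming above (a sketch, assuming u_mmAllocMem keeps the old mm.c semantics): the third argument is a log2 alignment, which is what the new "5 -> 32-byte alignment" comment calls out.

   struct mem_block *block;
   size = (size + 31) & ~31;                      /* round up to a 32-byte multiple */
   block = u_mmAllocMem(exec_heap, size, 5, 0);   /* 5 == log2(32) -> 32-byte aligned */
   if (block)
      addr = exec_mem + block->ofs;               /* offset into the mmap'd exec pool */
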
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 7dd8263749..6d11263be8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -38,17 +38,18 @@
#include <stdio.h>
#include "util/u_memory.h"
#include "pipe/p_debug.h"
+#include "rtasm_execmem.h"
#include "rtasm_ppc.h"
void
-ppc_init_func(struct ppc_function *p, unsigned max_inst)
+ppc_init_func(struct ppc_function *p)
{
uint i;
- p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
p->num_inst = 0;
- p->max_inst = max_inst;
+ p->max_inst = 100; /* first guess at buffer size */
+ p->store = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
p->reg_used = 0x0;
p->fp_used = 0x0;
p->vec_used = 0x0;
@@ -66,12 +67,19 @@ ppc_release_func(struct ppc_function *p)
{
assert(p->num_inst <= p->max_inst);
if (p->store != NULL) {
- align_free(p->store);
+ rtasm_exec_free(p->store);
}
p->store = NULL;
}
+uint
+ppc_num_instructions(const struct ppc_function *p)
+{
+ return p->num_inst;
+}
+
+
void (*ppc_get_func(struct ppc_function *p))(void)
{
#if 0
@@ -202,6 +210,35 @@ ppc_release_vec_register(struct ppc_function *p, int reg)
}
+/**
+ * Append instruction to instruction buffer. Grow buffer if out of room.
+ */
+static void
+emit_instruction(struct ppc_function *p, uint32_t inst_bits)
+{
+ if (!p->store)
+ return; /* out of memory, drop the instruction */
+
+ if (p->num_inst == p->max_inst) {
+ /* allocate larger buffer */
+ uint32_t *newbuf;
+ p->max_inst *= 2; /* 2x larger */
+ newbuf = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
+ if (newbuf) {
+ memcpy(newbuf, p->store, p->num_inst * PPC_INST_SIZE);
+ }
+ rtasm_exec_free(p->store);
+ p->store = newbuf;
+ if (!p->store) {
+ /* out of memory */
+ p->num_inst = 0;
+ return;
+ }
+ }
+
+ p->store[p->num_inst++] = inst_bits;
+}
+
union vx_inst {
uint32_t bits;
@@ -223,8 +260,7 @@ emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
inst.inst.vA = vA;
inst.inst.vB = vB;
inst.inst.op2 = op2;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -250,8 +286,7 @@ emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
inst.inst.vB = vB;
inst.inst.rC = 0;
inst.inst.op2 = op2;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -277,8 +312,7 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
inst.inst.vB = vB;
inst.inst.vC = vC;
inst.inst.op2 = op2;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -300,8 +334,7 @@ emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
inst.inst.li = li;
inst.inst.aa = aa;
inst.inst.lk = lk;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
}
@@ -330,8 +363,7 @@ emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
inst.inst.bh = bh;
inst.inst.op2 = op2;
inst.inst.lk = lk;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
}
static INLINE void
@@ -373,8 +405,7 @@ emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
inst.inst.rb = rb;
inst.inst.op2 = op2;
inst.inst.unused = 0x0;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
}
@@ -398,8 +429,7 @@ emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
inst.inst.rt = rt;
inst.inst.ra = ra;
inst.inst.si = (unsigned) (si & 0xffff);
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -428,8 +458,7 @@ emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
inst.inst.unused = 0x0;
inst.inst.op2 = op2;
inst.inst.rc = rc;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
};
@@ -458,8 +487,7 @@ emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
inst.inst.oe = oe;
inst.inst.op2 = op2;
inst.inst.rc = rc;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
}
@@ -505,6 +533,13 @@ ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
}
+/** vector float negative mult subtract: vD = vA - vB * vC */
+void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+ emit_va(p, 47, vD, vB, vA, vC); /* note arg order */
+}
+
/** vector float compare greater than */
void
ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index f938d8d759..afb4704c39 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -62,8 +62,9 @@ struct ppc_function
-extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
+extern void ppc_init_func(struct ppc_function *p);
extern void ppc_release_func(struct ppc_function *p);
+extern uint ppc_num_instructions(const struct ppc_function *p);
extern void (*ppc_get_func( struct ppc_function *p ))( void );
extern void ppc_dump_func(const struct ppc_function *p);
@@ -97,10 +98,14 @@ ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
extern void
ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
-/** vector float mult add */
+/** vector float mult add: vD = vA * vB + vC */
extern void
ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+/** vector float negative mult subtract: vD = vA - vB * vC */
+extern void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
/** vector float compare greater than */
extern void
ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
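
A minimal sketch (not part of the patch) of the resulting rtasm_ppc lifecycle: ppc_init_func() no longer takes an instruction limit, the store grows on demand inside emit_instruction(), and the new ppc_num_instructions() reports how much code was emitted.

   struct ppc_function f;

   ppc_init_func(&f);                /* small exec-mem buffer, grown as needed */
   /* ... emit code, e.g. tgsi_emit_ppc(tokens, &f, immediates, TRUE) ... */
   debug_printf("emitted %u PPC instructions\n", ppc_num_instructions(&f));
   {
      void (*code)(void) = ppc_get_func(&f);   /* callers cast to their real signature */
      (void) code;
   }
   ppc_release_func(&f);             /* rtasm_exec_free()s the store */
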
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index dea1aed032..f8568f690b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -185,6 +185,34 @@ reg_name(int reg)
}
+static void
+emit_instruction(struct spe_function *p, uint32_t inst_bits)
+{
+ if (!p->store)
+ return; /* out of memory, drop the instruction */
+
+ if (p->num_inst == p->max_inst) {
+ /* allocate larger buffer */
+ uint32_t *newbuf;
+ p->max_inst *= 2; /* 2x larger */
+ newbuf = align_malloc(p->max_inst * SPE_INST_SIZE, 16);
+ if (newbuf) {
+ memcpy(newbuf, p->store, p->num_inst * SPE_INST_SIZE);
+ }
+ align_free(p->store);
+ p->store = newbuf;
+ if (!p->store) {
+ /* out of memory */
+ p->num_inst = 0;
+ return;
+ }
+ }
+
+ p->store[p->num_inst++] = inst_bits;
+}
+
+
+
static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
unsigned rA, unsigned rB, const char *name)
{
@@ -193,8 +221,7 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rB = rB;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, %s\n",
@@ -212,8 +239,7 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rB = rB;
inst.inst.rA = rA;
inst.inst.rC = rC;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
@@ -230,8 +256,7 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.i7 = imm;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, 0x%x\n",
@@ -249,8 +274,7 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.i8 = imm;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, 0x%x\n",
@@ -268,8 +292,7 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.i10 = imm;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, %s, 0x%x\n",
@@ -295,8 +318,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.op = op;
inst.inst.i16 = imm;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
@@ -311,8 +333,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.op = op;
inst.inst.i18 = imm;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
if (p->print) {
indent(p);
printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
@@ -394,15 +415,19 @@ void _name (struct spe_function *p, int imm) \
/**
* Initialize an spe_function.
- * \param code_size size of instruction buffer to allocate, in bytes.
+ * \param code_size initial size of instruction buffer to allocate, in bytes.
+ * If zero, use a default.
*/
void spe_init_func(struct spe_function *p, unsigned code_size)
{
unsigned int i;
- p->store = align_malloc(code_size, 16);
+ if (!code_size)
+ code_size = 64;
+
p->num_inst = 0;
p->max_inst = code_size / SPE_INST_SIZE;
+ p->store = align_malloc(code_size, 16);
p->set_count = 0;
memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 1a5294eabc..1da04ab7e0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -958,6 +958,10 @@ fetch_src_file_channel(
switch( file ) {
case TGSI_FILE_CONSTANT:
assert(mach->Consts);
+ assert(index->i[0] >= 0);
+ assert(index->i[1] >= 0);
+ assert(index->i[2] >= 0);
+ assert(index->i[3] >= 0);
chan->f[0] = mach->Consts[index->i[0]][swizzle];
chan->f[1] = mach->Consts[index->i[1]][swizzle];
chan->f[2] = mach->Consts[index->i[2]][swizzle];
@@ -1041,12 +1045,16 @@ fetch_source(
if (reg->SrcRegister.Indirect) {
union tgsi_exec_channel index2;
union tgsi_exec_channel indir_index;
+ const uint execmask = mach->ExecMask;
+ uint i;
+ /* which address register (always zero now) */
index2.i[0] =
index2.i[1] =
index2.i[2] =
index2.i[3] = reg->SrcRegisterInd.Index;
+ /* get current value of address register[swizzle] */
swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
fetch_src_file_channel(
mach,
@@ -1055,10 +1063,19 @@ fetch_source(
&index2,
&indir_index );
+ /* add value of address register to the offset */
index.i[0] += indir_index.i[0];
index.i[1] += indir_index.i[1];
index.i[2] += indir_index.i[2];
index.i[3] += indir_index.i[3];
+
+ /* for disabled execution channels, zero-out the index to
+ * avoid using a potential garbage value.
+ */
+ for (i = 0; i < QUAD_SIZE; i++) {
+ if ((execmask & (1 << i)) == 0)
+ index.i[i] = 0;
+ }
}
if( reg->SrcRegister.Dimension ) {
@@ -1087,6 +1104,8 @@ fetch_source(
if (reg->SrcRegisterDim.Indirect) {
union tgsi_exec_channel index2;
union tgsi_exec_channel indir_index;
+ const uint execmask = mach->ExecMask;
+ uint i;
index2.i[0] =
index2.i[1] =
@@ -1105,6 +1124,14 @@ fetch_source(
index.i[1] += indir_index.i[1];
index.i[2] += indir_index.i[2];
index.i[3] += indir_index.i[3];
+
+ /* for disabled execution channels, zero-out the index to
+ * avoid using a potential garbage value.
+ */
+ for (i = 0; i < QUAD_SIZE; i++) {
+ if ((execmask & (1 << i)) == 0)
+ index.i[i] = 0;
+ }
}
}
@@ -2007,7 +2034,21 @@ exec_instruction(
case TGSI_OPCODE_DOT2ADD:
/* TGSI_OPCODE_DP2A */
- assert (0);
+ FETCH( &r[0], 0, CHAN_X );
+ FETCH( &r[1], 1, CHAN_X );
+ micro_mul( &r[0], &r[0], &r[1] );
+
+ FETCH( &r[1], 0, CHAN_Y );
+ FETCH( &r[2], 1, CHAN_Y );
+ micro_mul( &r[1], &r[1], &r[2] );
+ micro_add( &r[0], &r[0], &r[1] );
+
+ FETCH( &r[2], 2, CHAN_X );
+ micro_add( &r[0], &r[0], &r[2] );
+
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( &r[0], 0, chan_index );
+ }
break;
case TGSI_OPCODE_INDEX:
@@ -2436,7 +2477,66 @@ exec_instruction(
break;
case TGSI_OPCODE_NRM:
- assert (0);
+ /* 3-component vector normalize */
+ {
+ union tgsi_exec_channel tmp, dot;
+
+ /* tmp = dp3(src0, src0): */
+ FETCH( &r[0], 0, CHAN_X );
+ micro_mul( &tmp, &r[0], &r[0] );
+
+ FETCH( &r[1], 0, CHAN_Y );
+ micro_mul( &dot, &r[1], &r[1] );
+ micro_add( &tmp, &tmp, &dot );
+
+ FETCH( &r[2], 0, CHAN_Z );
+ micro_mul( &dot, &r[2], &r[2] );
+ micro_add( &tmp, &tmp, &dot );
+
+ /* tmp = 1 / sqrt(tmp) */
+ micro_sqrt( &tmp, &tmp );
+ micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
+
+ /* note: w channel is undefined */
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ /* chan = chan * tmp */
+ micro_mul( &r[chan_index], &tmp, &r[chan_index] );
+ STORE( &r[chan_index], 0, chan_index );
+ }
+ }
+ break;
+
+ case TGSI_OPCODE_NRM4:
+ /* 4-component vector normalize */
+ {
+ union tgsi_exec_channel tmp, dot;
+
+ /* tmp = dp4(src0, src0): */
+ FETCH( &r[0], 0, CHAN_X );
+ micro_mul( &tmp, &r[0], &r[0] );
+
+ FETCH( &r[1], 0, CHAN_Y );
+ micro_mul( &dot, &r[1], &r[1] );
+ micro_add( &tmp, &tmp, &dot );
+
+ FETCH( &r[2], 0, CHAN_Z );
+ micro_mul( &dot, &r[2], &r[2] );
+ micro_add( &tmp, &tmp, &dot );
+
+ FETCH( &r[3], 0, CHAN_W );
+ micro_mul( &dot, &r[3], &r[3] );
+ micro_add( &tmp, &tmp, &dot );
+
+ /* tmp = 1 / sqrt(tmp) */
+ micro_sqrt( &tmp, &tmp );
+ micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
+
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ /* chan = chan * tmp */
+ micro_mul( &r[chan_index], &tmp, &r[chan_index] );
+ STORE( &r[chan_index], 0, chan_index );
+ }
+ }
break;
case TGSI_OPCODE_DIV:
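
For readability, the per-channel scalar equivalent of the new NRM case above (a reference description, not patch content):

   /* NRM: normalize the xyz components; w is left undefined */
   float inv = 1.0f / sqrtf(x*x + y*y + z*z);
   dst_x = x * inv;
   dst_y = y * inv;
   dst_z = z * inv;
   /* NRM4 includes w in the dot product and scales all four channels */
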
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index c4e649e69c..fc40a25e09 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -165,6 +165,10 @@ struct tgsi_exec_labels
#define TGSI_EXEC_TEMP_HALF_I (TGSI_EXEC_NUM_TEMPS + 3)
#define TGSI_EXEC_TEMP_HALF_C 1
+/* execution mask, each value is either 0 or ~0 */
+#define TGSI_EXEC_MASK_I (TGSI_EXEC_NUM_TEMPS + 3)
+#define TGSI_EXEC_MASK_C 2
+
#define TGSI_EXEC_TEMP_R0 (TGSI_EXEC_NUM_TEMPS + 4)
#define TGSI_EXEC_TEMP_ADDR (TGSI_EXEC_NUM_TEMPS + 5)
@@ -265,6 +269,27 @@ void
tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach);
+static INLINE void
+tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
+{
+ mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] =
+ mask;
+}
+
+
+/** Set execution mask values prior to executing the shader */
+static INLINE void
+tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
+ boolean ch0, boolean ch1, boolean ch2, boolean ch3)
+{
+ int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i;
+ mask[0] = ch0 ? ~0 : 0;
+ mask[1] = ch1 ? ~0 : 0;
+ mask[2] = ch2 ? ~0 : 0;
+ mask[3] = ch3 ? ~0 : 0;
+}
+
+
#if defined __cplusplus
} /* extern "C" */
#endif
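
An illustrative caller-side sketch of the new helper (mirroring the draw_vs_exec.c and draw_vs_sse.c hunks earlier in this patch): when the last SoA batch holds fewer than four vertices, the unused execution channels are masked off before running the machine.

   unsigned max_vertices = MIN2(4, count - i);   /* vertices left in this batch */

   tgsi_set_exec_mask(machine,
                      1,
                      max_vertices > 1,
                      max_vertices > 2,
                      max_vertices > 3);
   tgsi_exec_machine_run(machine);
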
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 9ad7ecd7cf..a92b1902e3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -40,6 +40,7 @@
#include "util/u_sse.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
+#include "tgsi_dump.h"
#include "tgsi_exec.h"
#include "tgsi_ppc.h"
#include "rtasm/rtasm_ppc.h"
@@ -72,11 +73,20 @@ const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
#define CHAN_Z 2
#define CHAN_W 3
-#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
-#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
-#define TEMP_R0 TGSI_EXEC_TEMP_R0
-#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+/**
+ * How many TGSI temps should be implemented with real PPC vector registers
+ * rather than memory.
+ */
+#define MAX_PPC_TEMPS 4
+
+
+struct reg_chan_vec
+{
+ struct tgsi_full_src_register src;
+ uint chan;
+ uint vec;
+};
/**
@@ -92,12 +102,105 @@ struct gen_context
int const_reg; /**< GP register pointing to constants buffer */
int builtins_reg; /**< GP register pointing to built-in constants */
+ int offset_reg; /**< used to reduce redundant li instructions */
+ int offset_value;
+
int one_vec; /**< vector register with {1.0, 1.0, 1.0, 1.0} */
int bit31_vec; /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
+
+ /**
+ * Map TGSI temps to PPC vector temps.
+ * We have 32 PPC vector regs. Use 16 of them for storing 4 TGSI temps.
+ * XXX currently only do this for TGSI temps [0..MAX_PPC_TEMPS-1].
+ */
+ int temps_map[MAX_PPC_TEMPS][4];
+
+ /**
+ * Cache of src registers.
+ * This is used to avoid redundant load instructions.
+ */
+ struct {
+ struct tgsi_full_src_register src;
+ uint chan;
+ uint vec;
+ } regs[12]; /* 3 src regs, 4 channels */
+ uint num_regs;
};
/**
+ * Initialize code generation context.
+ */
+static void
+init_gen_context(struct gen_context *gen, struct ppc_function *func)
+{
+ uint i;
+
+ memset(gen, 0, sizeof(*gen));
+ gen->f = func;
+ gen->inputs_reg = ppc_reserve_register(func, 3); /* first function param */
+ gen->outputs_reg = ppc_reserve_register(func, 4); /* second function param */
+ gen->temps_reg = ppc_reserve_register(func, 5); /* ... */
+ gen->immed_reg = ppc_reserve_register(func, 6);
+ gen->const_reg = ppc_reserve_register(func, 7);
+ gen->builtins_reg = ppc_reserve_register(func, 8);
+ gen->one_vec = -1;
+ gen->bit31_vec = -1;
+ gen->offset_reg = -1;
+ gen->offset_value = -9999999;
+ for (i = 0; i < MAX_PPC_TEMPS; i++) {
+ gen->temps_map[i][0] = ppc_allocate_vec_register(gen->f);
+ gen->temps_map[i][1] = ppc_allocate_vec_register(gen->f);
+ gen->temps_map[i][2] = ppc_allocate_vec_register(gen->f);
+ gen->temps_map[i][3] = ppc_allocate_vec_register(gen->f);
+ }
+}
+
+
+/**
+ * All PPC vector load/store instructions form an effective address
+ * by adding the contents of two registers. For example:
+ * lvx v2,r8,r9 # v2 = memory[r8 + r9]
+ * stvx v2,r8,r9 # memory[r8 + r9] = v2;
+ * So our lvx/stvx instructions are typically preceded by an 'li' instruction
+ * to load r9 (above) with an immediate (an offset).
+ * This code emits that 'li' instruction, but only if the offset value is
+ * different than the previous 'li'.
+ * This optimization seems to save about 10% in the instruction count.
+ * Note that we need to unconditionally emit an 'li' inside basic blocks
+ * (such as inside loops).
+ */
+static int
+emit_li_offset(struct gen_context *gen, int offset)
+{
+ if (gen->offset_reg <= 0) {
+ /* allocate a GP register for storing load/store offset */
+ gen->offset_reg = ppc_allocate_register(gen->f);
+ }
+
+ /* emit new 'li' if offset is changing */
+ if (gen->offset_value < 0 || gen->offset_value != offset) {
+ gen->offset_value = offset;
+ ppc_li(gen->f, gen->offset_reg, offset);
+ }
+
+ return gen->offset_reg;
+}
+
+
+/**
+ * Forces subsequent emit_li_offset() calls to emit an 'li'.
+ * To be called at the top of basic blocks.
+ */
+static void
+reset_li_offset(struct gen_context *gen)
+{
+ gen->offset_value = -9999999;
+}
+
+
+
+/**
* Load the given vector register with {value, value, value, value}.
* The value must be in the ppu_builtin_constants[] array.
* We wouldn't need this if there was a simple way to load PPC vector
@@ -109,10 +212,9 @@ load_constant_vec(struct gen_context *gen, int dst_vec, float value)
uint pos;
for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
if (ppc_builtin_constants[pos] == value) {
- int offset_reg = ppc_allocate_register(gen->f);
int offset = pos * 4;
+ int offset_reg = emit_li_offset(gen, offset);
- ppc_li(gen->f, offset_reg, offset);
/* Load 4-byte word into vector register.
* The vector slot depends on the effective address we load from.
* We know that our builtins start at a 16-byte boundary so we
@@ -122,7 +224,6 @@ load_constant_vec(struct gen_context *gen, int dst_vec, float value)
ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
/* splat word[pos % 4] across the vector reg */
ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
- ppc_release_register(gen->f, offset_reg);
return;
}
}
@@ -159,15 +260,15 @@ gen_get_bit31_vec(struct gen_context *gen)
/**
- * Register fetch, put result in 'dst_vec'.
+ * Register fetch. Return PPC vector register with result.
*/
-static void
+static int
emit_fetch(struct gen_context *gen,
- unsigned dst_vec,
const struct tgsi_full_src_register *reg,
const unsigned chan_index)
{
uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+ int dst_vec = -1;
switch (swizzle) {
case TGSI_EXTSWIZZLE_X:
@@ -177,36 +278,46 @@ emit_fetch(struct gen_context *gen,
switch (reg->SrcRegister.File) {
case TGSI_FILE_INPUT:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_TEMPORARY:
- {
- int offset_reg = ppc_allocate_register(gen->f);
+ if (reg->SrcRegister.Index < MAX_PPC_TEMPS) {
+ /* use PPC vec register */
+ dst_vec = gen->temps_map[reg->SrcRegister.Index][swizzle];
+ }
+ else {
+ /* use memory-based temp register "file" */
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_IMMEDIATE:
{
- int offset_reg = ppc_allocate_register(gen->f);
- int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
- ppc_li(gen->f, offset_reg, offset);
- ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
+ int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
+ /* Load 4-byte word into vector register.
+ * The vector slot depends on the effective address we load from.
+ * We know that our immediates start at a 16-byte boundary so we
+ * know that 'swizzle' tells us which vector slot will have the
+ * loaded word. The other vector slots will be undefined.
+ */
+ ppc_lvewx(gen->f, dst_vec, gen->immed_reg, offset_reg);
+ /* splat word[swizzle] across the vector reg */
+ ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
}
break;
case TGSI_FILE_CONSTANT:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
/* Load 4-byte word into vector register.
* The vector slot depends on the effective address we load from.
* We know that our constants start at a 16-byte boundary so we
@@ -216,7 +327,6 @@ emit_fetch(struct gen_context *gen,
ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
/* splat word[swizzle] across the vector reg */
ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
- ppc_release_register(gen->f, offset_reg);
}
break;
default:
@@ -229,6 +339,7 @@ emit_fetch(struct gen_context *gen,
case TGSI_EXTSWIZZLE_ONE:
{
int one_vec = gen_one_vec(gen);
+ dst_vec = ppc_allocate_vec_register(gen->f);
ppc_vmove(gen->f, dst_vec, one_vec);
}
break;
@@ -236,6 +347,8 @@ emit_fetch(struct gen_context *gen,
assert( 0 );
}
+ assert(dst_vec >= 0);
+
{
uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
if (sign_op != TGSI_UTIL_SIGN_KEEP) {
@@ -259,40 +372,148 @@ emit_fetch(struct gen_context *gen,
}
}
}
+
+ return dst_vec;
}
-#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \
- emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+/**
+ * Test if two TGSI src registers refer to the same memory location.
+ * We use this to avoid redundant register loads.
+ */
+static boolean
+equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a,
+ const struct tgsi_full_src_register *b, uint chan_b)
+{
+ int swz_a, swz_b;
+ int sign_a, sign_b;
+ if (a->SrcRegister.File != b->SrcRegister.File)
+ return FALSE;
+ if (a->SrcRegister.Index != b->SrcRegister.Index)
+ return FALSE;
+ swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a);
+ swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b);
+ if (swz_a != swz_b)
+ return FALSE;
+ sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a);
+ sign_b = tgsi_util_get_full_src_register_sign_mode(b, chan_b);
+ if (sign_a != sign_b)
+ return FALSE;
+ return TRUE;
+}
+
+
+/**
+ * Given a TGSI src register and channel index, return the PPC vector
+ * register containing the value. We use a cache to prevent re-loading
+ * the same register multiple times.
+ * \return index of PPC vector register with the desired src operand
+ */
+static int
+get_src_vec(struct gen_context *gen,
+ struct tgsi_full_instruction *inst, int src_reg, uint chan)
+{
+ const struct tgsi_full_src_register *src =
+ &inst->FullSrcRegisters[src_reg];
+ int vec;
+ uint i;
+
+ /* check the cache */
+ for (i = 0; i < gen->num_regs; i++) {
+ if (equal_src_locs(&gen->regs[i].src, gen->regs[i].chan, src, chan)) {
+ /* cache hit */
+ assert(gen->regs[i].vec >= 0);
+ return gen->regs[i].vec;
+ }
+ }
+
+ /* cache miss: allocate new vec reg and emit fetch/load code */
+ vec = emit_fetch(gen, src, chan);
+ gen->regs[gen->num_regs].src = *src;
+ gen->regs[gen->num_regs].chan = chan;
+ gen->regs[gen->num_regs].vec = vec;
+ gen->num_regs++;
+
+ assert(gen->num_regs <= Elements(gen->regs));
+
+ assert(vec >= 0);
+
+ return vec;
+}
+
+
+/**
+ * Clear the src operand cache. To be called at the end of each emit function.
+ */
+static void
+release_src_vecs(struct gen_context *gen)
+{
+ uint i;
+ for (i = 0; i < gen->num_regs; i++) {
+ const struct tgsi_full_src_register src = gen->regs[i].src;
+ if (!(src.SrcRegister.File == TGSI_FILE_TEMPORARY &&
+ src.SrcRegister.Index < MAX_PPC_TEMPS)) {
+ ppc_release_vec_register(gen->f, gen->regs[i].vec);
+ }
+ }
+ gen->num_regs = 0;
+}
+
+
+
+static int
+get_dst_vec(struct gen_context *gen,
+ const struct tgsi_full_instruction *inst,
+ unsigned chan_index)
+{
+ const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
+ if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
+ reg->DstRegister.Index < MAX_PPC_TEMPS) {
+ int vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+ return vec;
+ }
+ else {
+ return ppc_allocate_vec_register(gen->f);
+ }
+}
+
/**
* Register store. Store 'src_vec' at location indicated by 'reg'.
+ * \param free_vec Should the src_vec be released when done?
*/
static void
emit_store(struct gen_context *gen,
- unsigned src_vec,
- const struct tgsi_full_dst_register *reg,
+ int src_vec,
const struct tgsi_full_instruction *inst,
- unsigned chan_index)
+ unsigned chan_index,
+ boolean free_vec)
{
+ const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
switch (reg->DstRegister.File) {
case TGSI_FILE_OUTPUT:
{
- int offset_reg = ppc_allocate_register(gen->f);
int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
case TGSI_FILE_TEMPORARY:
- {
- int offset_reg = ppc_allocate_register(gen->f);
+ if (reg->DstRegister.Index < MAX_PPC_TEMPS) {
+ if (!free_vec) {
+ int dst_vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+ if (dst_vec != src_vec)
+ ppc_vmove(gen->f, dst_vec, src_vec);
+ }
+ free_vec = FALSE;
+ }
+ else {
int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
- ppc_li(gen->f, offset_reg, offset);
+ int offset_reg = emit_li_offset(gen, offset);
ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
- ppc_release_register(gen->f, offset_reg);
}
break;
#if 0
@@ -322,22 +543,20 @@ emit_store(struct gen_context *gen,
break;
}
#endif
-}
-
-
-#define STORE( GEN, INST, XMM, INDEX, CHAN )\
- emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+ if (free_vec)
+ ppc_release_vec_register(gen->f, src_vec);
+}
static void
emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
+ int v0, v1;
uint chan_index;
- FETCH(gen, *inst, v0, 0, CHAN_X);
+ v0 = get_src_vec(gen, inst, 0, CHAN_X);
+ v1 = ppc_allocate_vec_register(gen->f);
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_RSQ:
@@ -353,9 +572,10 @@ emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
}
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
- STORE(gen, *inst, v1, 0, chan_index);
+ emit_store(gen, v1, inst, chan_index, FALSE);
}
- ppc_release_vec_register(gen->f, v0);
+
+ release_src_vecs(gen);
ppc_release_vec_register(gen->f, v1);
}
@@ -363,61 +583,65 @@ emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
static void
emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
uint chan_index;
FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- FETCH(gen, *inst, 0, 0, chan_index); /* v0 = srcreg[0] */
+ int v0 = get_src_vec(gen, inst, 0, chan_index); /* v0 = srcreg[0] */
+ int v1 = get_dst_vec(gen, inst, chan_index);
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_ABS:
/* turn off the most significant bit of each vector float word */
{
- int v1 = ppc_allocate_vec_register(gen->f);
- ppc_vspltisw(gen->f, v1, -1); /* v1 = {-1, -1, -1, -1} */
- ppc_vslw(gen->f, v1, v1, v1); /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
- ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
- ppc_release_vec_register(gen->f, v1);
+ int bit31_vec = gen_get_bit31_vec(gen);
+ ppc_vandc(gen->f, v1, v0, bit31_vec); /* v1 = v0 & ~bit31 */
}
break;
case TGSI_OPCODE_FLOOR:
- ppc_vrfim(gen->f, v0, v0); /* v0 = floor(v0) */
+ ppc_vrfim(gen->f, v1, v0); /* v1 = floor(v0) */
break;
case TGSI_OPCODE_FRAC:
- {
- int v1 = ppc_allocate_vec_register(gen->f);
- ppc_vrfim(gen->f, v1, v0); /* v1 = floor(v0) */
- ppc_vsubfp(gen->f, v0, v0, v1); /* v0 = v0 - v1 */
- ppc_release_vec_register(gen->f, v1);
- }
+ ppc_vrfim(gen->f, v1, v0); /* tmp = floor(v0) */
+ ppc_vsubfp(gen->f, v1, v0, v1); /* v1 = v0 - v1 */
break;
case TGSI_OPCODE_EXPBASE2:
- ppc_vexptefp(gen->f, v0, v0); /* v0 = 2^v0 */
+ ppc_vexptefp(gen->f, v1, v0); /* v1 = 2^v0 */
break;
case TGSI_OPCODE_LOGBASE2:
/* XXX this may be broken! */
- ppc_vlogefp(gen->f, v0, v0); /* v0 = log2(v0) */
+ ppc_vlogefp(gen->f, v1, v0); /* v1 = log2(v0) */
break;
case TGSI_OPCODE_MOV:
- /* nothing */
+ case TGSI_OPCODE_SWZ:
+ if (v0 != v1)
+ ppc_vmove(gen->f, v1, v0);
break;
default:
assert(0);
}
- STORE(gen, *inst, v0, 0, chan_index); /* store v0 */
+ emit_store(gen, v1, inst, chan_index, TRUE); /* store v1 */
}
- ppc_release_vec_register(gen->f, v0);
+
+ release_src_vecs(gen);
}
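For reference, the ABS case above clears the IEEE-754 sign bit with vandc instead of comparing against zero; a minimal scalar sketch of the same trick (plain C illustration, not generated code):

#include <stdint.h>
#include <string.h>

/* Clear the most significant (sign) bit of a float, mirroring the
 * vandc-with-bit31 sequence emitted for TGSI_OPCODE_ABS. */
static float abs_via_sign_bit(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof(bits));  /* type-pun safely */
   bits &= ~(1u << 31);              /* x & ~bit31 */
   memcpy(&x, &bits, sizeof(x));
   return x;
}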
static void
emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
- int v2 = ppc_allocate_vec_register(gen->f);
- uint chan_index;
- FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */
- FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */
+ int zero_vec = -1;
+ uint chan;
+
+ if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) {
+ zero_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vzero(gen->f, zero_vec);
+ }
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ /* fetch src operands */
+ int v0 = get_src_vec(gen, inst, 0, chan);
+ int v1 = get_src_vec(gen, inst, 1, chan);
+ int v2 = get_dst_vec(gen, inst, chan);
+
+ /* emit binop */
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_ADD:
ppc_vaddfp(gen->f, v2, v0, v1);
@@ -426,8 +650,7 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
ppc_vsubfp(gen->f, v2, v0, v1);
break;
case TGSI_OPCODE_MUL:
- ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */
- ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */
+ ppc_vmaddfp(gen->f, v2, v0, v1, zero_vec);
break;
case TGSI_OPCODE_MIN:
ppc_vminfp(gen->f, v2, v0, v1);
@@ -438,11 +661,48 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
default:
assert(0);
}
- STORE(gen, *inst, v2, 0, chan_index); /* store v2 */
+
+ /* store v2 */
+ emit_store(gen, v2, inst, chan, TRUE);
}
- ppc_release_vec_register(gen->f, v0);
- ppc_release_vec_register(gen->f, v1);
- ppc_release_vec_register(gen->f, v2);
+
+ if (inst->Instruction.Opcode == TGSI_OPCODE_MUL)
+ ppc_release_vec_register(gen->f, zero_vec);
+
+ release_src_vecs(gen);
+}
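AltiVec has no plain vector float multiply, so MUL is lowered to vmaddfp with a zero accumulator that is now allocated once per instruction rather than rebuilt for every channel; a scalar model of the per-channel loop (illustrative only):

/* Scalar model of the binop loop: for each write-enabled channel,
 * fetch src0/src1, apply the op, store the result.  MUL is a fused
 * multiply-add with a zero addend, matching vmaddfp(v0, v1, zero). */
static void binop_mul_channels(const float *src0, const float *src1,
                               float *dst, unsigned writemask)
{
   unsigned chan;
   for (chan = 0; chan < 4; chan++) {
      if (writemask & (1u << chan))
         dst[chan] = src0[chan] * src1[chan] + 0.0f;
   }
}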
+
+
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ uint chan;
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ /* fetch src operands */
+ int v0 = get_src_vec(gen, inst, 0, chan);
+ int v1 = get_src_vec(gen, inst, 1, chan);
+ int v2 = get_src_vec(gen, inst, 2, chan);
+ int v3 = get_dst_vec(gen, inst, chan);
+
+ /* emit ALU */
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_MAD:
+ ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */
+ break;
+ case TGSI_OPCODE_LRP:
+ ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */
+ ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */
+ break;
+ default:
+ assert(0);
+ }
+
+ /* store v3 */
+ emit_store(gen, v3, inst, chan, TRUE);
+ }
+
+ release_src_vecs(gen);
}
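The LRP case above needs only two instructions because lerp(a, b, c) = a*b + (1-a)*c is rewritten as a*(b-c) + c; a scalar reference for both tri-operand opcodes (illustration, not emitted code):

/* MAD: dst = src0 * src1 + src2 (a single vmaddfp). */
static float mad_ref(float a, float b, float c)
{
   return a * b + c;
}

/* LRP: dst = src0*src1 + (1-src0)*src2, emitted as vsubfp + vmaddfp
 * using the algebraically equivalent form src0*(src1-src2) + src2. */
static float lrp_ref(float a, float b, float c)
{
   float t = b - c;   /* vsubfp  */
   return a * t + c;  /* vmaddfp */
}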
@@ -452,16 +712,15 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
static void
emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
- int v2 = ppc_allocate_vec_register(gen->f);
- uint chan_index;
- boolean complement = FALSE;
+ uint chan;
int one_vec = gen_one_vec(gen);
- FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */
- FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ /* fetch src operands */
+ int v0 = get_src_vec(gen, inst, 0, chan);
+ int v1 = get_src_vec(gen, inst, 1, chan);
+ int v2 = get_dst_vec(gen, inst, chan);
+ boolean complement = FALSE;
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_SNE:
@@ -495,89 +754,56 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
else
ppc_vand(gen->f, v2, one_vec, v2); /* v2 = one_vec & v2 */
- STORE(gen, *inst, v2, 0, chan_index); /* store v2 */
+ /* store v2 */
+ emit_store(gen, v2, inst, chan, TRUE);
}
- ppc_release_vec_register(gen->f, v0);
- ppc_release_vec_register(gen->f, v1);
- ppc_release_vec_register(gen->f, v2);
+ release_src_vecs(gen);
}
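The inequality opcodes build a 0.0/1.0 result by AND-ing an all-ones/all-zeros compare mask with the preloaded one_vec (a vector of 1.0f); a scalar model of SNE under that scheme (sketch only):

#include <stdint.h>
#include <string.h>

/* SNE modeled the way the generated code works: form an all-ones or
 * all-zeros mask from the compare, then AND it with the bits of 1.0f. */
static float sne_ref(float a, float b)
{
   uint32_t mask = (a != b) ? 0xffffffffu : 0u;   /* compare result */
   uint32_t one_bits;
   float one = 1.0f, result;
   memcpy(&one_bits, &one, sizeof(one_bits));
   one_bits &= mask;                              /* one_vec & mask */
   memcpy(&result, &one_bits, sizeof(result));
   return result;                                 /* 1.0f or 0.0f   */
}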
static void
emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
- int v2 = ppc_allocate_vec_register(gen->f);
+ int v0, v1, v2;
uint chan_index;
+ v2 = ppc_allocate_vec_register(gen->f);
+
ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */
- FETCH(gen, *inst, v0, 0, CHAN_X); /* v0 = src0.XXXX */
- FETCH(gen, *inst, v1, 1, CHAN_X); /* v1 = src1.XXXX */
+ v0 = get_src_vec(gen, inst, 0, CHAN_X); /* v0 = src0.XXXX */
+ v1 = get_src_vec(gen, inst, 1, CHAN_X); /* v1 = src1.XXXX */
ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
- FETCH(gen, *inst, v0, 0, CHAN_Y); /* v0 = src0.YYYY */
- FETCH(gen, *inst, v1, 1, CHAN_Y); /* v1 = src1.YYYY */
+ v0 = get_src_vec(gen, inst, 0, CHAN_Y); /* v0 = src0.YYYY */
+ v1 = get_src_vec(gen, inst, 1, CHAN_Y); /* v1 = src1.YYYY */
ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
- FETCH(gen, *inst, v0, 0, CHAN_Z); /* v0 = src0.ZZZZ */
- FETCH(gen, *inst, v1, 1, CHAN_Z); /* v1 = src1.ZZZZ */
+ v0 = get_src_vec(gen, inst, 0, CHAN_Z); /* v0 = src0.ZZZZ */
+ v1 = get_src_vec(gen, inst, 1, CHAN_Z); /* v1 = src1.ZZZZ */
ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
- FETCH(gen, *inst, v0, 0, CHAN_W); /* v0 = src0.WWWW */
- FETCH(gen, *inst, v1, 1, CHAN_W); /* v1 = src1.WWWW */
- ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+ v0 = get_src_vec(gen, inst, 0, CHAN_W); /* v0 = src0.WWWW */
+ v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+ ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
}
else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
- FETCH(gen, *inst, v1, 1, CHAN_W); /* v1 = src1.WWWW */
- ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */
+ v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+ ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */
}
FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- STORE(gen, *inst, v2, 0, chan_index); /* store v2 */
+ emit_store(gen, v2, inst, chan_index, FALSE); /* store v2, free v2 later */
}
- ppc_release_vec_register(gen->f, v0);
- ppc_release_vec_register(gen->f, v1);
- ppc_release_vec_register(gen->f, v2);
-}
+ release_src_vecs(gen);
-static void
-emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
- int v0 = ppc_allocate_vec_register(gen->f);
- int v1 = ppc_allocate_vec_register(gen->f);
- int v2 = ppc_allocate_vec_register(gen->f);
- int v3 = ppc_allocate_vec_register(gen->f);
- uint chan_index;
- FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
- FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */
- FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */
- FETCH(gen, *inst, v2, 2, chan_index); /* v2 = srcreg[2] */
- switch (inst->Instruction.Opcode) {
- case TGSI_OPCODE_MAD:
- ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */
- break;
- case TGSI_OPCODE_LRP:
- ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */
- ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */
- break;
- default:
- assert(0);
- }
- STORE(gen, *inst, v3, 0, chan_index); /* store v3 */
- }
- ppc_release_vec_register(gen->f, v0);
- ppc_release_vec_register(gen->f, v1);
ppc_release_vec_register(gen->f, v2);
- ppc_release_vec_register(gen->f, v3);
}
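Because the data is SoA, the dot product accumulates one component at a time with vmaddfp into a running sum; a scalar reference for the three variants handled here (illustrative only):

/* DP3/DP4/DPH accumulation order as emitted above: start from zero,
 * then multiply-accumulate X, Y, Z, and either W*W (DP4) or +src1.w (DPH).
 * The result is broadcast to every write-enabled channel. */
static float dp_ref(const float src0[4], const float src1[4],
                    int is_dp4, int is_dph)
{
   float sum = 0.0f;                /* vxor v2,v2,v2 */
   sum += src0[0] * src1[0];        /* vmaddfp, X    */
   sum += src0[1] * src1[1];        /* vmaddfp, Y    */
   sum += src0[2] * src1[2];        /* vmaddfp, Z    */
   if (is_dp4)
      sum += src0[3] * src1[3];     /* vmaddfp, W    */
   else if (is_dph)
      sum += src1[3];               /* vaddfp        */
   return sum;
}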
-
/** Approximation for vr = pow(va, vb) */
static void
ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
@@ -604,43 +830,42 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
/* Compute X */
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
- STORE(gen, *inst, one_vec, 0, CHAN_X);
+ emit_store(gen, one_vec, inst, CHAN_X, FALSE);
}
/* Compute Y, Z */
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
- int x_vec = ppc_allocate_vec_register(gen->f);
+ int x_vec;
int zero_vec = ppc_allocate_vec_register(gen->f);
- FETCH(gen, *inst, x_vec, 0, CHAN_X); /* x_vec = src[0].x */
+ x_vec = get_src_vec(gen, inst, 0, CHAN_X); /* x_vec = src[0].x */
ppc_vzero(gen->f, zero_vec); /* zero = {0,0,0,0} */
ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
- STORE(gen, *inst, x_vec, 0, CHAN_Y); /* store Y */
+ emit_store(gen, x_vec, inst, CHAN_Y, FALSE);
}
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
- int y_vec = ppc_allocate_vec_register(gen->f);
+ int y_vec, w_vec;
int z_vec = ppc_allocate_vec_register(gen->f);
- int w_vec = ppc_allocate_vec_register(gen->f);
int pow_vec = ppc_allocate_vec_register(gen->f);
int pos_vec = ppc_allocate_vec_register(gen->f);
int p128_vec = ppc_allocate_vec_register(gen->f);
int n128_vec = ppc_allocate_vec_register(gen->f);
- FETCH(gen, *inst, y_vec, 0, CHAN_Y); /* y_vec = src[0].y */
+ y_vec = get_src_vec(gen, inst, 0, CHAN_Y); /* y_vec = src[0].y */
ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
- FETCH(gen, *inst, w_vec, 0, CHAN_W); /* w_vec = src[0].w */
+ w_vec = get_src_vec(gen, inst, 0, CHAN_W); /* w_vec = src[0].w */
- /* clamp Y to [-128, 128] */
+ /* clamp W to [-128, 128] */
load_constant_vec(gen, p128_vec, 128.0f);
load_constant_vec(gen, n128_vec, -128.0f);
- ppc_vmaxfp(gen->f, y_vec, y_vec, n128_vec); /* y = max(y, -128) */
- ppc_vminfp(gen->f, y_vec, y_vec, p128_vec); /* y = min(y, 128) */
+ ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */
+ ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */
/* if temp.x > 0
* z = pow(tmp.y, tmp.w)
@@ -651,34 +876,216 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
ppc_vand(gen->f, z_vec, pow_vec, pos_vec); /* z = pow & pos */
- STORE(gen, *inst, z_vec, 0, CHAN_Z); /* store Z */
+ emit_store(gen, z_vec, inst, CHAN_Z, FALSE);
- ppc_release_vec_register(gen->f, y_vec);
ppc_release_vec_register(gen->f, z_vec);
- ppc_release_vec_register(gen->f, w_vec);
ppc_release_vec_register(gen->f, pow_vec);
ppc_release_vec_register(gen->f, pos_vec);
ppc_release_vec_register(gen->f, p128_vec);
ppc_release_vec_register(gen->f, n128_vec);
}
- ppc_release_vec_register(gen->f, x_vec);
ppc_release_vec_register(gen->f, zero_vec);
}
/* Compute W */
if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
- STORE(gen, *inst, one_vec, 0, CHAN_W);
+ emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+ }
+
+ release_src_vecs(gen);
+}
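A scalar summary of the LIT semantics being generated above, with the specular exponent clamped to [-128, 128] and the pow result masked by the x > 0 test; note the real code uses the ppc_vec_pow approximation rather than libm (plain C reference):

#include <math.h>

/* Reference for TGSI LIT; the generated code does this per channel on
 * SoA vectors. */
static void lit_ref(const float src[4], float dst[4])
{
   const float x = src[0] > 0.0f ? src[0] : 0.0f;
   const float y = src[1] > 0.0f ? src[1] : 0.0f;
   float w = src[3];
   if (w < -128.0f) w = -128.0f;   /* clamp W to [-128, 128] */
   if (w >  128.0f) w =  128.0f;
   dst[0] = 1.0f;
   dst[1] = x;
   dst[2] = (src[0] > 0.0f) ? powf(y, w) : 0.0f;  /* pow masked by (x > 0) */
   dst[3] = 1.0f;
}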
+
+
+static void
+emit_exp(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ const int one_vec = gen_one_vec(gen);
+ int src_vec;
+
+ /* get src arg */
+ src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+ /* Compute X = 2^floor(src) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_X);
+ int tmp_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */
+ ppc_vexptefp(gen->f, dst_vec, tmp_vec); /* dst = 2 ^ tmp */
+ emit_store(gen, dst_vec, inst, CHAN_X, TRUE);
+ ppc_release_vec_register(gen->f, tmp_vec);
+ }
+
+ /* Compute Y = src - floor(src) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_Y);
+ int tmp_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */
+ ppc_vsubfp(gen->f, dst_vec, src_vec, tmp_vec); /* dst = src - tmp */
+ emit_store(gen, dst_vec, inst, CHAN_Y, TRUE);
+ ppc_release_vec_register(gen->f, tmp_vec);
+ }
+
+ /* Compute Z = RoughApprox2ToX(src) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+ ppc_vexptefp(gen->f, dst_vec, src_vec); /* dst = 2 ^ src */
+ emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+ }
+
+ /* Compute W = 1.0 */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+ }
+
+ release_src_vecs(gen);
+}
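A scalar reference for the EXP semantics emitted above; the Z channel in the generated code comes from vexptefp, which is only an estimate of 2^x (illustration, not emitted code):

#include <math.h>

/* Reference for TGSI EXP. */
static void exp_ref(float src, float dst[4])
{
   const float flr = floorf(src);
   dst[0] = exp2f(flr);    /* X = 2^floor(src)     */
   dst[1] = src - flr;     /* Y = src - floor(src) */
   dst[2] = exp2f(src);    /* Z = rough 2^src      */
   dst[3] = 1.0f;          /* W = 1                */
}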
+
+
+static void
+emit_log(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ const int bit31_vec = gen_get_bit31_vec(gen);
+ const int one_vec = gen_one_vec(gen);
+ int src_vec, abs_vec;
+
+ /* get src arg */
+ src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+ /* compute abs(src) */
+ abs_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vandc(gen->f, abs_vec, src_vec, bit31_vec); /* abs = src & ~bit31 */
+
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) &&
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+
+ /* compute tmp = floor(log2(abs)) */
+ int tmp_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vlogefp(gen->f, tmp_vec, abs_vec); /* tmp = log2(abs) */
+ ppc_vrfim(gen->f, tmp_vec, tmp_vec); /* tmp = floor(tmp); */
+
+ /* Compute X = tmp */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+ }
+
+ /* Compute Y = abs / 2^tmp */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ const int zero_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vzero(gen->f, zero_vec);
+ ppc_vexptefp(gen->f, tmp_vec, tmp_vec); /* tmp = 2 ^ tmp */
+ ppc_vrefp(gen->f, tmp_vec, tmp_vec); /* tmp = 1 / tmp */
+ /* tmp = abs * tmp + zero */
+ ppc_vmaddfp(gen->f, tmp_vec, abs_vec, tmp_vec, zero_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+ ppc_release_vec_register(gen->f, zero_vec);
+ }
+
+ ppc_release_vec_register(gen->f, tmp_vec);
+ }
+
+ /* Compute Z = RoughApproxLog2(abs) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+ ppc_vlogefp(gen->f, dst_vec, abs_vec); /* dst = log2(abs) */
+ emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+ }
+
+ /* Compute W = 1.0 */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ emit_store(gen, one_vec, inst, CHAN_W, FALSE);
}
+
+ ppc_release_vec_register(gen->f, abs_vec);
+ release_src_vecs(gen);
+}
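Likewise for LOG, where the generated code relies on the vlogefp/vexptefp/vrefp estimates instead of exact libm calls (plain C reference, illustrative only):

#include <math.h>

/* Reference for TGSI LOG. */
static void log_ref(float src, float dst[4])
{
   const float a = fabsf(src);         /* abs = src & ~bit31       */
   const float e = floorf(log2f(a));   /* tmp = floor(log2(abs))   */
   dst[0] = e;                         /* X = exponent             */
   dst[1] = a / exp2f(e);              /* Y = abs / 2^floor(log2)  */
   dst[2] = log2f(a);                  /* Z = rough log2(abs)      */
   dst[3] = 1.0f;                      /* W = 1                    */
}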
+
+
+static void
+emit_pow(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int s0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+ int s1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+ int pow_vec = ppc_allocate_vec_register(gen->f);
+ int chan;
+
+ ppc_vec_pow(gen->f, pow_vec, s0_vec, s1_vec);
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ emit_store(gen, pow_vec, inst, chan, FALSE);
+ }
+
+ ppc_release_vec_register(gen->f, pow_vec);
+
+ release_src_vecs(gen);
}
+static void
+emit_xpd(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int x0_vec, y0_vec, z0_vec;
+ int x1_vec, y1_vec, z1_vec;
+ int zero_vec, tmp_vec;
+
+ zero_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vzero(gen->f, zero_vec);
+
+ tmp_vec = ppc_allocate_vec_register(gen->f);
+
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ x0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+ x1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+ }
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ y0_vec = get_src_vec(gen, inst, 0, CHAN_Y);
+ y1_vec = get_src_vec(gen, inst, 1, CHAN_Y);
+ }
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ z0_vec = get_src_vec(gen, inst, 0, CHAN_Z);
+ z1_vec = get_src_vec(gen, inst, 1, CHAN_Z);
+ }
+
+ IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) {
+ /* tmp = y0 * z1 */
+ ppc_vmaddfp(gen->f, tmp_vec, y0_vec, z1_vec, zero_vec);
+ /* tmp = tmp - z0 * y1*/
+ ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, z0_vec, y1_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+ }
+ IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) {
+ /* tmp = z0 * x1 */
+ ppc_vmaddfp(gen->f, tmp_vec, z0_vec, x1_vec, zero_vec);
+ /* tmp = tmp - x0 * z1 */
+ ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, x0_vec, z1_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+ }
+ IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) {
+ /* tmp = x0 * y1 */
+ ppc_vmaddfp(gen->f, tmp_vec, x0_vec, y1_vec, zero_vec);
+ /* tmp = tmp - y0 * x1 */
+ ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, y0_vec, x1_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_Z, FALSE);
+ }
+ /* W is undefined */
+
+ ppc_release_vec_register(gen->f, tmp_vec);
+ ppc_release_vec_register(gen->f, zero_vec);
+ release_src_vecs(gen);
+}
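The three channels above each map to a vmaddfp followed by a vnmsubfp; a scalar reference for the cross product being emitted (illustrative only):

/* Reference for TGSI XPD (cross product). */
static void xpd_ref(const float a[3], const float b[3], float dst[4])
{
   dst[0] = a[1] * b[2] - a[2] * b[1];   /* X = y0*z1 - z0*y1 */
   dst[1] = a[2] * b[0] - a[0] * b[2];   /* Y = z0*x1 - x0*z1 */
   dst[2] = a[0] * b[1] - a[1] * b[0];   /* Z = x0*y1 - y0*x1 */
   /* W is left undefined, matching the generated code. */
}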
+
static int
emit_instruction(struct gen_context *gen,
struct tgsi_full_instruction *inst)
{
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_SWZ:
case TGSI_OPCODE_ABS:
case TGSI_OPCODE_FLOOR:
case TGSI_OPCODE_FRAC:
@@ -717,17 +1124,28 @@ emit_instruction(struct gen_context *gen,
case TGSI_OPCODE_LIT:
emit_lit(gen, inst);
break;
+ case TGSI_OPCODE_LOG:
+ emit_log(gen, inst);
+ break;
+ case TGSI_OPCODE_EXP:
+ emit_exp(gen, inst);
+ break;
+ case TGSI_OPCODE_POW:
+ emit_pow(gen, inst);
+ break;
+ case TGSI_OPCODE_XPD:
+ emit_xpd(gen, inst);
+ break;
case TGSI_OPCODE_END:
/* normal end */
return 1;
default:
return 0;
}
-
-
return 1;
}
+
static void
emit_declaration(
struct ppc_function *func,
@@ -805,6 +1223,7 @@ emit_epilogue(struct ppc_function *func)
{
ppc_return(func);
/* XXX restore prev stack frame */
+ debug_printf("PPC: Emitted %u instructions\n", func->num_inst);
}
@@ -837,17 +1256,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
if (!use_ppc_asm)
return FALSE;
+ if (0) {
+ debug_printf("\n********* TGSI->PPC ********\n");
+ tgsi_dump(tokens, 0);
+ }
+
util_init_math();
- gen.f = func;
- gen.inputs_reg = ppc_reserve_register(func, 3); /* first function param */
- gen.outputs_reg = ppc_reserve_register(func, 4); /* second function param */
- gen.temps_reg = ppc_reserve_register(func, 5); /* ... */
- gen.immed_reg = ppc_reserve_register(func, 6);
- gen.const_reg = ppc_reserve_register(func, 7);
- gen.builtins_reg = ppc_reserve_register(func, 8);
- gen.one_vec = -1;
- gen.bit31_vec = -1;
+ init_gen_context(&gen, func);
emit_prologue(func);
@@ -878,19 +1294,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
/* splat each immediate component into a float[4] vector for SoA */
{
const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
- float *imm = (float *) immediates;
uint i;
assert(size <= 4);
assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
for (i = 0; i < size; i++) {
- const float value =
- parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
- imm[num_immediates * 4 + 0] =
- imm[num_immediates * 4 + 1] =
- imm[num_immediates * 4 + 2] =
- imm[num_immediates * 4 + 3] = value;
- num_immediates++;
+ immediates[num_immediates][i] =
+ parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
}
+ num_immediates++;
}
break;
@@ -904,6 +1315,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
tgsi_parse_free( &parse );
+ if (ppc_num_instructions(func) == 0) {
+ /* ran out of memory for instructions */
+ ok = FALSE;
+ }
+
+ if (!ok)
+ debug_printf("TGSI->PPC translation failed\n");
+
return ok;
}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 11659247c0..bc7b941b78 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -153,17 +153,21 @@ check_register_usage(
if (!check_file_name( ctx, file ))
return FALSE;
- if (index < 0 || index > MAX_REGISTERS) {
- report_error( ctx, "%s[%i]: Invalid index %s", file_names[file], index, name );
- return FALSE;
- }
-
if (indirect_access) {
+ /* Note that 'index' is an offset relative to the value of the
+ * address register. No range checking done here.
+ */
if (!is_any_register_declared( ctx, file ))
report_error( ctx, "%s: Undeclared %s register", file_names[file], name );
ctx->regs_ind_used[file] = TRUE;
}
else {
+ if (index < 0 || index > MAX_REGISTERS) {
+ report_error( ctx, "%s[%i]: Invalid index %s",
+ file_names[file], index, name );
+ return FALSE;
+ }
+
if (!is_register_declared( ctx, file, index ))
report_error( ctx, "%s[%d]: Undeclared %s register", file_names[file], index, name );
ctx->regs_used[file][index / BITS_IN_REG_FLAG] |= (1 << (index % BITS_IN_REG_FLAG));
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index f79170b9d6..f93db18114 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -27,12 +27,14 @@
#include "pipe/p_config.h"
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
#include "pipe/p_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
+#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
+#endif
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
@@ -72,6 +74,9 @@
#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
+#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
+
/**
* X86 utility functions.
@@ -233,6 +238,9 @@ emit_const(
int indirectIndex )
{
if (indirect) {
+ /* 'vec' is the offset from the address register's value.
+ * We're loading CONST[ADDR+vec] into an xmm register.
+ */
struct x86_reg r0 = get_input_base();
struct x86_reg r1 = get_output_base();
uint i;
@@ -243,18 +251,40 @@ emit_const(
x86_push( func, r0 );
x86_push( func, r1 );
+ /*
+ * Loop over the four pixels or vertices in the quad.
+ * Get the value of the address (offset) register for pixel/vertex[i],
+ * add it to the src offset and index into the constant buffer.
+ * Note that we're working on SOA data.
+ * If any of the pixel/vertex execution channels are unused their
+ * values will be garbage. It's very important that we don't use
+ * those garbage values as indexes into the constant buffer since
+ * that'll cause segfaults.
+ * The solution is to bitwise-AND the offset with the execution mask
+ * register whose values are either 0 or ~0.
+ * The caller must setup the execution mask register to indicate
+ * which channels are valid/alive before running the shader.
+ * The execution mask will also figure into loops and conditionals
+ * someday.
+ */
for (i = 0; i < QUAD_SIZE; i++) {
- x86_lea( func, r0, get_const( vec, chan ) );
+ /* r1 = address register[i] */
x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
+ /* r0 = execution mask[i] */
+ x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
+ /* r1 = r1 & r0 */
+ x86_and( func, r1, r0 );
+ /* r0 = 'vec', the offset */
+ x86_lea( func, r0, get_const( vec, chan ) );
- /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
+ /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
*/
x86_add( func, r1, r1 );
x86_add( func, r1, r1 );
x86_add( func, r1, r1 );
x86_add( func, r1, r1 );
- x86_add( func, r0, r1 );
+ x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
x86_mov( func, r1, x86_deref( r0 ) );
x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
}
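A scalar model of the indirect path above: for each of the four pixels/vertices, the address-register value is AND-ed with the execution mask so dead channels index constant 'vec' instead of garbage. The names below (addr, exec_mask, constants) stand in for the real SoA temporaries and are illustrative only:

#define QUAD_SIZE 4

/* Illustrative model of the CONST[ADDR + vec] fetch with the
 * execution-mask guard; exec_mask entries are 0 or ~0. */
static void fetch_indirect_const(const int addr[QUAD_SIZE],
                                 const unsigned exec_mask[QUAD_SIZE],
                                 const float (*constants)[4],
                                 int vec, int chan,
                                 float out[QUAD_SIZE])
{
   int i;
   for (i = 0; i < QUAD_SIZE; i++) {
      int index = (int) (addr[i] & exec_mask[i]) + vec;  /* dead lane -> vec */
      out[i] = constants[index][chan];
   }
}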
@@ -268,6 +298,7 @@ emit_const(
get_temp( TEMP_R0, CHAN_X ) );
}
else {
+ /* 'vec' is the index into the src register file, such as TEMP[vec] */
assert( vec >= 0 );
sse_movss(
@@ -598,6 +629,9 @@ emit_func_call_dst_src(
code );
}
+
+#if defined(PIPE_ARCH_SSE)
+
/*
* Fast SSE2 implementation of special math functions.
*/
@@ -649,6 +683,7 @@ exp2f4(__m128 x)
return _mm_mul_ps(expipart, expfpart);
}
+
/**
* See http://www.devmaster.net/forums/showthread.php?p=43580
*/
@@ -691,12 +726,16 @@ log2f4(__m128 x)
return _mm_add_ps(logmant, exp);
}
+
static INLINE __m128
powf4(__m128 x, __m128 y)
{
return exp2f4(_mm_mul_ps(log2f4(x), y));
}
+#endif /* PIPE_ARCH_SSE */
+
+
/**
* Low-level instruction translators.
@@ -751,13 +790,20 @@ emit_cos(
}
static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
float *store )
{
+#if defined(PIPE_ARCH_SSE)
_mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+#else
+ store[0] = util_fast_exp2( store[0] );
+ store[1] = util_fast_exp2( store[1] );
+ store[2] = util_fast_exp2( store[2] );
+ store[3] = util_fast_exp2( store[3] );
+#endif
}
static void
@@ -784,6 +830,17 @@ emit_f2it(
make_xmm( xmm ) );
}
+static void
+emit_i2f(
+ struct x86_function *func,
+ unsigned xmm )
+{
+ sse2_cvtdq2ps(
+ func,
+ make_xmm( xmm ),
+ make_xmm( xmm ) );
+}
+
static void PIPE_CDECL
flr4f(
float *store )
@@ -831,13 +888,20 @@ emit_frc(
}
static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
float *store )
{
+#if defined(PIPE_ARCH_SSE)
_mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+#else
+ store[0] = util_fast_log2( store[0] );
+ store[1] = util_fast_log2( store[1] );
+ store[2] = util_fast_log2( store[2] );
+ store[3] = util_fast_log2( store[3] );
+#endif
}
static void
@@ -890,19 +954,19 @@ emit_neg(
}
static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
float *store )
{
-#if 1
+#if defined(PIPE_ARCH_SSE)
_mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
- store[0] = powf( store[0], store[4] );
- store[1] = powf( store[1], store[5] );
- store[2] = powf( store[2], store[6] );
- store[3] = powf( store[3], store[7] );
+ store[0] = util_fast_pow( store[0], store[4] );
+ store[1] = util_fast_pow( store[1], store[5] );
+ store[2] = util_fast_pow( store[2], store[6] );
+ store[3] = util_fast_pow( store[3], store[7] );
#endif
}
@@ -1702,7 +1766,18 @@ emit_instruction(
case TGSI_OPCODE_DOT2ADD:
/* TGSI_OPCODE_DP2A */
- return 0;
+ FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
+ FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
+ emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
+ FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
+ FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
+ emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
+ }
break;
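The new DOT2ADD (DP2A) sequence computes a two-component dot product plus the X component of the third operand, replicated to every write-enabled channel; scalar reference (illustration only):

/* DP2A: dst = src0.x*src1.x + src0.y*src1.y + src2.x,
 * matching the FETCH/mul/add sequence above. */
static float dp2a_ref(const float src0[4], const float src1[4],
                      const float src2[4])
{
   return src0[0] * src1[0] + src0[1] * src1[1] + src2[0];
}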
case TGSI_OPCODE_INDEX:
@@ -2036,7 +2111,39 @@ emit_instruction(
break;
case TGSI_OPCODE_NRM:
- return 0;
+ /* fall-through */
+ case TGSI_OPCODE_NRM4:
+ /* 3 or 4-component normalization */
+ {
+ uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+ /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
+ FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
+ FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
+ FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
+ if (dims == 4) {
+ FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
+ }
+ emit_MOV( func, 0, 4 ); /* xmm0 = xmm4 */
+ emit_mul( func, 0, 4 ); /* xmm0 *= xmm4 */
+ emit_MOV( func, 1, 5 ); /* xmm1 = xmm5 */
+ emit_mul( func, 1, 5 ); /* xmm1 *= xmm5 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ emit_MOV( func, 1, 6 ); /* xmm1 = xmm6 */
+ emit_mul( func, 1, 6 ); /* xmm1 *= xmm6 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ if (dims == 4) {
+ emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */
+ emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ }
+ emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ if (chan_index < dims) {
+ emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
+ STORE( func, *inst, 4+chan_index, 0, chan_index );
+ }
+ }
+ }
break;
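The NRM/NRM4 case squares and sums three or four components, takes a reciprocal square root, and scales each stored channel; a scalar reference using exact math, whereas the generated code uses the SSE reciprocal-square-root estimate (illustrative only):

#include <math.h>

/* Reference for NRM (3-component) / NRM4 (4-component) normalization. */
static void nrm_ref(const float src[4], float dst[4], unsigned dims)
{
   float sum = 0.0f;
   unsigned i;
   for (i = 0; i < dims; i++)
      sum += src[i] * src[i];
   {
      const float inv_len = 1.0f / sqrtf(sum);
      for (i = 0; i < dims; i++)
         dst[i] = src[i] * inv_len;
   }
}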
case TGSI_OPCODE_DIV:
@@ -2044,7 +2151,16 @@ emit_instruction(
break;
case TGSI_OPCODE_DP2:
- return 0;
+ FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
+ FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
+ emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
+ FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
+ FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
+ emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
+ }
break;
case TGSI_OPCODE_TXL:
@@ -2104,7 +2220,12 @@ emit_instruction(
break;
case TGSI_OPCODE_TRUNC:
- return 0;
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ FETCH( func, *inst, 0, 0, chan_index );
+ emit_f2it( func, 0 );
+ emit_i2f( func, 0 );
+ STORE( func, *inst, 0, 0, chan_index );
+ }
break;
case TGSI_OPCODE_SHL:
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index 3ed8bdfdf3..a1a51d7ef2 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -36,6 +36,13 @@
#include <windows.h>
#include <winddi.h>
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <windows.h>
+#include <types.h>
+
#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
#ifndef WIN32_LEAN_AND_MEAN
@@ -98,7 +105,35 @@ void _debug_vprintf(const char *format, va_list ap)
OutputDebugStringA(buf);
buf[0] = '\0';
}
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+ wchar_t *wide_format;
+ long wide_str_len;
+ char buf[512];
+ int ret;
+#if (_WIN32_WCE < 600)
+ ret = vsprintf(buf, format, ap);
+ if(ret < 0){
+ sprintf(buf, "Cant handle debug print!");
+ ret = 25;
+ }
+#else
+ ret = vsprintf_s(buf, 512, format, ap);
+ if(ret < 0){
+ sprintf_s(buf, 512, "Can't handle debug print!");
+ ret = 25;
+ }
+#endif
+ buf[ret] = '\0';
+ /* Format is ascii - needs to be converted to wchar_t for printing */
+ wide_str_len = MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, NULL, 0);
+ wide_format = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t));
+ if (wide_format) {
+ MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1,
+ wide_format, wide_str_len);
+ NKDbgPrintfW(wide_format, wide_format);
+ free(wide_format);
+ }
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
/* TODO */
#else /* !PIPE_SUBSYSTEM_WINDOWS */
vfprintf(stderr, format, ap);
@@ -637,6 +672,7 @@ void
debug_dump_surface_bmp(const char *filename,
struct pipe_surface *surface)
{
+#ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT
struct util_stream *stream;
unsigned surface_usage;
struct bmp_file_header bmfh;
@@ -703,6 +739,7 @@ error2:
FREE(rgba);
error1:
;
+#endif
}
#endif
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index be7303e550..d2eaa2e7f7 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -68,7 +68,7 @@ __inline double ceil(double val)
return ceil_val;
}
-#ifndef PIPE_SUBSYSTEM_WINDOWS_CE
+#ifndef PIPE_SUBSYSTEM_WINDOWS_CE_OGL
__inline double floor(double val)
{
double floor_val;
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 0f51dd5977..45ce257b5e 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -31,7 +31,7 @@
void
-mmDumpMemInfo(const struct mem_block *heap)
+u_mmDumpMemInfo(const struct mem_block *heap)
{
debug_printf("Memory heap %p:\n", (void *)heap);
if (heap == 0) {
@@ -58,7 +58,7 @@ mmDumpMemInfo(const struct mem_block *heap)
}
struct mem_block *
-mmInit(int ofs, int size)
+u_mmInit(int ofs, int size)
{
struct mem_block *heap, *block;
@@ -165,13 +165,17 @@ SliceBlock(struct mem_block *p,
struct mem_block *
-mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
+u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
{
struct mem_block *p;
const int mask = (1 << align2)-1;
int startofs = 0;
int endofs;
+ assert(size >= 0);
+ assert(align2 >= 0);
+ assert(align2 <= 12); /* sanity check, 2^12 (4KB) enough? */
+
if (!heap || align2 < 0 || size <= 0)
return NULL;
@@ -198,7 +202,7 @@ mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
struct mem_block *
-mmFindBlock(struct mem_block *heap, int start)
+u_mmFindBlock(struct mem_block *heap, int start)
{
struct mem_block *p;
@@ -237,7 +241,7 @@ Join2Blocks(struct mem_block *p)
}
int
-mmFreeMem(struct mem_block *b)
+u_mmFreeMem(struct mem_block *b)
{
if (!b)
return 0;
@@ -266,7 +270,7 @@ mmFreeMem(struct mem_block *b)
void
-mmDestroy(struct mem_block *heap)
+u_mmDestroy(struct mem_block *heap)
{
struct mem_block *p;
diff --git a/src/gallium/auxiliary/util/u_mm.h b/src/gallium/auxiliary/util/u_mm.h
index b226b101cb..ce20e48763 100644
--- a/src/gallium/auxiliary/util/u_mm.h
+++ b/src/gallium/auxiliary/util/u_mm.h
@@ -49,7 +49,7 @@ struct mem_block {
* input: total size in bytes
* return: a heap pointer if OK, NULL if error
*/
-extern struct mem_block *mmInit(int ofs, int size);
+extern struct mem_block *u_mmInit(int ofs, int size);
/**
* Allocate 'size' bytes with 2^align2 bytes alignment,
@@ -61,7 +61,7 @@ extern struct mem_block *mmInit(int ofs, int size);
* startSearch = linear offset from start of heap to begin search
* return: pointer to the allocated block, 0 if error
*/
-extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2,
+extern struct mem_block *u_mmAllocMem(struct mem_block *heap, int size, int align2,
int startSearch);
/**
@@ -69,23 +69,23 @@ extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2
* input: pointer to a block
* return: 0 if OK, -1 if error
*/
-extern int mmFreeMem(struct mem_block *b);
+extern int u_mmFreeMem(struct mem_block *b);
/**
* Free block starts at offset
* input: pointer to a heap, start offset
* return: pointer to a block
*/
-extern struct mem_block *mmFindBlock(struct mem_block *heap, int start);
+extern struct mem_block *u_mmFindBlock(struct mem_block *heap, int start);
/**
* destroy MM
*/
-extern void mmDestroy(struct mem_block *mmInit);
+extern void u_mmDestroy(struct mem_block *mmInit);
/**
* For debuging purpose.
*/
-extern void mmDumpMemInfo(const struct mem_block *mmInit);
+extern void u_mmDumpMemInfo(const struct mem_block *mmInit);
#endif
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index f5619ef791..30f32413d7 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -222,7 +222,8 @@ util_surface_copy(struct pipe_context *pipe,
w, h,
src_map,
do_flip ? -(int) src->stride : src->stride,
- src_x, src_y);
+ src_x,
+ do_flip ? w - src_y : src_y);
}
pipe->screen->surface_unmap(pipe->screen, src);
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 853c503f4f..32f6b072a0 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -460,7 +460,7 @@ l8_put_tile_rgba(ubyte *dst,
for (j = 0; j < w; j++, pRow += 4) {
unsigned r;
r = float_to_ubyte(pRow[0]);
- *dst++ = r;
+ *dst++ = (ubyte) r;
}
p += src_stride;
}
@@ -504,7 +504,7 @@ a8_put_tile_rgba(ubyte *dst,
for (j = 0; j < w; j++, pRow += 4) {
unsigned a;
a = float_to_ubyte(pRow[3]);
- *dst++ = a;
+ *dst++ = (ubyte) a;
}
p += src_stride;
}
@@ -634,7 +634,7 @@ i8_put_tile_rgba(ubyte *dst,
for (j = 0; j < w; j++, pRow += 4) {
unsigned r;
r = float_to_ubyte(pRow[0]);
- *dst++ = r;
+ *dst++ = (ubyte) r;
}
p += src_stride;
}
@@ -769,6 +769,32 @@ z24s8_get_tile_rgba(const unsigned *src,
}
+/*** PIPE_FORMAT_Z32_FLOAT ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z32f_get_tile_rgba(const float *src,
+ unsigned w, unsigned h,
+ float *p,
+ unsigned dst_stride)
+{
+ unsigned i, j;
+
+ for (i = 0; i < h; i++) {
+ float *pRow = p;
+ for (j = 0; j < w; j++, pRow += 4) {
+ pRow[0] =
+ pRow[1] =
+ pRow[2] =
+ pRow[3] = *src++;
+ }
+ p += dst_stride;
+ }
+}
+
+
/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
/**
@@ -913,6 +939,9 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
case PIPE_FORMAT_Z24S8_UNORM:
z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
break;
+ case PIPE_FORMAT_Z32_FLOAT:
+ z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
+ break;
case PIPE_FORMAT_YCBCR:
ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE);
break;
diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c
index bf7d1d1c8d..57b80e5604 100644
--- a/src/gallium/auxiliary/util/u_time.c
+++ b/src/gallium/auxiliary/util/u_time.c
@@ -200,7 +200,7 @@ util_time_timeout(const struct util_time *start,
}
-#if defined(PIPE_SUBSYSYEM_WINDOWS_DISPLAY)
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
void util_time_sleep(unsigned usecs)
{
LONGLONG start, curr, end;
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 23fb0b0831..87488ea2d7 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -122,7 +122,7 @@
#define CELL_DEBUG_CACHE (1 << 6)
/** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+#define SPU_MAX_FRAGMENT_OPS_INSTS 128
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 448b723d85..962775cd33 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -100,12 +100,23 @@ emit_fence(struct cell_context *cell)
const uint batch = cell->cur_batch;
const uint size = cell->buffer_size[batch];
struct cell_command_fence *fence_cmd;
+ struct cell_fence *fence = &cell->fenced_buffers[batch].fence;
+ uint i;
+
+ /* set fence status to emitted, not yet signalled */
+ for (i = 0; i < cell->num_spus; i++) {
+ fence->status[i][0] = CELL_FENCE_EMITTED;
+ }
ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
fence_cmd->opcode = CELL_CMD_FENCE;
- fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+ fence_cmd->fence = fence;
+
+ /* update batch buffer size */
+ cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
+ assert(sizeof(struct cell_command_fence) % 8 == 0);
}
@@ -119,7 +130,7 @@ cell_batch_flush(struct cell_context *cell)
{
static boolean flushing = FALSE;
uint batch = cell->cur_batch;
- const uint size = cell->buffer_size[batch];
+ uint size = cell->buffer_size[batch];
uint spu, cmd_word;
assert(!flushing);
@@ -130,8 +141,10 @@ cell_batch_flush(struct cell_context *cell)
/* Before we use this batch buffer, make sure any fenced texture buffers
* are released.
*/
- if (cell->fenced_buffers[batch].head)
+ if (cell->fenced_buffers[batch].head) {
emit_fence(cell);
+ size = cell->buffer_size[batch];
+ }
flushing = TRUE;
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index c9c0c721bb..037635e466 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
clr->surface = surfIndex;
clr->value = clearValue;
}
+
+ /* Technically, the surface's contents are now known and cleared,
+ * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But
+ * it turns out it's quite painful to recognize when any particular
+ * surface goes from PIPE_SURFACE_STATUS_CLEAR to
+ * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because
+ * the drawing commands could be operating on numerous draw buffers,
+ * which we'd have to iterate through to set all their statuses...
+ * For now, we cheat a bit and set the surface's status to DEFINED
+ * right here. Later we should revisit this and set the status to
+ * CLEAR here, and find a better place to set the status to DEFINED.
+ */
+ ps->status = PIPE_SURFACE_STATUS_DEFINED;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 4491ae8cdf..eb1397bb3f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
*/
struct cell_buffer_list
{
- struct cell_fence fence;
+ struct cell_fence fence ALIGN16_ATTRIB;
struct cell_buffer_node *head;
};
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
index ffb3bea12b..867b5dcaa0 100644
--- a/src/gallium/drivers/cell/ppu/cell_fence.c
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -38,6 +38,7 @@ void
cell_fence_init(struct cell_fence *fence)
{
uint i;
+ ASSERT_ALIGN16(fence->status);
for (i = 0; i < CELL_MAX_SPUS; i++) {
fence->status[i][0] = CELL_FENCE_IDLE;
}
@@ -50,9 +51,9 @@ cell_fence_signalled(const struct cell_context *cell,
{
uint i;
for (i = 0; i < cell->num_spus; i++) {
- //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
- if (fence->status[i][0] == CELL_FENCE_EMITTED)
+ if (fence->status[i][0] != CELL_FENCE_SIGNALLED)
return FALSE;
+ /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/
}
return TRUE;
}
@@ -65,6 +66,15 @@ cell_fence_finish(const struct cell_context *cell,
while (!cell_fence_signalled(cell, fence)) {
usleep(10);
}
+
+#ifdef DEBUG
+ {
+ uint i;
+ for (i = 0; i < cell->num_spus; i++) {
+ assert(fence->status[i][0] == CELL_FENCE_SIGNALLED);
+ }
+ }
+#endif
}
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 6596b72010..a64967b4b9 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
flags |= CELL_FLUSH_WAIT;
}
- if (flags & PIPE_FLUSH_SWAPBUFFERS)
+ if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE))
flags |= CELL_FLUSH_WAIT;
draw_flush( cell->draw );
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index d4d644d6e8..5c41b264ac 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1303,60 +1303,91 @@ lookup_function(struct cell_context *cell, const char *funcname)
/**
* Emit code to call a SPU function.
* Used to implement instructions like SIN/COS/POW/TEX/etc.
+ * If scalar, only the X components of the src regs are used, and the
+ * result is replicated across the dest register's XYZW components.
*/
static boolean
emit_function_call(struct codegen *gen,
const struct tgsi_full_instruction *inst,
- char *funcname, uint num_args)
+ char *funcname, uint num_args, boolean scalar)
{
const uint addr = lookup_function(gen->cell, funcname);
char comment[100];
- int ch;
+ int s_regs[3];
+ int func_called = FALSE;
+ uint a, ch;
+ int retval_reg = -1;
assert(num_args <= 3);
snprintf(comment, sizeof(comment), "CALL %s:", funcname);
spe_comment(gen->f, -4, comment);
+ if (scalar) {
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]);
+ }
+ /* we'll call the function, put the return value in this register,
+ * then replicate it across all write-enabled components in d_reg.
+ */
+ retval_reg = spe_allocate_available_register(gen->f);
+ }
+
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s_regs[3], d_reg;
+ int d_reg;
ubyte usedRegs[SPE_NUM_REGS];
- uint a, i, numUsed;
+ uint i, numUsed;
- for (a = 0; a < num_args; a++) {
- s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+ if (!scalar) {
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+ }
}
- d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- numUsed = spe_get_registers_used(gen->f, usedRegs);
- assert(numUsed < gen->frame_size / 16 - 2);
+ d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* save registers to stack */
- for (i = 0; i < numUsed; i++) {
- uint reg = usedRegs[i];
- int offset = 2 + i;
- spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
- }
+ if (!scalar || !func_called) {
+ /* for a scalar function, we'll really only call the function once */
- /* setup function arguments */
- for (a = 0; a < num_args; a++) {
- spe_move(gen->f, 3 + a, s_regs[a]);
- }
+ numUsed = spe_get_registers_used(gen->f, usedRegs);
+ assert(numUsed < gen->frame_size / 16 - 2);
- /* branch to function, save return addr */
- spe_brasl(gen->f, SPE_REG_RA, addr);
+ /* save registers to stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ int offset = 2 + i;
+ spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
- /* save function's return value */
- spe_move(gen->f, d_reg, 3);
+ /* setup function arguments */
+ for (a = 0; a < num_args; a++) {
+ spe_move(gen->f, 3 + a, s_regs[a]);
+ }
- /* restore registers from stack */
- for (i = 0; i < numUsed; i++) {
- uint reg = usedRegs[i];
- if (reg != d_reg) {
- int offset = 2 + i;
- spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ /* branch to function, save return addr */
+ spe_brasl(gen->f, SPE_REG_RA, addr);
+
+ /* save function's return value */
+ if (scalar)
+ spe_move(gen->f, retval_reg, 3);
+ else
+ spe_move(gen->f, d_reg, 3);
+
+ /* restore registers from stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ if (reg != d_reg && reg != retval_reg) {
+ int offset = 2 + i;
+ spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
}
+
+ func_called = TRUE;
+ }
+
+ if (scalar) {
+ spe_move(gen->f, d_reg, retval_reg);
}
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
@@ -1364,6 +1395,10 @@ emit_function_call(struct codegen *gen,
}
}
+ if (scalar) {
+ spe_release_register(gen->f, retval_reg);
+ }
+
return true;
}
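A sketch of the control flow the scalar path adds: the SPU helper is called once with the X-channel arguments, the return value is parked in a scratch register, and then copied into every write-enabled channel. The C below models that behavior only; 'func' stands in for spu_cos/spu_sin/etc. and this is not the emitted SPU code:

/* Model of the scalar function-call path: evaluate once, then splat
 * the result across all channels selected by the write mask. */
static void call_scalar_func(float (*func)(float),
                             float src_x, float dst[4],
                             unsigned writemask)
{
   const float retval = func(src_x);   /* called once, result reused  */
   unsigned ch;
   for (ch = 0; ch < 4; ch++) {
      if (writemask & (1u << ch))
         dst[ch] = retval;             /* splat into enabled channels */
   }
}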
@@ -1770,15 +1805,15 @@ emit_instruction(struct codegen *gen,
return emit_END(gen);
case TGSI_OPCODE_COS:
- return emit_function_call(gen, inst, "spu_cos", 1);
+ return emit_function_call(gen, inst, "spu_cos", 1, TRUE);
case TGSI_OPCODE_SIN:
- return emit_function_call(gen, inst, "spu_sin", 1);
+ return emit_function_call(gen, inst, "spu_sin", 1, TRUE);
case TGSI_OPCODE_POW:
- return emit_function_call(gen, inst, "spu_pow", 2);
+ return emit_function_call(gen, inst, "spu_pow", 2, TRUE);
case TGSI_OPCODE_EXPBASE2:
- return emit_function_call(gen, inst, "spu_exp2", 1);
+ return emit_function_call(gen, inst, "spu_exp2", 1, TRUE);
case TGSI_OPCODE_LOGBASE2:
- return emit_function_call(gen, inst, "spu_log2", 1);
+ return emit_function_call(gen, inst, "spu_log2", 1, TRUE);
case TGSI_OPCODE_TEX:
/* fall-through for now */
case TGSI_OPCODE_TXD:
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 4e1e53ecdc..d9c3ff3f4d 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1141,13 +1141,17 @@ gen_colormask(struct spe_function *f,
* access to the Compare Immediate instructions where we don't in
* gen_depth_test(), which is what makes us very different.
*
+ * There's some added complexity if there's a non-trivial state->value_mask;
+ * in that case both the stencil buffer value and the reference value must
+ * be masked before the comparison.
+ *
* The return value in the stencil_pass_reg is a bitmask of valid
* fragments that also passed the stencil test. The bitmask of valid
- * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg).
*/
static void
gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
- unsigned int mask_reg, unsigned int fbS_reg,
+ unsigned int stencil_max_value,
+ unsigned int fragment_mask_reg, unsigned int fbS_reg,
unsigned int stencil_pass_reg)
{
/* Generate code that puts the set of passing fragments into the stencil_pass_reg
@@ -1155,68 +1159,134 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
*/
switch (state->func) {
case PIPE_FUNC_EQUAL:
- /* stencil_pass = mask & (s == reference) */
- spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
case PIPE_FUNC_NOTEQUAL:
- /* stencil_pass = mask & ~(s == reference) */
- spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & ~(s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
case PIPE_FUNC_GREATER:
- /* stencil_pass = mask & (s > reference) */
- spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((s&mask) > (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- case PIPE_FUNC_LESS: {
- /* stencil_pass = mask & (reference > s) */
- /* There's no convenient Compare Less Than Immediate instruction, so
- * we'll have to do this one the harder way, by loading a register and
- * comparing directly. Compare Logical Greater Than Word (clgt)
- * treats its operands as unsigned - no sign extension.
- */
- unsigned int tmp_reg = spe_allocate_available_register(f);
- spe_load_uint(f, tmp_reg, state->ref_value);
- spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
- spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
- spe_release_register(f, tmp_reg);
+ case PIPE_FUNC_LESS:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference > s) */
+ /* There's no convenient Compare Less Than Immediate instruction, so
+ * we'll have to do this one the harder way, by loading a register and
+ * comparing directly. Compare Logical Greater Than Word (clgt)
+ * treats its operands as unsigned - no sign extension.
+ */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- }
case PIPE_FUNC_LEQUAL:
- /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
- spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s <= reference)
+ * = fragment_mask & ~(s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- case PIPE_FUNC_GEQUAL: {
- /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
- /* As above, we have to do this by loading a register */
- unsigned int tmp_reg = spe_allocate_available_register(f);
- spe_load_uint(f, tmp_reg, state->ref_value);
- spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
- spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
- spe_release_register(f, tmp_reg);
+ case PIPE_FUNC_GEQUAL:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s >= reference)
+ * = fragment_mask & ~(reference > s) */
+ /* As above, we have to do this by loading a register */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- }
case PIPE_FUNC_NEVER:
- /* stencil_pass = mask & 0 = 0 */
+ /* stencil_pass = fragment_mask & 0 = 0 */
spe_load_uint(f, stencil_pass_reg, 0);
break;
case PIPE_FUNC_ALWAYS:
- /* stencil_pass = mask & 1 = mask */
- spe_move(f, stencil_pass_reg, mask_reg);
+ /* stencil_pass = fragment_mask & 1 = fragment_mask */
+ spe_move(f, stencil_pass_reg, fragment_mask_reg);
break;
}
/* The fragments that passed the stencil test are now in stencil_pass_reg.
- * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+ * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
*/
}
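A scalar model of the masked compare handled above: when value_mask does not cover the whole stencil range, both the buffer value and the reference are AND-ed with the mask before the comparison, and the pass mask is then AND-ed with the fragment mask. The C below is one fragment at a time with a local stand-in enum, whereas the real code operates on four fragments at once in SPU registers (illustration only):

/* 'CmpFunc' is a local stand-in for the pipe compare function. */
enum CmpFunc { CMP_LESS, CMP_EQUAL, CMP_GREATER };

static unsigned stencil_test_ref(unsigned s, unsigned ref,
                                 unsigned value_mask, unsigned fragment_mask,
                                 enum CmpFunc func)
{
   const unsigned ms = s   & value_mask;   /* masked stencil value   */
   const unsigned mr = ref & value_mask;   /* masked reference value */
   unsigned pass = 0;
   switch (func) {
   case CMP_LESS:    pass = (ms <  mr) ? ~0u : 0u; break;
   case CMP_EQUAL:   pass = (ms == mr) ? ~0u : 0u; break;
   case CMP_GREATER: pass = (ms >  mr) ? ~0u : 0u; break;
   }
   return pass & fragment_mask;   /* failing fragments: fragment_mask & ~result */
}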
@@ -1282,7 +1352,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
/* Add Word Immediate computes rT = rA + 10-bit signed immediate */
spe_ai(f, newS_reg, fbS_reg, 1);
/* Select from the current value or the new value based on the equality test */
- spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
spe_release_register(f, equals_reg);
break;
@@ -1295,7 +1365,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
/* Add Word Immediate with a (-1) value works */
spe_ai(f, newS_reg, fbS_reg, -1);
/* Select from the current value or the new value based on the equality test */
- spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
spe_release_register(f, equals_reg);
break;
@@ -1534,15 +1604,28 @@ gen_stencil_depth_test(struct spe_function *f,
* meaning that we have to calculate the stencil values but do not
* need to mask them), we can avoid generating code. Don't forget
* that we need to consider backfacing stencil, if enabled.
+ *
+ * Note that if the backface stencil is *not* enabled, the backface
+ * stencil will have the same values as the frontface stencil.
*/
- if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
- /* Trivial: don't need to calculate stencil values, and don't need to
- * write them back to the framebuffer.
+ if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+ /* No changes to any stencil values */
+ need_to_calculate_stencil_values = false;
+ need_to_writemask_stencil_values = false;
+ }
+ else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+ /* All changes are writemasked out, so no need to calculate
+ * what those changes might be, and no need to write anything back.
*/
need_to_calculate_stencil_values = false;
need_to_writemask_stencil_values = false;
}
- else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+ else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
/* Still trivial, but a little less so. We need to write the stencil
* values, but we don't need to mask them.
*/
@@ -1583,7 +1666,7 @@ gen_stencil_depth_test(struct spe_function *f,
*/
spe_comment(f, 0, "Running basic stencil test");
stencil_pass_reg = spe_allocate_available_register(f);
- gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+ gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg);
/* If two-sided stenciling is on, generate code to run the stencil
* test on the backfacing stencil as well, and combine the two results
@@ -1592,7 +1675,7 @@ gen_stencil_depth_test(struct spe_function *f,
if (dsa->stencil[1].enabled) {
unsigned int temp_reg = spe_allocate_available_register(f);
spe_comment(f, 0, "Running backface stencil test");
- gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+ gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg);
spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
spe_release_register(f, temp_reg);
}
@@ -1914,81 +1997,79 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
* Z and/or stencil. We'll also convert the incoming fragment Z
* value in fragZ_reg from a floating point value in [0.0..1.0] to
* an unsigned integer value with the appropriate resolution.
+ * Note that even if depth or stencil is *not* enabled, if it's
+ * present in the buffer, we pull it out and put it back later;
+ * otherwise, we can inadvertently destroy the contents of
+ * buffers we're not supposed to touch (e.g., if the user is
+ * clearing the depth buffer but not the stencil buffer, a
+ * quad of constant depth is drawn over the surface; the stencil
+ * buffer must be maintained).
*/
switch(zs_format) {
case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
case PIPE_FORMAT_X8Z24_UNORM:
- if (dsa->depth.enabled) {
- /* We need the Z part at least */
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* four 24-bit Z values in the low-order bits */
- spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 24-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- }
- if (dsa->stencil[0].enabled) {
- setup_optional_register(f, &fbS_reg_set, &fbS_reg);
- /* four 8-bit Z values in the high-order bits */
- spe_rotmi(f, fbS_reg, fbZS_reg, -24);
- }
- break;
+ /* Pull out both Z and stencil */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+ /* four 24-bit Z values in the low-order bits */
+ spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+ /* four 8-bit stencil values in the high-order bits */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+ break;
case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
case PIPE_FORMAT_Z24X8_UNORM:
- if (dsa->depth.enabled) {
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* shift by 8 to get the upper 24-bit values */
- spe_rotmi(f, fbS_reg, fbZS_reg, -8);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 24-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- }
- if (dsa->stencil[0].enabled) {
- setup_optional_register(f, &fbS_reg_set, &fbS_reg);
- /* 8-bit stencil in the low-order bits - mask them out */
- spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
- }
- break;
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+ /* shift by 8 to get the upper 24-bit values */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+ /* 8-bit stencil in the low-order bits - mask them out */
+ spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+ break;
case PIPE_FORMAT_Z32_UNORM:
- if (dsa->depth.enabled) {
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* Copy over 4 32-bit values */
- spe_move(f, fbZ_reg, fbZS_reg);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 32-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- }
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 32-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
/* No stencil, so can't do anything there */
- break;
+ break;
case PIPE_FORMAT_Z16_UNORM:
- if (dsa->depth.enabled) {
- /* XXX Not sure this is correct, but it was here before, so we're
- * going with it for now
- */
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* Copy over 4 32-bit values */
- spe_move(f, fbZ_reg, fbZS_reg);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 16-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
- }
+ /* XXX Not sure this is correct, but it was here before, so we're
+ * going with it for now
+ */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 16-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
/* No stencil */
- break;
default:
ASSERT(0); /* invalid format */
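
For reference, a scalar sketch (not part of the patch) of the S8Z24/X8Z24
unpack and of the cfltu/rotmi conversion of the fragment Z from a float in
[0.0, 1.0] to a 24-bit unsigned integer:

static void
unpack_s8z24(unsigned zs, float fragZ,
             unsigned *fbZ, unsigned *fbS, unsigned *fragZ24)
{
   double scaled;

   *fbZ = zs & 0x00ffffff;   /* low 24 bits hold the depth value   */
   *fbS = zs >> 24;          /* high 8 bits hold the stencil value */

   /* cfltu with a scale of 32 multiplies by 2^32 and saturates to a
    * 32-bit unsigned value; rotmi by -8 then leaves a 24-bit depth.
    */
   scaled = (double) fragZ * 4294967296.0;
   if (scaled > 4294967295.0)
      scaled = 4294967295.0;
   *fragZ24 = ((unsigned) scaled) >> 8;
}
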
@@ -2035,39 +2116,19 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_comment(f, 0, "Store quad's depth/stencil values in tile");
if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- if (fbS_reg_set && fbZ_reg_set) {
- spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
- }
- else if (fbS_reg_set) {
- spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
- }
- else {
- spe_move(f, fbZS_reg, fbZ_reg);
- }
+ spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
}
else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- if (fbS_reg_set && fbZ_reg_set) {
- spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
- }
- else if (fbS_reg_set) {
- spe_move(f, fbZS_reg, fbS_reg);
- }
- else {
- spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
- }
+ spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
}
else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
- if (fbZ_reg_set) {
- spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
- }
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- if (fbZ_reg_set) {
- spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
- }
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_S8_UNORM) {
ASSERT(0); /* XXX to do */
@@ -2080,6 +2141,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
}
+ /* Don't need these any more */
release_optional_register(f, &fbZ_reg_set, fbZ_reg);
release_optional_register(f, &fbS_reg_set, fbS_reg);
}
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index b633880c25..c93958a9ed 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -30,7 +30,6 @@
#include <libspe2.h>
-#include <libmisc.h>
#include <pthread.h>
#include "cell/common.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9ac2f3bbb9..ae88d06912 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -28,6 +28,7 @@
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
* Michel Dänzer <michel@tungstengraphics.com>
+ * Brian Paul
*/
#include "pipe/p_context.h"
@@ -42,10 +43,9 @@
#include "cell_texture.h"
-/* Simple, maximally packed layout.
- */
-static unsigned minify( unsigned d )
+static unsigned
+minify(unsigned d)
{
return MAX2(1, d>>1);
}
@@ -212,6 +212,89 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
/**
+ * For Cell. Basically, rearrange the pixels/quads from this layout:
+ * +--+--+--+--+
+ * |p0|p1|p2|p3|....
+ * +--+--+--+--+
+ *
+ * to this layout:
+ * +--+--+
+ * |p0|p1|....
+ * +--+--+
+ * |p2|p3|
+ * +--+--+
+ */
+static void
+twiddle_tile(const uint *tileIn, uint *tileOut)
+{
+ int y, x;
+
+ for (y = 0; y < TILE_SIZE; y+=2) {
+ for (x = 0; x < TILE_SIZE; x+=2) {
+ int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
+ tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k];
+ tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1];
+ tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2];
+ tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3];
+ }
+ }
+}
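
A hypothetical self-check (not in the patch) that builds a quad-interleaved
tile with the same index math and verifies that twiddle_tile recovers the
row-major layout; it assumes TILE_SIZE is even and uses the ASSERT macro
already used elsewhere in this file:

static void
check_twiddle_tile(void)
{
   uint linear[TILE_SIZE * TILE_SIZE];
   uint tiled[TILE_SIZE * TILE_SIZE];
   uint out[TILE_SIZE * TILE_SIZE];
   int x, y;
   uint i;

   for (i = 0; i < TILE_SIZE * TILE_SIZE; i++)
      linear[i] = i;

   /* build the interleaved form: each 2x2 quad is stored as four
    * consecutive words, quads in row-major quad order
    */
   for (y = 0; y < TILE_SIZE; y += 2) {
      for (x = 0; x < TILE_SIZE; x += 2) {
         int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
         tiled[k+0] = linear[(y+0) * TILE_SIZE + (x+0)];
         tiled[k+1] = linear[(y+0) * TILE_SIZE + (x+1)];
         tiled[k+2] = linear[(y+1) * TILE_SIZE + (x+0)];
         tiled[k+3] = linear[(y+1) * TILE_SIZE + (x+1)];
      }
   }

   twiddle_tile(tiled, out);

   for (i = 0; i < TILE_SIZE * TILE_SIZE; i++)
      ASSERT(out[i] == linear[i]);
}
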
+
+
+/**
+ * Convert image from tiled layout to linear layout. 4-byte pixels.
+ */
+static void
+untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+ uint dst_stride, const uint *src)
+{
+ const uint tile_size2 = tile_size * tile_size;
+ const uint h_t = (h + tile_size - 1) / tile_size;
+ const uint w_t = (w + tile_size - 1) / tile_size;
+ uint *tile_buf;
+ uint it, jt; /* tile counters */
+ uint i, j; /* intra-tile counters */
+
+ dst_stride /= 4; /* convert from bytes to pixels */
+
+ tile_buf = align_malloc(tile_size * tile_size * 4, 16);
+
+ /* loop over src tiles */
+ for (it = 0; it < h_t; it++) {
+ for (jt = 0; jt < w_t; jt++) {
+ /* start of src tile: */
+ const uint *tsrc = src + (it * w_t + jt) * tile_size2;
+
+ twiddle_tile(tsrc, tile_buf);
+ tsrc = tile_buf;
+
+ /* compute size of this tile (may be smaller than tile_size) */
+ /* XXX note: a compiler bug was found here. That's why the code
+ * looks as it does.
+ */
+ uint tile_width = w - jt * tile_size;
+ tile_width = MIN2(tile_width, tile_size);
+ uint tile_height = h - it * tile_size;
+ tile_height = MIN2(tile_height, tile_size);
+
+ /* loop over texels in the tile */
+ for (i = 0; i < tile_height; i++) {
+ for (j = 0; j < tile_width; j++) {
+ uint dsti = it * tile_size + i;
+ uint dstj = jt * tile_size + j;
+ ASSERT(dsti < h);
+ ASSERT(dstj < w);
+ dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j];
+ }
+ }
+ }
+ }
+
+ align_free(tile_buf);
+}
+
+
+/**
* Convert linear texture image data to tiled format for SPU usage.
*/
static void
@@ -230,6 +313,7 @@ cell_twiddle_texture(struct pipe_screen *screen,
switch (ct->base.format) {
case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
{
int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
int offset = bufWidth * bufHeight * 4 * surface->face;
@@ -261,6 +345,51 @@ cell_twiddle_texture(struct pipe_screen *screen,
}
+/**
+ * Convert SPU tiled texture image data to linear format for app usage.
+ */
+static void
+cell_untwiddle_texture(struct pipe_screen *screen,
+ struct pipe_surface *surface)
+{
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+ const uint texWidth = ct->base.width[level];
+ const uint texHeight = ct->base.height[level];
+ const void *map = pipe_buffer_map(screen, surface->buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
+
+ switch (ct->base.format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ {
+ int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+ int offset = surface->stride * texHeight * 4 * surface->face;
+ uint *dst;
+
+ if (!ct->untiled_data[level]) {
+ ct->untiled_data[level] =
+ align_malloc(surface->stride * texHeight * 4 * numFaces, 16);
+ }
+
+ dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset);
+
+ untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+ surface->stride, src);
+ }
+ break;
+ default:
+ {
+ ct->untiled_data[level] = NULL;
+ printf("Cell: untwiddle unsupported texture format\n");
+ }
+ }
+
+ pipe_buffer_unmap(screen, surface->buffer);
+}
+
+
static struct pipe_surface *
cell_get_tex_surface(struct pipe_screen *screen,
struct pipe_texture *pt,
@@ -294,13 +423,18 @@ cell_get_tex_surface(struct pipe_screen *screen,
ps->zslice = zslice;
if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
- ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
- ps->nblocksy *
- ps->stride;
+ ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
+ ps->nblocksy *
+ ps->stride;
}
else {
- assert(face == 0);
- assert(zslice == 0);
+ assert(face == 0);
+ assert(zslice == 0);
+ }
+
+ if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+ /* convert from tiled to linear layout */
+ cell_untwiddle_texture(screen, ps);
}
}
return ps;
@@ -311,6 +445,15 @@ static void
cell_tex_surface_release(struct pipe_screen *screen,
struct pipe_surface **s)
{
+ struct cell_texture *ct = cell_texture((*s)->texture);
+ const uint level = (*s)->level;
+
+ if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level]))
+ {
+ align_free(ct->untiled_data[level]);
+ ct->untiled_data[level] = NULL;
+ }
+
/* XXX if done rendering to teximage, re-tile */
pipe_texture_reference(&(*s)->texture, NULL);
@@ -325,6 +468,10 @@ cell_surface_map(struct pipe_screen *screen,
unsigned flags)
{
ubyte *map;
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+
+ assert(ct);
if (flags & ~surface->usage) {
assert(0);
@@ -335,7 +482,14 @@ cell_surface_map(struct pipe_screen *screen,
if (map == NULL)
return NULL;
else
- return (void *) (map + surface->offset);
+ {
+ if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) {
+ return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset);
+ }
+ else {
+ return (void *) (map + surface->offset);
+ }
+ }
}
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 2f5fe0dd1b..7018b0c9bf 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -52,6 +52,7 @@ struct cell_texture
struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
/** Mapped, tiled texture data */
void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
+ void *untiled_data[CELL_MAX_TEXTURE_LEVELS];
};
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a6ed29ea63..d726622d94 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -107,7 +107,7 @@ cmd_fence(struct cell_command_fence *fence_cmd)
CELL_FENCE_SIGNALLED};
uint *dst = (uint *) fence_cmd->fence;
dst += 4 * spu.init.id; /* main store/memory address, not local store */
-
+ ASSERT_ALIGN16(dst);
mfc_put((void *) &status, /* src in local memory */
(unsigned int) dst, /* dst in main memory */
sizeof(status), /* size */
@@ -244,8 +244,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
}
}
- spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
- spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+ spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
}
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 3534b35000..ff3d609d25 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -38,7 +38,9 @@
#include <math.h>
#include <cos14_v.h>
#include <sin14_v.h>
-#include <transpose_matrix4x4.h>
+#include <simdmath/exp2f4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
#include "cell/common.h"
#include "spu_main.h"
@@ -68,37 +70,19 @@ spu_sin(vector float x)
static vector float
spu_pow(vector float x, vector float y)
{
- float z0 = powf(spu_extract(x,0), spu_extract(y,0));
- float z1 = powf(spu_extract(x,1), spu_extract(y,1));
- float z2 = powf(spu_extract(x,2), spu_extract(y,2));
- float z3 = powf(spu_extract(x,3), spu_extract(y,3));
- return (vector float) {z0, z1, z2, z3};
+ return _powf4(x, y);
}
static vector float
spu_exp2(vector float x)
{
- float z0 = powf(2.0f, spu_extract(x,0));
- float z1 = powf(2.0f, spu_extract(x,1));
- float z2 = powf(2.0f, spu_extract(x,2));
- float z3 = powf(2.0f, spu_extract(x,3));
- return (vector float) {z0, z1, z2, z3};
+ return _exp2f4(x);
}
static vector float
spu_log2(vector float x)
{
- /*
- * log_base_2(x) = log(x) / log(2)
- * 1.442695 = 1/log(2).
- */
- static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
- float z0 = logf(spu_extract(x,0));
- float z1 = logf(spu_extract(x,1));
- float z2 = logf(spu_extract(x,2));
- float z3 = logf(spu_extract(x,3));
- vector float v = (vector float) {z0, z1, z2, z3};
- return spu_mul(v, k);
+ return _log2f4(x);
}
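
The SIMD calls above replace per-component scalar math; for reference, the
scalar identities they implement (illustration only, using only standard
libm, not code from this patch):

static float scalar_exp2(float x) { return powf(2.0f, x); }
static float scalar_log2(float x) { return logf(x) * 1.442695f; /* 1/ln(2) */ }
static float scalar_pow(float x, float y)  { return powf(x, y); }
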
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 668af10be2..692790c9f3 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -160,8 +160,7 @@ struct spu_global
tile_t ztile ALIGN16_ATTRIB;
/** Read depth/stencil tiles? */
- boolean read_depth;
- boolean read_stencil;
+ boolean read_depth_stencil;
/** Current tiles' status */
ubyte cur_ctile_status, cur_ztile_status;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5515bb55c9..7c225e2f27 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
static INLINE void
get_cz_tiles(uint tx, uint ty)
{
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
//printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
wait_put_cz_tiles(void)
{
wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
wait_on_mask(1 << TAG_WRITE_TILE_Z);
}
}
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 4caf7d6b61..5f908159bb 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -369,7 +369,7 @@ flush_spans(void)
}
ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
/* wait for mfc_get() to complete */
//printf("SPU: %u: waiting for ztile\n", spu.init.id);
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 0111469405..31908a517b 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -40,7 +40,7 @@
#include "tgsi/tgsi_sse2.h"
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
#include "rtasm/rtasm_x86sse.h"
@@ -92,7 +92,8 @@ fs_sse_run( const struct sp_fragment_shader *base,
machine->Temps);
/* init kill mask */
- machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] = 0x0;
+ tgsi_set_kill_mask(machine, 0x0);
+ tgsi_set_exec_mask(machine, 1, 1, 1, 1);
shader->func( machine->Inputs,
machine->Outputs,
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
index d05e12d1d9..b7aac7f84a 100644
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ b/src/gallium/drivers/softpipe/sp_quad_output.c
@@ -64,6 +64,14 @@ output_quad(struct quad_stage *qs, struct quad_header *quad)
for (i = 0; i < 4; i++) { /* loop over color chans */
tile->data.color[y][x][i] = quadColor[i][j];
}
+ if (0) {
+ debug_printf("sp write pixel %d,%d: %g, %g, %g\n",
+ quad->input.x0 + x,
+ quad->input.y0 + y,
+ quadColor[0][j],
+ quadColor[1][j],
+ quadColor[2][j]);
+ }
}
}
}
diff --git a/src/gallium/include/pipe/p_debug.h b/src/gallium/include/pipe/p_debug.h
index cb6196aa9f..3b00fb9aa8 100644
--- a/src/gallium/include/pipe/p_debug.h
+++ b/src/gallium/include/pipe/p_debug.h
@@ -49,7 +49,7 @@ extern "C" {
#endif
-#ifdef DBG
+#if defined(DBG) || defined(DEBUG)
#ifndef DEBUG
#define DEBUG 1
#endif
diff --git a/src/gallium/include/pipe/p_inlines.h b/src/gallium/include/pipe/p_inlines.h
index d70de8e301..5e79b7f485 100644
--- a/src/gallium/include/pipe/p_inlines.h
+++ b/src/gallium/include/pipe/p_inlines.h
@@ -82,11 +82,14 @@ static INLINE void
pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
{
/* bump the refcount first */
- if (surf)
+ if (surf) {
+ assert(surf->refcount);
surf->refcount++;
+ }
if (*ptr) {
-
+ assert((*ptr)->refcount);
+
/* There are currently two sorts of surfaces... This needs to be
* fixed so that all surfaces are views into a texture.
*/
@@ -113,11 +116,16 @@ winsys_buffer_reference(struct pipe_winsys *winsys,
struct pipe_buffer **ptr,
struct pipe_buffer *buf)
{
- if (buf)
+ if (buf) {
+ assert(buf->refcount);
buf->refcount++;
+ }
- if (*ptr && --(*ptr)->refcount == 0)
- winsys->buffer_destroy( winsys, *ptr );
+ if (*ptr) {
+ assert((*ptr)->refcount);
+ if(--(*ptr)->refcount == 0)
+ winsys->buffer_destroy( winsys, *ptr );
+ }
*ptr = buf;
}
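
The bump-before-release ordering in these helpers is what makes
self-assignment safe: winsys_buffer_reference(ws, &p, p) can never free the
buffer it is about to keep. A minimal sketch of the same pattern with a
hypothetical object type (not the Gallium API):

#include <assert.h>
#include <stdlib.h>

struct obj { int refcount; };

static void
obj_reference(struct obj **ptr, struct obj *o)
{
   if (o) {
      assert(o->refcount);
      o->refcount++;                   /* take the new reference first...   */
   }
   if (*ptr) {
      assert((*ptr)->refcount);
      if (--(*ptr)->refcount == 0)     /* ...so releasing the old one can   */
         free(*ptr);                   /* only destroy a truly dead object  */
   }
   *ptr = o;
}
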
@@ -133,12 +141,15 @@ pipe_texture_reference(struct pipe_texture **ptr,
{
assert(ptr);
- if (pt)
+ if (pt) {
+ assert(pt->refcount);
pt->refcount++;
+ }
if (*ptr) {
struct pipe_screen *screen = (*ptr)->screen;
assert(screen);
+ assert((*ptr)->refcount);
screen->texture_release(screen, ptr);
assert(!*ptr);
@@ -154,6 +165,7 @@ pipe_texture_release(struct pipe_texture **ptr)
struct pipe_screen *screen;
assert(ptr);
screen = (*ptr)->screen;
+ assert((*ptr)->refcount);
screen->texture_release(screen, ptr);
*ptr = NULL;
}
@@ -176,12 +188,6 @@ pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size )
return screen->winsys->user_buffer_create(screen->winsys, ptr, size);
}
-static INLINE void
-pipe_buffer_destroy( struct pipe_screen *screen, struct pipe_buffer *buf )
-{
- screen->winsys->buffer_destroy(screen->winsys, buf);
-}
-
static INLINE void *
pipe_buffer_map(struct pipe_screen *screen,
struct pipe_buffer *buf,