9 files changed, 1230 insertions, 111 deletions
diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
index c5d2082087..d7df9490cf 100644
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -11,8 +11,10 @@ C_SOURCES = \
 	tgsi_info.c \
 	tgsi_iterate.c \
 	tgsi_parse.c \
+	tgsi_ppc.c \
 	tgsi_scan.c \
 	tgsi_sse2.c \
+	tgsi_text.c \
 	tgsi_transform.c \
 	tgsi_util.c
 
diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript
index 45bf3f6d57..8200cce42f 100644
--- a/src/gallium/auxiliary/tgsi/SConscript
+++ b/src/gallium/auxiliary/tgsi/SConscript
@@ -12,6 +12,7 @@ tgsi = env.ConvenienceLibrary(
 		'tgsi_parse.c',
 		'tgsi_sanity.c',
 		'tgsi_scan.c',
+		'tgsi_ppc.c',
 		'tgsi_sse2.c',
 		'tgsi_text.c',
 		'tgsi_transform.c',
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 74614d3688..38fcaf8829 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -793,10 +793,14 @@ tgsi_default_instruction_ext_nv( void )
    return instruction_ext_nv;
 }
 
-union token_u32
+
+/** Return non-zero if the 32-bit values pointed to by a and b differ. */
+static INLINE boolean
+compare32(const void *a, const void *b)
 {
-   unsigned u32;
-};
+   return *((const uint32_t *) a) != *((const uint32_t *) b);
+}
+
 
 unsigned
 tgsi_compare_instruction_ext_nv(
@@ -805,7 +809,7 @@ tgsi_compare_instruction_ext_nv(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_nv
@@ -864,7 +868,7 @@ tgsi_compare_instruction_ext_label(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_label
@@ -905,7 +909,7 @@ tgsi_compare_instruction_ext_texture(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_texture
@@ -1027,7 +1031,7 @@ tgsi_compare_src_register_ext_swz(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_swz
@@ -1095,7 +1099,7 @@ tgsi_compare_src_register_ext_mod(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_mod
@@ -1241,7 +1245,7 @@ tgsi_compare_dst_register_ext_concode(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_concode
@@ -1299,7 +1303,7 @@ tgsi_compare_dst_register_ext_modulate(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_modulate
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index afc8ffa553..3177f54952 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -68,6 +68,7 @@ dump_enum(
 #define CHR(C)          ctx->printf( ctx, "%c", C )
 #define UIX(I)          ctx->printf( ctx, "0x%x", I )
 #define UID(I)          ctx->printf( ctx, "%u", I )
+#define INSTID(I)       ctx->printf( ctx, "%3u", I )
 #define SID(I)          ctx->printf( ctx, "%d", I )
 #define FLT(F)          ctx->printf( ctx, "%10.4f", F )
 #define ENM(E,ENUMS)    dump_enum( ctx, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
@@ -315,8 +316,8 @@ iter_instruction(
    uint i;
    boolean first_reg = TRUE;
 
-   UID( instno );
-   CHR( ':' );
+   INSTID( instno );
+   TXT( ": " );
    TXT( tgsi_get_opcode_info( inst->Instruction.Opcode )->mnemonic );
 
    switch (inst->Instruction.Saturate) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index df002939c6..1a5294eabc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1674,6 +1674,7 @@ exec_declaration(
             break;
 
          default:
+            eval = NULL;
             assert( 0 );
          }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 3757486ba9..2cd56e413a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -88,16 +88,33 @@ tgsi_parse_end_of_tokens(
       1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
 }
 
+
+/**
+ * Copy one 4-byte token from 'src' to 'dst' with memcpy() instead of a
+ * type-punned struct assignment.  The aliasing warnings that punning
+ * triggers seem harmless on x86 but on PPC they cause real failures.
+ */
+static INLINE void
+copy_token(void *dst, const void *src)
+{
+   memcpy(dst, src, 4);   /* tokens are 4 bytes each */
+}
+
+
+/**
+ * Copy the next 4-byte token to 'token' and advance the parse position.
+ */
 static void
 next_token(
    struct tgsi_parse_context *ctx,
    void *token )
 {
    assert( !tgsi_parse_end_of_tokens( ctx ) );
-
-   *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
+   copy_token(token, &ctx->Tokens[ctx->Position]);
+   ctx->Position++;
 }
 
+
 void
 tgsi_parse_token(
    struct tgsi_parse_context *ctx )
@@ -116,7 +133,7 @@ tgsi_parse_token(
       struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
 
       *decl = tgsi_default_full_declaration();
-      decl->Declaration = *(struct tgsi_declaration *) &token;
+      copy_token(&decl->Declaration, &token);
 
       next_token( ctx, &decl->DeclarationRange );
 
@@ -132,8 +149,7 @@ tgsi_parse_token(
       struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
 
       *imm = tgsi_default_full_immediate();
-      imm->Immediate = *(struct tgsi_immediate *) &token;
-
+      copy_token(&imm->Immediate, &token);
       assert( !imm->Immediate.Extended );
 
       switch (imm->Immediate.DataType) {
@@ -158,8 +174,7 @@ tgsi_parse_token(
       unsigned extended;
 
       *inst = tgsi_default_full_instruction();
-      inst->Instruction = *(struct tgsi_instruction *) &token;
-
+      copy_token(&inst->Instruction, &token);
       extended = inst->Instruction.Extended;
 
       while( extended ) {
@@ -169,18 +184,15 @@ tgsi_parse_token(
 
          switch( token.Type ) {
          case TGSI_INSTRUCTION_EXT_TYPE_NV:
-            inst->InstructionExtNv =
-               *(struct tgsi_instruction_ext_nv *) &token;
+            copy_token(&inst->InstructionExtNv, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
-            inst->InstructionExtLabel =
-               *(struct tgsi_instruction_ext_label *) &token;
+            copy_token(&inst->InstructionExtLabel, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
-            inst->InstructionExtTexture =
-               *(struct tgsi_instruction_ext_texture *) &token;
+            copy_token(&inst->InstructionExtTexture, &token);
             break;
 
          default:
@@ -212,13 +224,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
-               inst->FullDstRegisters[i].DstRegisterExtConcode =
-                  *(struct tgsi_dst_register_ext_concode *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode,
+                          &token);
                break;
 
             case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
-               inst->FullDstRegisters[i].DstRegisterExtModulate =
-                  *(struct tgsi_dst_register_ext_modulate *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate,
+                          &token);
                break;
 
             default:
@@ -245,13 +257,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
-               inst->FullSrcRegisters[i].SrcRegisterExtSwz =
-                  *(struct tgsi_src_register_ext_swz *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz,
+                          &token);
                break;
 
             case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
-               inst->FullSrcRegisters[i].SrcRegisterExtMod =
-                  *(struct tgsi_src_register_ext_mod *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod,
+                          &token);
                break;
 
             default:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
new file mode 100644
index 0000000000..9ad7ecd7cf
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -0,0 +1,910 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI to PowerPC code generation.
+ */
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_debug.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_sse.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi_exec.h"
+#include "tgsi_ppc.h"
+#include "rtasm/rtasm_ppc.h"
+
+
+/**
+ * Since it's pretty much impossible to form PPC vector immediates, keep
+ * the scalar constants here; load_constant_vec() looks them up by value.
+ */
+const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+   1.0f, -128.0f, 128.0f, 0.0f
+};
+
+
+#define FOR_EACH_CHANNEL( CHAN )\
+   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
+
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
+#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C
+
+#define TEMP_R0   TGSI_EXEC_TEMP_R0
+#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+
+
+/**
+ * Context/state used during code gen.
+ */
+struct gen_context
+{
+   struct ppc_function *f;
+   int inputs_reg;    /**< GP register pointing to input params */
+   int outputs_reg;   /**< GP register pointing to output params */
+   int temps_reg;     /**< GP register pointing to temporary "registers" */
+   int immed_reg;     /**< GP register pointing to immediates buffer */
+   int const_reg;     /**< GP register pointing to constants buffer */
+   int builtins_reg;  /**< GP register pointing to built-in constants */
+
+   int one_vec;       /**< vector reg with {1.0, 1.0, 1.0, 1.0}; < 0 if unset */
+   int bit31_vec;     /**< vector reg with 1<<31 in each word; < 0 if unset */
+};
+
+
+/**
+ * Load the given vector register with {value, value, value, value}.
+ * The value must be in the ppc_builtin_constants[] array.
+ * We wouldn't need this if there was a simple way to load PPC vector
+ * registers with immediate values!
+ */
+static void
+load_constant_vec(struct gen_context *gen, int dst_vec, float value)
+{
+   uint pos;
+   for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
+      if (ppc_builtin_constants[pos] == value) {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = pos * 4;   /* byte offset of the float in the table */
+
+         ppc_li(gen->f, offset_reg, offset);
+         /* Load 4-byte word into vector register.
+          * The vector slot depends on the effective address we load from.
+          * We know that our builtins start at a 16-byte boundary so we
+          * know that 'pos % 4' tells us which vector slot will have the
+          * loaded word.  The other vector slots will be undefined.
+          */
+         ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
+         /* splat word[pos % 4] across the vector reg */
+         ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
+         ppc_release_register(gen->f, offset_reg);
+         return;
+      }
+   }
+   assert(0 && "Need to add new constant to ppc_builtin_constants array");
+}
+
+
+/**
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ */
+static int
+gen_one_vec(struct gen_context *gen)
+{
+   if (gen->one_vec < 0) {   /* negative: not allocated/loaded yet */
+      gen->one_vec = ppc_allocate_vec_register(gen->f);
+      load_constant_vec(gen, gen->one_vec, 1.0f);
+   }
+   return gen->one_vec;
+}
+
+/**
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
+ */
+static int
+gen_get_bit31_vec(struct gen_context *gen)
+{
+   if (gen->bit31_vec < 0) {   /* negative: not allocated/loaded yet */
+      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vspltisw(gen->f, gen->bit31_vec, -1);   /* vec = {-1, -1, -1, -1} */
+      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec); /* vec = {1<<31, ...} */
+   }
+   return gen->bit31_vec;
+}
+
+
+/**
+ * Fetch one (swizzled, sign-adjusted) source channel into 'dst_vec'.
+ */
+static void
+emit_fetch(struct gen_context *gen,
+           unsigned dst_vec,
+           const struct tgsi_full_src_register *reg,
+           const unsigned chan_index)
+{
+   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:   /* offset: 4 channels per register, 16 bytes each */
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:   /* same layout as inputs */
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:   /* same layout as inputs */
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_CONSTANT:   /* offset: 4 channels per register, 4 bytes each */
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+            ppc_li(gen->f, offset_reg, offset);
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our constants start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
+            ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      default:
+         assert( 0 );   /* register file not handled here */
+      }
+      break;
+   case TGSI_EXTSWIZZLE_ZERO:
+      ppc_vzero(gen->f, dst_vec);
+      break;
+   case TGSI_EXTSWIZZLE_ONE:
+      {
+         int one_vec = gen_one_vec(gen);
+         ppc_vmove(gen->f, dst_vec, one_vec);
+      }
+      break;
+   default:
+      assert( 0 );   /* unknown extended swizzle */
+   }
+
+   {  /* apply the register's sign mode by editing bit 31 of each word */
+      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+         int bit31_vec = gen_get_bit31_vec(gen);
+
+         switch (sign_op) {
+         case TGSI_UTIL_SIGN_CLEAR:
+            /* vec = vec & ~bit31 */
+            ppc_vandc(gen->f, dst_vec, dst_vec, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_SET:
+            /* vec = vec | bit31 */
+            ppc_vor(gen->f, dst_vec, dst_vec, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_TOGGLE:
+            /* vec = vec ^ bit31 */
+            ppc_vxor(gen->f, dst_vec, dst_vec, bit31_vec);
+            break;
+         default:
+            assert(0);
+         }
+      }
+   }
+}
+
+#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \
+   emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+
+
+
+/**
+ * Register store.  Store 'src_vec' to channel 'chan_index' of 'reg'.
+ */
+static void
+emit_store(struct gen_context *gen,
+           unsigned src_vec,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           unsigned chan_index)
+{
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:   /* offset: 4 channels per register, 16 bytes each */
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+   case TGSI_FILE_TEMPORARY:   /* same layout as outputs */
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+#if 0 /* address-register stores are disabled (references non-PPC names) */
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+#endif
+   default:
+      assert( 0 );   /* destination register file not handled here */
+   }
+
+#if 0 /* saturation is disabled; 'inst' is therefore unused for now */
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+#endif
+}
+
+
+#define STORE( GEN, INST, XMM, INDEX, CHAN )\
+   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+
+
+
+static void
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+
+   FETCH(gen, *inst, v0, 0, CHAN_X);   /* v0 = srcreg[0].x (scalar operand) */
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_RSQ:
+      /* v1 = 1.0 / sqrt(v0) */
+      ppc_vrsqrtefp(gen->f, v1, v0);
+      break;
+   case TGSI_OPCODE_RCP:
+      /* v1 = 1.0 / v0 */
+      ppc_vrefp(gen->f, v1, v0);
+      break;
+   default:
+      assert(0);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {   /* same result to each */
+      STORE(gen, *inst, v1, 0, chan_index);
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+}
+
+
+static void
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ABS:
+         /* turn off the most significant bit of each vector float word */
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
+            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
+            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_FLOOR:
+         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
+         break;
+      case TGSI_OPCODE_FRAC:
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
+            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_EXPBASE2:
+         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
+         break;
+      case TGSI_OPCODE_LOGBASE2:
+         /* XXX this may be broken! */
+         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
+         break;
+      case TGSI_OPCODE_MOV:
+         /* nothing */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+}
+
+
+static void
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         ppc_vaddfp(gen->f, v2, v0, v1);      /* v2 = v0 + v1 */
+         break;
+      case TGSI_OPCODE_SUB:
+         ppc_vsubfp(gen->f, v2, v0, v1);      /* v2 = v0 - v1 */
+         break;
+      case TGSI_OPCODE_MUL:
+         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
+         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_MIN:
+         ppc_vminfp(gen->f, v2, v0, v1);      /* v2 = min(v0, v1) */
+         break;
+      case TGSI_OPCODE_MAX:
+         ppc_vmaxfp(gen->f, v2, v0, v1);      /* v2 = max(v0, v1) */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+
+/**
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
+ */
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   boolean complement = FALSE;   /* TRUE: result is the inverted compare mask */
+   int one_vec = gen_one_vec(gen);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SNE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SEQ:
+         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SGE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SLT:
+         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SLE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SGT:
+         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+         break;
+      default:
+         assert(0);
+      }
+
+      /* each word of v2 is now either 0 or ~0 */
+
+      if (complement)
+         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
+      else
+         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
+
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);   /* running sum */
+   uint chan_index;
+
+   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
+
+   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
+   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
+   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
+   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
+      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+   }
+   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {   /* only src1.w added */
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v3 = ppc_allocate_vec_register(gen->f);   /* result */
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:
+         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_LRP:   /* result = v0 * (v1 - v2) + v2 */
+         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
+         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v3);
+}
+
+
+
+/** Approximation for vr = pow(va, vb), built from log2 and exp2 estimates */
+static void
+ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
+{
+   /* pow(a,b) ~= exp2(log2(a) * b) */
+   int t_vec = ppc_allocate_vec_register(f);
+   int zero_vec = ppc_allocate_vec_register(f);
+
+   ppc_vzero(f, zero_vec);                      /* addend for the multiply-add */
+
+   ppc_vlogefp(f, t_vec, va);                   /* t = log2(va) */
+   ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec);  /* t = t * vb + 0 */
+   ppc_vexptefp(f, vr, t_vec);                  /* vr = 2^t */
+
+   ppc_release_vec_register(f, t_vec);
+   ppc_release_vec_register(f, zero_vec);
+}
+
+
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int one_vec = gen_one_vec(gen);
+
+   /* Compute X */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_X);
+   }
+
+   /* Compute Y, Z */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int x_vec = ppc_allocate_vec_register(gen->f);
+      int zero_vec = ppc_allocate_vec_register(gen->f);
+
+      FETCH(gen, *inst, x_vec, 0, CHAN_X);        /* x_vec = src[0].x */
+
+      ppc_vzero(gen->f, zero_vec);                /* zero = {0,0,0,0} */
+      ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+         STORE(gen, *inst, x_vec, 0, CHAN_Y);        /* store Y */
+      }
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+         int y_vec = ppc_allocate_vec_register(gen->f);
+         int z_vec = ppc_allocate_vec_register(gen->f);
+         int w_vec = ppc_allocate_vec_register(gen->f);
+         int pow_vec = ppc_allocate_vec_register(gen->f);
+         int pos_vec = ppc_allocate_vec_register(gen->f);
+         int p128_vec = ppc_allocate_vec_register(gen->f);
+         int n128_vec = ppc_allocate_vec_register(gen->f);
+
+         FETCH(gen, *inst, y_vec, 0, CHAN_Y);        /* y_vec = src[0].y */
+         ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
+
+         FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
+
+         /* clamp W (the exponent) to [-128, 128] */
+         load_constant_vec(gen, p128_vec, 128.0f);
+         load_constant_vec(gen, n128_vec, -128.0f);
+         ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */
+         ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */
+
+         /* if temp.x > 0
+          *    z = pow(tmp.y, tmp.w)
+          * else
+          *    z = 0.0
+          */
+         ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec);      /* pow = pow(y, w) */
+         ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
+         ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
+
+         STORE(gen, *inst, z_vec, 0, CHAN_Z);             /* store Z */
+
+         ppc_release_vec_register(gen->f, y_vec);
+         ppc_release_vec_register(gen->f, z_vec);
+         ppc_release_vec_register(gen->f, w_vec);
+         ppc_release_vec_register(gen->f, pow_vec);
+         ppc_release_vec_register(gen->f, pos_vec);
+         ppc_release_vec_register(gen->f, p128_vec);
+         ppc_release_vec_register(gen->f, n128_vec);
+      }
+
+      ppc_release_vec_register(gen->f, x_vec);
+      ppc_release_vec_register(gen->f, zero_vec);
+   }
+
+   /* Compute W */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_W);
+   }
+}
+
+
+static int
+emit_instruction(struct gen_context *gen,
+                 struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {   /* dispatch on opcode family */
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_FLOOR:
+   case TGSI_OPCODE_FRAC:
+   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_LOGBASE2:
+      emit_unaryop(gen, inst);
+      break;
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_RCP:
+      emit_scalar_unaryop(gen, inst);
+      break;
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      emit_binop(gen, inst);
+      break;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SGE:
+      emit_inequality(gen, inst);
+      break;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_LRP:
+      emit_triop(gen, inst);
+      break;
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+      emit_dotprod(gen, inst);
+      break;
+   case TGSI_OPCODE_LIT:
+      emit_lit(gen, inst);
+      break;
+   case TGSI_OPCODE_END:
+      /* normal end */
+      return 1;
+   default:
+      return 0;   /* opcode has no emitter here */
+   }
+
+   
+   return 1;   /* an emitter was invoked for this opcode */
+}
+
+static void
+emit_declaration(
+   struct ppc_function *func,
+   struct tgsi_full_declaration *decl )
+{
+   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+#if 0 /* disabled: the input interpolation setup below is compiled out, so this function currently emits no code */
+      unsigned first, last, mask;
+      unsigned i, j;
+
+      first = decl->DeclarationRange.First;
+      last = decl->DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      for( i = first; i <= last; i++ ) {
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( mask & (1 << j) ) {
+               switch( decl->Declaration.Interpolate ) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  emit_coef_a0( func, 0, i, j );
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_LINEAR:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_coef_a0( func, 4, i, j );
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_PERSPECTIVE:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+                  emit_coef_a0( func, 5, i, j );
+                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               default:
+                  assert( 0 );
+		  break;
+               }
+            }
+         }
+      }
+#endif /* 0 */
+   }
+}
+
+
+/** Function prologue hook; currently emits no code (stack frame setup is still to be written). */
+static void
+emit_prologue(struct ppc_function *func)
+{
+   /* XXX set up stack frame */
+}
+
+/** Function epilogue: calls ppc_return(func); no previous stack frame is restored yet. */
+static void
+emit_epilogue(struct ppc_function *func)
+{
+   ppc_return(func);
+   /* XXX restore prev stack frame */
+}
+
+
+
+/**
+ * Translate a TGSI vertex/fragment shader to PPC code.
+ * \param tokens  the TGSI input shader
+ * \param func  the output PPC code/function
+ * \param immediates  buffer to place immediates, later passed to PPC func
+ * \param do_swizzles  not referenced anywhere in this function's body
+ * \return TRUE for success, FALSE if translation failed or the GALLIUM_NOPPC option is enabled
+ */
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *func,
+              float (*immediates)[4],
+              boolean do_swizzles )
+{
+   static int use_ppc_asm = -1;   /* -1: GALLIUM_NOPPC has not been queried yet */
+   struct tgsi_parse_context parse;
+   /*boolean instruction_phase = FALSE;*/
+   unsigned ok = 1;
+   uint num_immediates = 0;
+   struct gen_context gen;
+
+   if (use_ppc_asm < 0) {
+      /* If GALLIUM_NOPPC is set, don't use PPC codegen */
+      use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE);
+   }
+   if (!use_ppc_asm)
+      return FALSE;
+
+   util_init_math();
+
+   gen.f = func;
+   gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
+   gen.outputs_reg = ppc_reserve_register(func, 4);  /* second function param */
+   gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
+   gen.immed_reg = ppc_reserve_register(func, 6);
+   gen.const_reg = ppc_reserve_register(func, 7);
+   gen.builtins_reg = ppc_reserve_register(func, 8);
+   gen.one_vec = -1;
+   gen.bit31_vec = -1;
+
+   emit_prologue(func);
+
+   tgsi_parse_init( &parse, tokens );
+
+   while (!tgsi_parse_end_of_tokens(&parse) && ok) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            emit_declaration(func, &parse.FullToken.FullDeclaration );
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
+
+	 if (!ok) {
+	    debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
+	 }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* splat each immediate component into a float[4] vector for SoA */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            float *imm = (float *) immediates;
+            uint i;
+            assert(size <= 4);
+            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES); /* NOTE(review): checked once per token, but the loop below advances num_immediates once per component - confirm the bound */
+            for (i = 0; i < size; i++) {
+               const float value =
+                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+               imm[num_immediates * 4 + 0] = 
+               imm[num_immediates * 4 + 1] = 
+               imm[num_immediates * 4 + 2] = 
+               imm[num_immediates * 4 + 3] = value;
+               num_immediates++;
+            }
+         }
+         break;
+
+      default:
+	 ok = 0;
+         assert( 0 );
+      }
+   }
+
+   emit_epilogue(func);
+
+   tgsi_parse_free( &parse );
+
+   return ok;
+}
+
+#endif /* PIPE_ARCH_PPC */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
new file mode 100644
index 0000000000..829ec075e7
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
@@ -0,0 +1,51 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_PPC_H
+#define TGSI_PPC_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+/* Forward declarations: only pointers to these types appear below */
+struct tgsi_token;
+struct ppc_function;
+/* Table of float constants; its definition and length are not given in this header */
+extern const float ppc_builtin_constants[];
+/* NOTE(review): 'boolean' is not declared by this header - includers must provide it first */
+/** Translate TGSI tokens into PPC code emitted into 'function'; see the definition in tgsi_ppc.c */
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *function,
+              float (*immediates)[4],
+              boolean do_swizzles);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_PPC_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4681b29f52..f79170b9d6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,9 +25,14 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/u_sse.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
@@ -35,8 +40,6 @@
 
 #include "rtasm/rtasm_x86sse.h"
 
-#ifdef PIPE_ARCH_X86
-
 /* for 1/sqrt()
  *
  * This costs about 100fps (close to 10%) in gears:
@@ -480,10 +483,31 @@ emit_coef_dady(
  * Function call helpers.
  */
 
+/**
+ * Emit a call to code(&TEMP_R0): xmm_dst is spilled to TEMP_R0 first, and xmm
+ * registers [0, xmm_save) other than xmm_dst are saved around the call. With gcc,
+ * SSE-intrinsic callees need force_align_arg_pointer (stack may not be 16-byte aligned).
+ */
 static void
-emit_push_gp(
-   struct x86_function *func )
+emit_func_call_dst(
+   struct x86_function *func,
+   unsigned xmm_save,
+   unsigned xmm_dst,
+   void (PIPE_CDECL *code)() )
 {
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   unsigned i, n;
+   unsigned xmm_mask;
+   
+   /* Bitmask of the xmm registers to save */
+   xmm_mask = (1 << xmm_save) - 1;
+   xmm_mask &= ~(1 << xmm_dst);
+
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 0 ),
+      make_xmm( xmm_dst ) );
+
    x86_push(
       func,
       x86_make_reg( file_REG32, reg_AX) );
@@ -493,12 +517,49 @@ emit_push_gp(
    x86_push(
       func,
       x86_make_reg( file_REG32, reg_DX) );
-}
+   /* Count the selected xmm registers so the stack save area can be sized */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i))
+         ++n;
+   
+   x86_sub_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+   /* Spill register i (the loop index, not a separate variable) to slot n */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+            make_xmm( i ) );
+         ++n;
+      }
+   /* Push the address of TEMP_R0 as the callee's single argument */
+   x86_lea(
+      func,
+      ecx,
+      get_temp( TEMP_R0, 0 ) );
+   
+   x86_push( func, ecx );
+   x86_mov_reg_imm( func, ecx, (unsigned long) code );
+   x86_call( func, ecx );
+   x86_pop(func, ecx );
+   /* Reload the same registers from the same slots, in the same order */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            make_xmm( i ),
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+         ++n;
+      }
+   /* Release the xmm save area (n now equals the number of saved registers) */
+   x86_add_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
 
-static void
-x86_pop_gp(
-   struct x86_function *func )
-{
    /* Restore GP registers in a reverse order.
     */
    x86_pop(
@@ -510,39 +571,6 @@ x86_pop_gp(
    x86_pop(
       func,
       x86_make_reg( file_REG32, reg_AX) );
-}
-
-static void
-emit_func_call_dst(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   void (PIPE_CDECL *code)() )
-{
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
-
-   emit_push_gp(
-      func );
-
-   {
-      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-      x86_lea(
-         func,
-         ecx,
-         get_temp( TEMP_R0, 0 ) );
-
-      x86_push( func, ecx );
-      x86_mov_reg_imm( func, ecx, (unsigned long) code );
-      x86_call( func, ecx );
-      x86_pop(func, ecx ); 
-   }
-
-
-   x86_pop_gp(
-      func );
 
    sse_movaps(
       func,
@@ -553,6 +581,7 @@ emit_func_call_dst(
 static void
 emit_func_call_dst_src(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst,
    unsigned xmm_src,
    void (PIPE_CDECL *code)() )
@@ -564,10 +593,111 @@ emit_func_call_dst_src(
 
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       code );
 }
 
+/*
+ * Fast SSE2 implementation of special math functions.
+ */
+/* POLYn(x, c0..cn): Horner-form evaluation of c0 + c1*x + ... + cn*x^n on packed floats */
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+/* Select which coefficient set exp2f4() / log2f4() compile in */
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+/**
+ * Approximate 2**x for four packed floats. See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+exp2f4(__m128 x)
+{
+   __m128i ipart;
+   __m128 fpart, expipart, expfpart;
+   /* clamp x to [-126.99999, 129] before splitting it */
+   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+   /* ipart = int(x - 0.5) */
+   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+   /* fpart = x - ipart */
+   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+   /* expipart = 2**ipart, built by shifting (ipart + 127) into the float exponent field */
+   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+   /* 2**x = 2**ipart * 2**fpart */
+   return _mm_mul_ps(expipart, expfpart);
+}
+
+/**
+ * Approximate log2(x) for four packed floats. See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+log2f4(__m128 x)
+{
+   __m128i expmask = _mm_set1_epi32(0x7f800000);
+   __m128i mantmask = _mm_set1_epi32(0x007fffff);
+   __m128 one = _mm_set1_ps(1.0f);
+   /* Work on the raw bits of x; neither mask above includes the sign bit */
+   __m128i i = _mm_castps_si128(x);
+
+   /* exp = (float) unbiased exponent(x) */
+   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+   /* mant = mantissa bits of x OR'ed with the bit pattern of 1.0f */
+   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+   __m128 logmant;
+
+   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
+    * These coefficients can be generated with 
+    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+    */
+#if LOG_POLY_DEGREE == 6
+   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
+   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+   return _mm_add_ps(logmant, exp);
+}
+/** Per-lane x**y, computed as exp2f4(log2f4(x) * y) */
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+   return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+
 /**
  * Low-level instruction translators.
  */
@@ -610,38 +740,35 @@ cos4f(
 static void
 emit_cos(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save, 
       xmm_dst,
       cos4f );
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
 ex24f(
    float *store )
 {
-#if FAST_MATH
-   store[0] = util_fast_exp2( store[0] );
-   store[1] = util_fast_exp2( store[1] );
-   store[2] = util_fast_exp2( store[2] );
-   store[3] = util_fast_exp2( store[3] );
-#else
-   store[0] = powf( 2.0f, store[0] );
-   store[1] = powf( 2.0f, store[1] );
-   store[2] = powf( 2.0f, store[2] );
-   store[3] = powf( 2.0f, store[3] );
-#endif
+   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
 }
 
 static void
 emit_ex2(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       ex24f );
 }
@@ -670,10 +797,12 @@ flr4f(
 static void
 emit_flr(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       flr4f );
 }
@@ -691,31 +820,35 @@ frc4f(
 static void
 emit_frc(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       frc4f );
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
 lg24f(
    float *store )
 {
-   store[0] = util_fast_log2( store[0] );
-   store[1] = util_fast_log2( store[1] );
-   store[2] = util_fast_log2( store[2] );
-   store[3] = util_fast_log2( store[3] );
+   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
 }
 
 static void
 emit_lg2(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       lg24f );
 }
@@ -757,14 +890,14 @@ emit_neg(
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
 pow4f(
    float *store )
 {
-#if FAST_MATH
-   store[0] = util_fast_pow( store[0], store[4] );
-   store[1] = util_fast_pow( store[1], store[5] );
-   store[2] = util_fast_pow( store[2], store[6] );
-   store[3] = util_fast_pow( store[3], store[7] );
+#if 1
+   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
 #else
    store[0] = powf( store[0], store[4] );
    store[1] = powf( store[1], store[5] );
@@ -776,11 +909,13 @@ pow4f(
 static void
 emit_pow(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst,
    unsigned xmm_src )
 {
    emit_func_call_dst_src(
       func,
+      xmm_save,
       xmm_dst,
       xmm_src,
       pow4f );
@@ -873,10 +1008,12 @@ sin4f(
 
 static void
 emit_sin (struct x86_function *func,
+          unsigned xmm_save, 
           unsigned xmm_dst)
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       sin4f );
 }
@@ -1296,7 +1433,7 @@ emit_instruction(
                get_temp(
                   TGSI_EXEC_TEMP_MINUS_128_I,
                   TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 1, 2 );
+            emit_pow( func, 3, 1, 2 );
             FETCH( func, *inst, 0, 0, CHAN_X );
             sse_xorps(
                func,
@@ -1342,11 +1479,11 @@ emit_instruction(
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
             emit_MOV( func, 1, 0 );
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
             /* dst.x = ex2(floor(src.x)) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                emit_MOV( func, 2, 1 );
-               emit_ex2( func, 2 );
+               emit_ex2( func, 3, 2 );
                STORE( func, *inst, 2, 0, CHAN_X );
             }
             /* dst.y = src.x - floor(src.x) */
@@ -1358,7 +1495,7 @@ emit_instruction(
          }
          /* dst.z = ex2(src.x) */
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 0 );
+            emit_ex2( func, 3, 0 );
             STORE( func, *inst, 0, 0, CHAN_Z );
          }
       }
@@ -1376,21 +1513,21 @@ emit_instruction(
          FETCH( func, *inst, 0, 0, CHAN_X );
          emit_abs( func, 0 );
          emit_MOV( func, 1, 0 );
-         emit_lg2( func, 1 );
+         emit_lg2( func, 2, 1 );
          /* dst.z = lg2(abs(src.x)) */
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
             STORE( func, *inst, 1, 0, CHAN_Z );
          }
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
             /* dst.x = floor(lg2(abs(src.x))) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                STORE( func, *inst, 1, 0, CHAN_X );
             }
             /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 1 );
+               emit_ex2( func, 2, 1 );
                emit_rcp( func, 1, 1 );
                emit_mul( func, 0, 1 );
                STORE( func, *inst, 0, 0, CHAN_Y );
@@ -1580,7 +1717,7 @@ emit_instruction(
    /* TGSI_OPCODE_FRC */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0 );
+         emit_frc( func, 0, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
       break;
@@ -1593,7 +1730,7 @@ emit_instruction(
    /* TGSI_OPCODE_FLR */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0 );
+         emit_flr( func, 0, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
       break;
@@ -1605,7 +1742,7 @@ emit_instruction(
    case TGSI_OPCODE_EXPBASE2:
    /* TGSI_OPCODE_EX2 */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0 );
+      emit_ex2( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1614,7 +1751,7 @@ emit_instruction(
    case TGSI_OPCODE_LOGBASE2:
    /* TGSI_OPCODE_LG2 */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0 );
+      emit_lg2( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1624,7 +1761,7 @@ emit_instruction(
    /* TGSI_OPCODE_POW */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 1 );
+      emit_pow( func, 0, 0, 1 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1715,7 +1852,7 @@ emit_instruction(
 
    case TGSI_OPCODE_COS:
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0 );
+      emit_cos( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1774,7 +1911,7 @@ emit_instruction(
 
    case TGSI_OPCODE_SIN:
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0 );
+      emit_sin( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1868,12 +2005,12 @@ emit_instruction(
    case TGSI_OPCODE_SCS:
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
          FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0 );
+         emit_cos( func, 0, 0 );
          STORE( func, *inst, 0, 0, CHAN_X );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
          FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0 );
+         emit_sin( func, 0, 0 );
          STORE( func, *inst, 0, 0, CHAN_Y );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {