From 70f4ad44985e3ec6dabc1b0e55a5bf85803a4cd4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:07:35 -0600
Subject: gallium: TGSI to PPC code generation

Based on the TGSIto SSE2 code generator.
Incomplete and lots of SSE stuff still hanging around but the basic dozen
or so TGSI opcodes are functioning.
---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 2781 +++++++++++++++++++++++++++++++++
 1 file changed, 2781 insertions(+)
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_ppc.c

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
new file mode 100644
index 0000000000..112e736523
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -0,0 +1,2781 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI to PowerPC code generation.
+ */
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_debug.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_sse.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi_exec.h"
+#include "tgsi_ppc.h"
+#include "rtasm/rtasm_ppc.h"
+
+
+/* for 1/sqrt()
+ *
+ * This costs about 100fps (close to 10%) in gears:
+ */
+#define HIGH_PRECISION 1
+
+#define FAST_MATH 1
+
+
+#define FOR_EACH_CHANNEL( CHAN )\
+   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
+
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
+#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C
+
+#define TEMP_R0   TGSI_EXEC_TEMP_R0
+#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+
+
+/**
+ * Context/state used during code gen.
+ */
+struct gen_context
+{
+   struct ppc_function *f;
+   int inputs_reg;    /**< register pointing to input params */
+   int outputs_reg;   /**< register pointing to output params */
+   int temps_reg;     /**< register pointing to temporary "registers" */
+   int immed_reg;     /**< register pointing to immediates buffer */
+   int const_reg;     /**< register pointing to constants buffer */
+};
+
+
+
+#if 0000
+
+/**
+ * X86 utility functions.
+ */
+
+static struct x86_reg
+make_xmm(
+   unsigned xmm )
+{
+   return x86_make_reg(
+      file_XMM,
+      (enum x86_reg_name) xmm );
+}
+
+/**
+ * X86 register mapping helpers.
+ */
+
+static struct x86_reg
+get_const_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_CX );
+}
+
+static struct x86_reg
+get_input_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_AX );
+}
+
+static struct x86_reg
+get_output_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DX );
+}
+
+static struct x86_reg
+get_temp_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_BX );
+}
+
+static struct x86_reg
+get_coef_base( void )
+{
+   return get_output_base();
+}
+
+static struct x86_reg
+get_immediate_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DI );
+}
+
+
+/**
+ * Data access helpers.
+ */
+
+
+static struct x86_reg
+get_immediate(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_immediate_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
+get_const(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_const_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
+get_input(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_input_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_output(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_output_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_temp(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_temp_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_coef(
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   return x86_make_disp(
+      get_coef_base(),
+      ((vec * 3 + member) * 4 + chan) * 4 );
+}
+
+
+static void
+emit_ret(
+   struct x86_function  *func )
+{
+   x86_ret( func );
+}
+
+#endif
+
+/**
+ * Data fetch helpers.
+ */
+
+#if 00
+/**
+ * Copy a shader constant to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src const buffer index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_const(
+   struct x86_function *func,
+   uint xmm,
+   int vec,
+   uint chan,
+   uint indirect,
+   uint indirectFile,
+   int indirectIndex )
+{
+   if (indirect) {
+      struct x86_reg r0 = get_input_base();
+      struct x86_reg r1 = get_output_base();
+      uint i;
+
+      assert( indirectFile == TGSI_FILE_ADDRESS );
+      assert( indirectIndex == 0 );
+
+      x86_push( func, r0 );
+      x86_push( func, r1 );
+
+      for (i = 0; i < QUAD_SIZE; i++) {
+         x86_lea( func, r0, get_const( vec, chan ) );
+         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
+
+         /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
+          */
+         x86_add( func, r1, r1 );
+         x86_add( func, r1, r1 );
+         x86_add( func, r1, r1 );
+         x86_add( func, r1, r1 );
+
+         x86_add( func, r0, r1 );
+         x86_mov( func, r1, x86_deref( r0 ) );
+         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
+      }
+
+      x86_pop( func, r1 );
+      x86_pop( func, r0 );
+
+      sse_movaps(
+         func,
+         make_xmm( xmm ),
+         get_temp( TEMP_R0, CHAN_X ) );
+   }
+   else {
+      assert( vec >= 0 );
+
+      sse_movss(
+         func,
+         make_xmm( xmm ),
+         get_const( vec, chan ) );
+      sse_shufps(
+         func,
+         make_xmm( xmm ),
+         make_xmm( xmm ),
+         SHUF( 0, 0, 0, 0 ) );
+   }
+}
+
+static void
+emit_immediate(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_immediate( vec, chan ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+
+/**
+ * Copy a shader input to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src input attrib
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_inputf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      make_xmm( xmm ),
+      get_input( vec, chan ) );
+}
+
+/**
+ * Store an xmm register to a shader output
+ * \param xmm  the source xmm register
+ * \param vec  the dest output attrib
+ * \param chan  src dest channel to store (X, Y, Z or W)
+ */
+static void
+emit_output(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      get_output( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+/**
+ * Copy a shader temporary to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src temp register
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_tempf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movaps(
+      func,
+      make_xmm( xmm ),
+      get_temp( vec, chan ) );
+}
+
+/**
+ * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
+ * \param xmm  the destination xmm register
+ * \param vec  the src input/attribute coefficient index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ * \param member  0=a0, 1=dadx, 2=dady
+ */
+static void
+emit_coef(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_coef( vec, chan, member ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+/**
+ * Data store helpers.
+ */
+
+static void
+emit_inputs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      get_input( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_temps(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movaps(
+      func,
+      get_temp( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_addrs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   assert( vec == 0 );
+
+   emit_temps(
+      func,
+      xmm,
+      vec + TGSI_EXEC_TEMP_ADDR,
+      chan );
+}
+
+/**
+ * Coefficent fetch helpers.
+ */
+
+static void
+emit_coef_a0(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      0 );
+}
+
+static void
+emit_coef_dadx(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      1 );
+}
+
+static void
+emit_coef_dady(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      2 );
+}
+#endif
+
+
+/**
+ * Function call helpers.
+ */
+
+#if 00
+/**
+ * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be 
+ * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
+ * that the stack pointer is 16 byte aligned, as expected.
+ */
+static void
+emit_func_call_dst(
+   struct x86_function *func,
+   unsigned xmm_save,
+   unsigned xmm_dst,
+   void (PIPE_CDECL *code)() )
+{
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   unsigned i, n, xmm;
+   unsigned xmm_mask;
+   
+   /* Bitmask of the xmm registers to save */
+   xmm_mask = (1 << xmm_save) - 1;
+   xmm_mask &= ~(1 << xmm_dst);
+
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 0 ),
+      make_xmm( xmm_dst ) );
+
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_AX) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_CX) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_DX) );
+   
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i))
+         ++n;
+   
+   x86_sub_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+            make_xmm( xmm ) );
+         ++n;
+      }
+   
+   x86_lea(
+      func,
+      ecx,
+      get_temp( TEMP_R0, 0 ) );
+   
+   x86_push( func, ecx );
+   x86_mov_reg_imm( func, ecx, (unsigned long) code );
+   x86_call( func, ecx );
+   x86_pop(func, ecx );
+   
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            make_xmm( xmm ),
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+         ++n;
+      }
+   
+   x86_add_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+
+   /* Restore GP registers in a reverse order.
+    */
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_DX) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_CX) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_AX) );
+
+   sse_movaps(
+      func,
+      make_xmm( xmm_dst ),
+      get_temp( TEMP_R0, 0 ) );
+}
+
+static void
+emit_func_call_dst_src(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst,
+   unsigned xmm_src,
+   void (PIPE_CDECL *code)() )
+{
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 1 ),
+      make_xmm( xmm_src ) );
+
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      code );
+}
+
+/*
+ * Fast SSE2 implementation of special math functions.
+ */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+exp2f4(__m128 x)
+{
+   __m128i ipart;
+   __m128 fpart, expipart, expfpart;
+
+   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+   /* ipart = int(x - 0.5) */
+   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+   /* fpart = x - ipart */
+   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+   /* expipart = (float) (1 << ipart) */
+   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+   return _mm_mul_ps(expipart, expfpart);
+}
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+log2f4(__m128 x)
+{
+   __m128i expmask = _mm_set1_epi32(0x7f800000);
+   __m128i mantmask = _mm_set1_epi32(0x007fffff);
+   __m128 one = _mm_set1_ps(1.0f);
+
+   __m128i i = _mm_castps_si128(x);
+
+   /* exp = (float) exponent(x) */
+   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+   /* mant = (float) mantissa(x) */
+   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+   __m128 logmant;
+
+   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
+    * These coefficients can be generate with 
+    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+    */
+#if LOG_POLY_DEGREE == 6
+   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+   return _mm_add_ps(logmant, exp);
+}
+
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+   return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+
+/**
+ * Low-level instruction translators.
+ */
+
+static void
+emit_abs(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse_andps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_7FFFFFFF_I,
+         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
+}
+
+static void
+emit_add(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   sse_addps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void PIPE_CDECL
+cos4f(
+   float *store )
+{
+   store[0] = cosf( store[0] );
+   store[1] = cosf( store[1] );
+   store[2] = cosf( store[2] );
+   store[3] = cosf( store[3] );
+}
+
+static void
+emit_cos(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save, 
+      xmm_dst,
+      cos4f );
+}
+
+static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
+ex24f(
+   float *store )
+{
+   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+}
+
+static void
+emit_ex2(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      ex24f );
+}
+
+static void
+emit_f2it(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse2_cvttps2dq(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ) );
+}
+
+static void PIPE_CDECL
+flr4f(
+   float *store )
+{
+   store[0] = floorf( store[0] );
+   store[1] = floorf( store[1] );
+   store[2] = floorf( store[2] );
+   store[3] = floorf( store[3] );
+}
+
+static void
+emit_flr(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      flr4f );
+}
+
+static void PIPE_CDECL
+frc4f(
+   float *store )
+{
+   store[0] -= floorf( store[0] );
+   store[1] -= floorf( store[1] );
+   store[2] -= floorf( store[2] );
+   store[3] -= floorf( store[3] );
+}
+
+static void
+emit_frc(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      frc4f );
+}
+
+static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
+lg24f(
+   float *store )
+{
+   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+}
+
+static void
+emit_lg2(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      lg24f );
+}
+
+static void
+emit_MOV(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   sse_movups(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_mul (struct x86_function *func,
+          unsigned xmm_dst,
+          unsigned xmm_src)
+{
+   sse_mulps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_neg(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse_xorps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_80000000_I,
+         TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
+pow4f(
+   float *store )
+{
+#if 1
+   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
+#else
+   store[0] = powf( store[0], store[4] );
+   store[1] = powf( store[1], store[5] );
+   store[2] = powf( store[2], store[6] );
+   store[3] = powf( store[3], store[7] );
+#endif
+}
+
+static void
+emit_pow(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_func_call_dst_src(
+      func,
+      xmm_save,
+      xmm_dst,
+      xmm_src,
+      pow4f );
+}
+
+static void
+emit_rcp (
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.  Need to either emit a proper divide or use the
+    * iterative technique described below in emit_rsqrt().
+    */
+   sse2_rcpps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_rsqrt(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+#if HIGH_PRECISION
+   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
+    * implementations, it is possible to improve its precision at
+    * fairly low cost, using a newton/raphson step, as below:
+    * 
+    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+    *
+    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+    */
+   {
+      struct x86_reg dst = make_xmm( xmm_dst );
+      struct x86_reg src = make_xmm( xmm_src );
+      struct x86_reg tmp0 = make_xmm( 2 );
+      struct x86_reg tmp1 = make_xmm( 3 );
+
+      assert( xmm_dst != xmm_src );
+      assert( xmm_dst != 2 && xmm_dst != 3 );
+      assert( xmm_src != 2 && xmm_src != 3 );
+
+      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
+      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
+      sse_rsqrtps( func, tmp1, src  );
+      sse_mulps(   func, src,  tmp1 );
+      sse_mulps(   func, dst,  tmp1 );
+      sse_mulps(   func, src,  tmp1 );
+      sse_subps(   func, tmp0, src  );
+      sse_mulps(   func, dst,  tmp0 );
+   }
+#else
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.
+    */
+   sse_rsqrtps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+#endif
+}
+
+static void
+emit_setsign(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse_orps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_80000000_I,
+         TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+static void PIPE_CDECL
+sin4f(
+   float *store )
+{
+   store[0] = sinf( store[0] );
+   store[1] = sinf( store[1] );
+   store[2] = sinf( store[2] );
+   store[3] = sinf( store[3] );
+}
+
+static void
+emit_sin (struct x86_function *func,
+          unsigned xmm_save, 
+          unsigned xmm_dst)
+{
+   emit_func_call_dst(
+      func,
+      xmm_save,
+      xmm_dst,
+      sin4f );
+}
+
+static void
+emit_sub(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   sse_subps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+#endif
+
+
+/**
+ * Register fetch.
+ */
+static void
+emit_fetch(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_src_register *reg,
+           const unsigned chan_index)
+{
+   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+            ppc_li(gen->f, offset_reg, offset);
+            /* load vector word */
+            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
+            /* splat word[0] across vector */
+            ppc_vspltw(gen->f, vec_reg, vec_reg, 0);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+#if 0
+      emit_tempf(
+         func,
+         xmm,
+         TGSI_EXEC_TEMP_00000000_I,
+         TGSI_EXEC_TEMP_00000000_C );
+#endif
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+#if 0
+      emit_tempf(
+         func,
+         xmm,
+         TEMP_ONE_I,
+         TEMP_ONE_C );
+#endif
+      break;
+
+   default:
+      assert( 0 );
+   }
+
+#if 0
+   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      emit_abs( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      emit_setsign( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      emit_neg( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+#endif
+}
+
+#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
+   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+
+
+
+/**
+ * Register store.
+ */
+static void
+emit_store(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           unsigned chan_index)
+{
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+   case TGSI_FILE_TEMPORARY:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+#if 0
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+#endif
+   default:
+      assert( 0 );
+   }
+
+#if 0
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+#endif
+}
+
+
+#define STORE( GEN, INST, XMM, INDEX, CHAN )\
+   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+
+
+
+#if 000
+/**
+ * High-level instruction translators.
+ */
+
+static void
+emit_kil(
+   struct x86_function *func,
+   const struct tgsi_full_src_register *reg )
+{
+   unsigned uniquemask;
+   unsigned registers[4];
+   unsigned nextregister = 0;
+   unsigned firstchan = ~0;
+   unsigned chan_index;
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      unsigned swizzle;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle(
+         reg,
+         chan_index );
+
+      /* check if the component has not been already tested */
+      if( !(uniquemask & (1 << swizzle)) ) {
+         uniquemask |= 1 << swizzle;
+
+         /* allocate register */
+         registers[chan_index] = nextregister;
+         emit_fetch(
+            func,
+            nextregister,
+            reg,
+            chan_index );
+         nextregister++;
+
+         /* mark the first channel used */
+         if( firstchan == ~0 ) {
+            firstchan = chan_index;
+         }
+      }
+   }
+
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      if( uniquemask & (1 << chan_index) ) {
+         sse_cmpps(
+            func,
+            make_xmm( registers[chan_index] ),
+            get_temp(
+               TGSI_EXEC_TEMP_00000000_I,
+               TGSI_EXEC_TEMP_00000000_C ),
+            cc_LessThan );
+
+         if( chan_index == firstchan ) {
+            sse_pmovmskb(
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               make_xmm( registers[chan_index] ) );
+         }
+         else {
+            sse_pmovmskb(
+               func,
+               x86_make_reg( file_REG32, reg_DX ),
+               make_xmm( registers[chan_index] ) );
+            x86_or(
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               x86_make_reg( file_REG32, reg_DX ) );
+         }
+      }
+   }
+
+   x86_or(
+      func,
+      get_temp(
+         TGSI_EXEC_TEMP_KILMASK_I,
+         TGSI_EXEC_TEMP_KILMASK_C ),
+      x86_make_reg( file_REG32, reg_AX ) );
+
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+}
+
+
+static void
+emit_kilp(
+   struct x86_function *func )
+{
+   /* XXX todo / fix me */
+}
+
+
+static void
+emit_setcc(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst,
+   enum sse_cc cc )
+{
+   unsigned chan_index;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      FETCH( func, *inst, 0, 0, chan_index );
+      FETCH( func, *inst, 1, 1, chan_index );
+      sse_cmpps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 1 ),
+         cc );
+      sse_andps(
+         func,
+         make_xmm( 0 ),
+         get_temp(
+            TEMP_ONE_I,
+            TEMP_ONE_C ) );
+      STORE( func, *inst, 0, 0, chan_index );
+   }
+}
+
+static void
+emit_cmp(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst )
+{
+   unsigned chan_index;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      FETCH( func, *inst, 0, 0, chan_index );
+      FETCH( func, *inst, 1, 1, chan_index );
+      FETCH( func, *inst, 2, 2, chan_index );
+      sse_cmpps(
+         func,
+         make_xmm( 0 ),
+         get_temp(
+            TGSI_EXEC_TEMP_00000000_I,
+            TGSI_EXEC_TEMP_00000000_C ),
+         cc_LessThan );
+      sse_andps(
+         func,
+         make_xmm( 1 ),
+         make_xmm( 0 ) );
+      sse_andnps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 2 ) );
+      sse_orps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 1 ) );
+      STORE( func, *inst, 0, 0, chan_index );
+   }
+}
+#endif
+
+
+static void
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, 0, 0, chan_index);   /* v0 = srcreg[0] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ABS:
+         /* turn off the most significant bit of each vector float word */
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
+            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
+            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_FLOOR:
+         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
+         break;
+      case TGSI_OPCODE_FRAC:
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
+            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_EXPBASE2:
+         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
+         break;
+      case TGSI_OPCODE_LOGBASE2:
+         /* XXX this may be broken! */
+         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
+         break;
+      case TGSI_OPCODE_MOV:
+         /* nothing */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+}
+
+
+static void
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         ppc_vaddfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_SUB:
+         ppc_vsubfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MUL:
+         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
+         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */
+         break;
+      case TGSI_OPCODE_MIN:
+         ppc_vminfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MAX:
+         ppc_vmaxfp(gen->f, v2, v0, v1);
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+
+   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
+
+   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
+   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
+   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
+   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
+      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+   }
+   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
+
+
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v3 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:
+         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_LRP:
+         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
+         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v3);
+}
+
+
+static int
+emit_instruction(struct gen_context *gen,
+                 struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_FLOOR:
+   case TGSI_OPCODE_FRAC:
+   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_LOGBASE2:
+      emit_unaryop(gen, inst);
+      break;
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      emit_binop(gen, inst);
+      break;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_LRP:
+      emit_triop(gen, inst);
+      break;
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+      emit_dotprod(gen, inst);
+      break;
+   case TGSI_OPCODE_END:
+      /* normal end */
+      return 1;
+   default:
+      return 0;
+   }
+
+#if 0
+   unsigned chan_index;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_f2it( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         emit_tempf(
+            func,
+            0,
+            TEMP_ONE_I,
+            TEMP_ONE_C);
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+            STORE( func, *inst, 0, 0, CHAN_X );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+            STORE( func, *inst, 0, 0, CHAN_W );
+         }
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            sse_maxps(
+               func,
+               make_xmm( 0 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            STORE( func, *inst, 0, 0, CHAN_Y );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+            /* XMM[1] = SrcReg[0].yyyy */
+            FETCH( func, *inst, 1, 0, CHAN_Y );
+            /* XMM[1] = max(XMM[1], 0) */
+            sse_maxps(
+               func,
+               make_xmm( 1 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            /* XMM[2] = SrcReg[0].wwww */
+            FETCH( func, *inst, 2, 0, CHAN_W );
+            /* XMM[2] = min(XMM[2], 128.0) */
+            sse_minps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_128_I,
+                  TGSI_EXEC_TEMP_128_C ) );
+            /* XMM[2] = max(XMM[2], -128.0) */
+            sse_maxps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_MINUS_128_I,
+                  TGSI_EXEC_TEMP_MINUS_128_C ) );
+            emit_pow( func, 3, 1, 2 );
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            sse_xorps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 2 ) );
+            sse_cmpps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 0 ),
+               cc_LessThanEqual );
+            sse_andps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 1 ) );
+            STORE( func, *inst, 2, 0, CHAN_Z );
+         }
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rcp( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rsqrt( func, 1, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 1, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            emit_MOV( func, 1, 0 );
+            emit_flr( func, 2, 1 );
+            /* dst.x = ex2(floor(src.x)) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
+               emit_MOV( func, 2, 1 );
+               emit_ex2( func, 3, 2 );
+               STORE( func, *inst, 2, 0, CHAN_X );
+            }
+            /* dst.y = src.x - floor(src.x) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+               emit_MOV( func, 2, 0 );
+               emit_sub( func, 2, 1 );
+               STORE( func, *inst, 2, 0, CHAN_Y );
+            }
+         }
+         /* dst.z = ex2(src.x) */
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            emit_ex2( func, 3, 0 );
+            STORE( func, *inst, 0, 0, CHAN_Z );
+         }
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_LOG:
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_abs( func, 0 );
+         emit_MOV( func, 1, 0 );
+         emit_lg2( func, 2, 1 );
+         /* dst.z = lg2(abs(src.x)) */
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            STORE( func, *inst, 1, 0, CHAN_Z );
+         }
+         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            emit_flr( func, 2, 1 );
+            /* dst.x = floor(lg2(abs(src.x))) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
+               STORE( func, *inst, 1, 0, CHAN_X );
+            }
+            /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
+            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+               emit_ex2( func, 2, 1 );
+               emit_rcp( func, 1, 1 );
+               emit_mul( func, 0, 1 );
+               STORE( func, *inst, 0, 0, CHAN_Y );
+            }
+         }
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_add( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP4:
+   /* TGSI_OPCODE_DOT4 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul(func, 1, 2 );
+      emit_add(func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_W );
+      FETCH( func, *inst, 2, 1, CHAN_W );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_tempf(
+            func,
+            0,
+            TEMP_ONE_I,
+            TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 1, 1, CHAN_Y );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         FETCH( func, *inst, 0, 0, CHAN_Z );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+         FETCH( func, *inst, 0, 1, CHAN_W );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         sse_minps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         sse_maxps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      emit_setcc( func, inst, cc_LessThan );
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      emit_setcc( func, inst, cc_NotLessThan );
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_sub( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_sub( func, 1, 2 );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CND0:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+   /* TGSI_OPCODE_DP2A */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_frc( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_flr( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+   /* TGSI_OPCODE_EX2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_ex2( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_lg2( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+   /* TGSI_OPCODE_POW */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_pow( func, 0, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+   /* TGSI_OPCODE_XPD */
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( func, *inst, 1, 1, CHAN_Z );
+         FETCH( func, *inst, 3, 0, CHAN_Z );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 4, 1, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_MOV( func, 2, 0 );
+         emit_mul( func, 2, 1 );
+         emit_MOV( func, 5, 3 );
+         emit_mul( func, 5, 4 );
+         emit_sub( func, 2, 5 );
+         STORE( func, *inst, 2, 0, CHAN_X );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 2, 1, CHAN_X );
+         FETCH( func, *inst, 5, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         emit_mul( func, 3, 2 );
+         emit_mul( func, 1, 5 );
+         emit_sub( func, 3, 1 );
+         STORE( func, *inst, 3, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         emit_mul( func, 5, 4 );
+         emit_mul( func, 0, 2 );
+         emit_sub( func, 5, 0 );
+         STORE( func, *inst, 5, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TEMP_ONE_I,
+	    TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ABS:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_abs( func, 0) ;
+
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RCC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 1, CHAN_W );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_cos( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DDY:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_KILP:
+      /* predicated kill */
+      emit_kilp( func );
+      return 0; /* XXX fix me */
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* conditional kill */
+      emit_kil( func, &inst->FullSrcRegisters[0] );
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SGT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_sin( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SNE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_STR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TEX:
+      if (0) {
+	 /* Disable dummy texture code: 
+	  */
+	 emit_tempf(
+	    func,
+	    0,
+	    TEMP_ONE_I,
+	    TEMP_ONE_C );
+	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+	    STORE( func, *inst, 0, 0, chan_index );
+	 }
+      }
+      else {
+	 return 0;
+      }
+      break;
+
+   case TGSI_OPCODE_TXD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_X2D:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CAL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RET:
+      emit_ret( func );
+      break;
+
+   case TGSI_OPCODE_END:
+      break;
+
+   case TGSI_OPCODE_SSG:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CMP:
+      emit_cmp (func, inst);
+      break;
+
+   case TGSI_OPCODE_SCS:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_cos( func, 0, 0 );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_sin( func, 0, 0 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TGSI_EXEC_TEMP_00000000_I,
+	    TGSI_EXEC_TEMP_00000000_C );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TEMP_ONE_I,
+	    TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_TXB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NRM:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DIV:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DP2:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRK:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_IF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_REP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PUSHA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_POPA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_I2F:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NOT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_AND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_OR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_MOD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_XOR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SAD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CONT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      return 0;
+      break;
+
+   default:
+      return 0;
+   }
+#endif
+   
+   return 1;
+}
+
+static void
+emit_declaration(
+   struct ppc_function *func,
+   struct tgsi_full_declaration *decl )
+{
+   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+#if 0
+      unsigned first, last, mask;
+      unsigned i, j;
+
+      first = decl->DeclarationRange.First;
+      last = decl->DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      for( i = first; i <= last; i++ ) {
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( mask & (1 << j) ) {
+               switch( decl->Declaration.Interpolate ) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  emit_coef_a0( func, 0, i, j );
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_LINEAR:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_coef_a0( func, 4, i, j );
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_PERSPECTIVE:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+                  emit_coef_a0( func, 5, i, j );
+                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               default:
+                  assert( 0 );
+		  break;
+               }
+            }
+         }
+      }
+#endif
+   }
+}
+
+#if 0
+static void aos_to_soa( struct x86_function *func, 
+                        uint arg_aos,
+                        uint arg_soa, 
+                        uint arg_num, 
+                        uint arg_stride )
+{
+   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
+   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
+   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
+   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
+   int inner_loop;
+
+
+   /* Save EBX */
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
+   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
+   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
+   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
+
+   /* do */
+   inner_loop = x86_get_label( func );
+   {
+      x86_push( func, aos_input );
+      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_pop( func, aos_input );
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
+      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
+      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
+      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
+
+      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
+      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
+      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
+      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
+
+      /* Advance to next input */
+      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
+      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
+   }
+   /* while --num_inputs */
+   x86_dec( func, num_inputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, aos_input );
+}
+#endif
+
+#if 0
+static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+   struct x86_reg soa_output;
+   struct x86_reg aos_output;
+   struct x86_reg num_outputs;
+   struct x86_reg temp;
+   int inner_loop;
+
+   soa_output = x86_make_reg( file_REG32, reg_AX );
+   aos_output = x86_make_reg( file_REG32, reg_BX );
+   num_outputs = x86_make_reg( file_REG32, reg_CX );
+   temp = x86_make_reg( file_REG32, reg_DX );
+
+   /* Save EBX */
+   x86_push( func, aos_output );
+
+   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
+   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
+   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+
+   /* do */
+   inner_loop = x86_get_label( func );
+   {
+      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
+      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
+      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
+      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
+      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
+      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
+      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
+
+      x86_mov( func, temp, x86_fn_arg( func, stride ) );
+      x86_push( func, aos_output );
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_pop( func, aos_output );
+
+      /* Advance to next output */
+      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
+      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
+   }
+   /* while --num_outputs */
+   x86_dec( func, num_outputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, aos_output );
+}
+#endif
+
+
+static void
+emit_prologue(struct ppc_function *func)
+{
+   /* XXX set up stack frame */
+}
+
+
+static void
+emit_epilogue(struct ppc_function *func)
+{
+   ppc_return(func);
+   /* XXX restore prev stack frame */
+}
+
+
+
+/**
+ * Translate a TGSI vertex/fragment shader to PPC code.
+ *
+ * \param tokens  the TGSI input shader
+ * \param func  the output PPC code/function
+ * \param immediates  buffer to place immediates, later passed to PPC func
+ * \return TRUE for success, FALSE if translation failed
+ */
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *func,
+              float (*immediates)[4],
+              boolean do_swizzles )
+{
+   struct tgsi_parse_context parse;
+   /*boolean instruction_phase = FALSE;*/
+   unsigned ok = 1;
+   uint num_immediates = 0;
+   struct gen_context gen;
+
+   util_init_math();
+
+   tgsi_parse_init( &parse, tokens );
+
+   gen.f = func;
+   gen.inputs_reg = 3;   /* first function param */
+   gen.outputs_reg = 4;  /* second function param */
+   gen.temps_reg = 5;    /* ... */
+   gen.immed_reg = 6;
+   gen.const_reg = 7;
+
+   emit_prologue(func);
+
+   /*
+    * Different function args for vertex/fragment shaders:
+    */
+#if 0
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /* DECLARATION phase, do not load output argument. */
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      /* skipping outputs argument here */
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_coef_base(),
+         x86_fn_arg( func, 5 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 6 ) );
+   }
+   else {
+      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
+
+      if (do_swizzles)
+         aos_to_soa( func, 
+                     6,         /* aos_input */
+                     1,         /* machine->input */
+                     7,         /* num_inputs */
+                     8 );       /* input_stride */
+
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      x86_mov(
+         func,
+         get_output_base(),
+         x86_fn_arg( func, 2 ) );
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 5 ) );
+   }
+#endif
+
+   while (!tgsi_parse_end_of_tokens(&parse) && ok) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            emit_declaration(func, &parse.FullToken.FullDeclaration );
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+#if 0
+            if( !instruction_phase ) {
+               /* INSTRUCTION phase, overwrite coeff with output. */
+               instruction_phase = TRUE;
+               x86_mov(
+                  func,
+                  get_output_base(),
+                  x86_fn_arg( func, 2 ) );
+            }
+#endif
+         }
+
+         ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
+
+	 if (!ok) {
+	    debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
+	 }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* splat each immediate component into a float[4] vector for SoA */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            float *imm = (float *) immediates;
+            uint i;
+            assert(size <= 4);
+            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
+            for (i = 0; i < size; i++) {
+               const float value =
+                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+               imm[num_immediates * 4 + 0] = 
+               imm[num_immediates * 4 + 1] = 
+               imm[num_immediates * 4 + 2] = 
+               imm[num_immediates * 4 + 3] = value;
+               num_immediates++;
+            }
+         }
+         break;
+
+      default:
+	 ok = 0;
+         assert( 0 );
+      }
+   }
+
+#if 0
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
+      if (do_swizzles)
+         soa_to_aos( func, 9, 2, 10, 11 );
+   }
+#endif
+
+   emit_epilogue(func);
+
+   tgsi_parse_free( &parse );
+
+   return ok;
+}
+
+#endif /* PIPE_ARCH_PPC */
-- 
cgit v1.2.3


From da63edd720fc154820fcbf699e1056ac9357a03f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 13:59:11 -0600
Subject: gallium: fix broken TGSI_FILE_CONSTANT case, use
 ppc_reserver_register()

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 112e736523..dbf215c0d5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1108,10 +1108,15 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
             ppc_li(gen->f, offset_reg, offset);
-            /* load vector word */
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our constants start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
             ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
-            /* splat word[0] across vector */
-            ppc_vspltw(gen->f, vec_reg, vec_reg, 0);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -2635,11 +2640,11 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    tgsi_parse_init( &parse, tokens );
 
    gen.f = func;
-   gen.inputs_reg = 3;   /* first function param */
-   gen.outputs_reg = 4;  /* second function param */
-   gen.temps_reg = 5;    /* ... */
-   gen.immed_reg = 6;
-   gen.const_reg = 7;
+   gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
+   gen.outputs_reg = ppc_reserve_register(func, 4);  /* second function param */
+   gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
+   gen.immed_reg = ppc_reserve_register(func, 6);
+   gen.const_reg = ppc_reserve_register(func, 7);
 
    emit_prologue(func);
 
-- 
cgit v1.2.3


From 51840065607337210fbba5ba1c01874293fbb42e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 14:48:58 -0600
Subject: gallium: TGSI->PPC inequality operators

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 70 +++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index dbf215c0d5..9bf364b8c4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1495,6 +1495,68 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
+/**
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
+ */
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v_one = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   boolean complement = FALSE;
+
+   /* v_one = splat(1.0) */
+   ppc_vload_float(gen->f, v_one, 1.0f);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SNE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SEQ:
+         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SGE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SLT:
+         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SLE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SGT:
+         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+         break;
+      default:
+         assert(0);
+      }
+
+      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
+
+      if (complement)
+         ppc_vandc(gen->f, v2, v_one, v2);    /* v2 = v_one & ~v2 */
+      else
+         ppc_vand(gen->f, v2, v_one, v2);     /* v2 = v_one & v2 */
+
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v_one);
+}
+
+
 static void
 emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
@@ -1588,6 +1650,14 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_MAX:
       emit_binop(gen, inst);
       break;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SGE:
+      emit_inequality(gen, inst);
+      break;
    case TGSI_OPCODE_MAD:
    case TGSI_OPCODE_LRP:
       emit_triop(gen, inst);
-- 
cgit v1.2.3


From c6ff870836e7c970f1030e9e0fbdd0cb5df40d29 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:21:22 -0600
Subject: cell: TGSI->PPC for RSQ, RCP and src register sign modes

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 162 ++++++++++++++++++++++++----------
 1 file changed, 116 insertions(+), 46 deletions(-)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 9bf364b8c4..3637772102 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -84,11 +84,14 @@
 struct gen_context
 {
    struct ppc_function *f;
-   int inputs_reg;    /**< register pointing to input params */
-   int outputs_reg;   /**< register pointing to output params */
-   int temps_reg;     /**< register pointing to temporary "registers" */
-   int immed_reg;     /**< register pointing to immediates buffer */
-   int const_reg;     /**< register pointing to constants buffer */
+   int inputs_reg;    /**< GP register pointing to input params */
+   int outputs_reg;   /**< GP register pointing to output params */
+   int temps_reg;     /**< GP register pointing to temporary "registers" */
+   int immed_reg;     /**< GP register pointing to immediates buffer */
+   int const_reg;     /**< GP register pointing to constants buffer */
+
+   int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
+   int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
 };
 
 
@@ -1059,6 +1062,35 @@ emit_sub(
 #endif
 
 
+/**
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ */
+static int
+gen_one_vec(struct gen_context *gen)
+{
+   if (gen->one_vec < 0) {
+      gen->one_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
+   }
+   return gen->one_vec;
+}
+
+/**
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
+ */
+static int
+gen_get_bit31_vec(struct gen_context *gen)
+{
+   if (gen->bit31_vec < 0) {
+      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
+      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
+   }
+   return gen->bit31_vec;
+}
+
+
+
 /**
  * Register fetch.
  */
@@ -1124,49 +1156,42 @@ emit_fetch(struct gen_context *gen,
          assert( 0 );
       }
       break;
-
    case TGSI_EXTSWIZZLE_ZERO:
-#if 0
-      emit_tempf(
-         func,
-         xmm,
-         TGSI_EXEC_TEMP_00000000_I,
-         TGSI_EXEC_TEMP_00000000_C );
-#endif
+      ppc_vload_float(gen->f, vec_reg, 0.0f);
       break;
-
    case TGSI_EXTSWIZZLE_ONE:
-#if 0
-      emit_tempf(
-         func,
-         xmm,
-         TEMP_ONE_I,
-         TEMP_ONE_C );
-#endif
+      {
+         int one_vec = gen_one_vec(gen);
+         ppc_vecmove(gen->f, vec_reg, one_vec);
+      }
       break;
-
    default:
       assert( 0 );
    }
 
-#if 0
-   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      emit_abs( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      emit_setsign( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      emit_neg( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
+   {
+      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+         int bit31_vec = gen_get_bit31_vec(gen);
+
+         switch (sign_op) {
+         case TGSI_UTIL_SIGN_CLEAR:
+            /* vec = vec & ~bit31 */
+            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_SET:
+            /* vec = vec | bit31 */
+            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_TOGGLE:
+            /* vec = vec ^ bit31 */
+            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         default:
+            assert(0);
+         }
+      }
    }
-#endif
 }
 
 #define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
@@ -1409,6 +1434,36 @@ emit_cmp(
 #endif
 
 
+static void
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+
+   FETCH(gen, *inst, v0, 0, CHAN_X);
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_RSQ:
+      /* v1 = 1.0 / sqrt(v0) */
+      ppc_vrsqrtefp(gen->f, v1, v0);
+      break;
+   case TGSI_OPCODE_RCP:
+      /* v1 = 1.0 / v0 */
+      ppc_vrefp(gen->f, v1, v0);
+      break;
+   default:
+      assert(0);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE(gen, *inst, v1, 0, chan_index);
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+}
+
+
 static void
 emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
@@ -1504,12 +1559,9 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
    int v0 = ppc_allocate_vec_register(gen->f);
    int v1 = ppc_allocate_vec_register(gen->f);
    int v2 = ppc_allocate_vec_register(gen->f);
-   int v_one = ppc_allocate_vec_register(gen->f);
    uint chan_index;
    boolean complement = FALSE;
-
-   /* v_one = splat(1.0) */
-   ppc_vload_float(gen->f, v_one, 1.0f);
+   int one_vec = gen_one_vec(gen);
 
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
       FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
@@ -1543,9 +1595,9 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
       /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
 
       if (complement)
-         ppc_vandc(gen->f, v2, v_one, v2);    /* v2 = v_one & ~v2 */
+         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
       else
-         ppc_vand(gen->f, v2, v_one, v2);     /* v2 = v_one & v2 */
+         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
 
       STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
    }
@@ -1553,7 +1605,6 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
    ppc_release_vec_register(gen->f, v0);
    ppc_release_vec_register(gen->f, v1);
    ppc_release_vec_register(gen->f, v2);
-   ppc_release_vec_register(gen->f, v_one);
 }
 
 
@@ -1630,6 +1681,14 @@ emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
+/*
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+}
+*/
+
+
 static int
 emit_instruction(struct gen_context *gen,
                  struct tgsi_full_instruction *inst)
@@ -1643,6 +1702,10 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_LOGBASE2:
       emit_unaryop(gen, inst);
       break;
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_RCP:
+      emit_scalar_unaryop(gen, inst);
+      break;
    case TGSI_OPCODE_ADD:
    case TGSI_OPCODE_SUB:
    case TGSI_OPCODE_MUL:
@@ -1667,6 +1730,11 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_DPH:
       emit_dotprod(gen, inst);
       break;
+      /*
+   case TGSI_OPCODE_LIT:
+      emit_lit(gen, inst);
+      break;
+      */
    case TGSI_OPCODE_END:
       /* normal end */
       return 1;
@@ -2715,6 +2783,8 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
    gen.immed_reg = ppc_reserve_register(func, 6);
    gen.const_reg = ppc_reserve_register(func, 7);
+   gen.one_vec = -1;
+   gen.bit31_vec = -1;
 
    emit_prologue(func);
 
-- 
cgit v1.2.3


From 519c2dbed57b3c5e1717a62df5d5f8b908a1acd6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:30:00 -0600
Subject: gallium: remove SSE remnants from tgsi_ppc.c

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 2987 +++++----------------------------
 1 file changed, 417 insertions(+), 2570 deletions(-)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 3637772102..432ec7459b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -44,14 +44,6 @@
 #include "rtasm/rtasm_ppc.h"
 
 
-/* for 1/sqrt()
- *
- * This costs about 100fps (close to 10%) in gears:
- */
-#define HIGH_PRECISION 1
-
-#define FAST_MATH 1
-
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
@@ -95,2452 +87,515 @@ struct gen_context
 };
 
 
-
-#if 0000
-
 /**
- * X86 utility functions.
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
  */
-
-static struct x86_reg
-make_xmm(
-   unsigned xmm )
+static int
+gen_one_vec(struct gen_context *gen)
 {
-   return x86_make_reg(
-      file_XMM,
-      (enum x86_reg_name) xmm );
+   if (gen->one_vec < 0) {
+      gen->one_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
+   }
+   return gen->one_vec;
 }
 
 /**
- * X86 register mapping helpers.
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
  */
-
-static struct x86_reg
-get_const_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_CX );
-}
-
-static struct x86_reg
-get_input_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_AX );
-}
-
-static struct x86_reg
-get_output_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DX );
-}
-
-static struct x86_reg
-get_temp_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_BX );
-}
-
-static struct x86_reg
-get_coef_base( void )
+static int
+gen_get_bit31_vec(struct gen_context *gen)
 {
-   return get_output_base();
+   if (gen->bit31_vec < 0) {
+      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
+      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
+   }
+   return gen->bit31_vec;
 }
 
-static struct x86_reg
-get_immediate_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DI );
-}
 
 
 /**
- * Data access helpers.
+ * Register fetch.
  */
-
-
-static struct x86_reg
-get_immediate(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_immediate_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_const(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_const_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_input(
-   unsigned vec,
-   unsigned chan )
+static void
+emit_fetch(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_src_register *reg,
+           const unsigned chan_index)
 {
-   return x86_make_disp(
-      get_input_base(),
-      (vec * 4 + chan) * 16 );
-}
+   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
 
-static struct x86_reg
-get_output(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_output_base(),
-      (vec * 4 + chan) * 16 );
-}
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            ppc_li(gen->f, offset_reg, offset);
+            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            int offset_reg = ppc_allocate_register(gen->f);
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+            ppc_li(gen->f, offset_reg, offset);
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our constants start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
+            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
+            ppc_release_register(gen->f, offset_reg);
+         }
+         break;
+      default:
+         assert( 0 );
+      }
+      break;
+   case TGSI_EXTSWIZZLE_ZERO:
+      ppc_vload_float(gen->f, vec_reg, 0.0f);
+      break;
+   case TGSI_EXTSWIZZLE_ONE:
+      {
+         int one_vec = gen_one_vec(gen);
+         ppc_vecmove(gen->f, vec_reg, one_vec);
+      }
+      break;
+   default:
+      assert( 0 );
+   }
 
-static struct x86_reg
-get_temp(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_temp_base(),
-      (vec * 4 + chan) * 16 );
-}
+   {
+      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+         int bit31_vec = gen_get_bit31_vec(gen);
 
-static struct x86_reg
-get_coef(
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   return x86_make_disp(
-      get_coef_base(),
-      ((vec * 3 + member) * 4 + chan) * 4 );
+         switch (sign_op) {
+         case TGSI_UTIL_SIGN_CLEAR:
+            /* vec = vec & ~bit31 */
+            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_SET:
+            /* vec = vec | bit31 */
+            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_TOGGLE:
+            /* vec = vec ^ bit31 */
+            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
+            break;
+         default:
+            assert(0);
+         }
+      }
+   }
 }
 
+#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
+   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
 
-static void
-emit_ret(
-   struct x86_function  *func )
-{
-   x86_ret( func );
-}
-
-#endif
 
-/**
- * Data fetch helpers.
- */
 
-#if 00
 /**
- * Copy a shader constant to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src const buffer index
- * \param chan  src channel to fetch (X, Y, Z or W)
+ * Register store.
  */
 static void
-emit_const(
-   struct x86_function *func,
-   uint xmm,
-   int vec,
-   uint chan,
-   uint indirect,
-   uint indirectFile,
-   int indirectIndex )
+emit_store(struct gen_context *gen,
+           unsigned vec_reg,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           unsigned chan_index)
 {
-   if (indirect) {
-      struct x86_reg r0 = get_input_base();
-      struct x86_reg r1 = get_output_base();
-      uint i;
-
-      assert( indirectFile == TGSI_FILE_ADDRESS );
-      assert( indirectIndex == 0 );
-
-      x86_push( func, r0 );
-      x86_push( func, r1 );
-
-      for (i = 0; i < QUAD_SIZE; i++) {
-         x86_lea( func, r0, get_const( vec, chan ) );
-         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
-
-         /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
-          */
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-
-         x86_add( func, r0, r1 );
-         x86_mov( func, r1, x86_deref( r0 ) );
-         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
       }
-
-      x86_pop( func, r1 );
-      x86_pop( func, r0 );
-
-      sse_movaps(
+      break;
+   case TGSI_FILE_TEMPORARY:
+      {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         ppc_li(gen->f, offset_reg, offset);
+         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+         ppc_release_register(gen->f, offset_reg);
+      }
+      break;
+#if 0
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
          func,
-         make_xmm( xmm ),
-         get_temp( TEMP_R0, CHAN_X ) );
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+#endif
+   default:
+      assert( 0 );
    }
-   else {
-      assert( vec >= 0 );
 
-      sse_movss(
-         func,
-         make_xmm( xmm ),
-         get_const( vec, chan ) );
-      sse_shufps(
-         func,
-         make_xmm( xmm ),
-         make_xmm( xmm ),
-         SHUF( 0, 0, 0, 0 ) );
+#if 0
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
    }
+#endif
 }
 
-static void
-emit_immediate(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_immediate( vec, chan ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
-}
 
+#define STORE( GEN, INST, XMM, INDEX, CHAN )\
+   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
 
-/**
- * Copy a shader input to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src input attrib
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_inputf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm ),
-      get_input( vec, chan ) );
-}
 
-/**
- * Store an xmm register to a shader output
- * \param xmm  the source xmm register
- * \param vec  the dest output attrib
- * \param chan  src dest channel to store (X, Y, Z or W)
- */
-static void
-emit_output(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_output( vec, chan ),
-      make_xmm( xmm ) );
-}
 
-/**
- * Copy a shader temporary to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src temp register
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
 static void
-emit_tempf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   sse_movaps(
-      func,
-      make_xmm( xmm ),
-      get_temp( vec, chan ) );
-}
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
 
-/**
- * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
- * \param xmm  the destination xmm register
- * \param vec  the src input/attribute coefficient index
- * \param chan  src channel to fetch (X, Y, Z or W)
- * \param member  0=a0, 1=dadx, 2=dady
- */
-static void
-emit_coef(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_coef( vec, chan, member ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
+   FETCH(gen, *inst, v0, 0, CHAN_X);
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_RSQ:
+      /* v1 = 1.0 / sqrt(v0) */
+      ppc_vrsqrtefp(gen->f, v1, v0);
+      break;
+   case TGSI_OPCODE_RCP:
+      /* v1 = 1.0 / v0 */
+      ppc_vrefp(gen->f, v1, v0);
+      break;
+   default:
+      assert(0);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE(gen, *inst, v1, 0, chan_index);
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
 }
 
-/**
- * Data store helpers.
- */
 
 static void
-emit_inputs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_input( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_temps(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movaps(
-      func,
-      get_temp( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_addrs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   assert( vec == 0 );
-
-   emit_temps(
-      func,
-      xmm,
-      vec + TGSI_EXEC_TEMP_ADDR,
-      chan );
-}
-
-/**
- * Coefficent fetch helpers.
- */
-
-static void
-emit_coef_a0(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      0 );
-}
-
-static void
-emit_coef_dadx(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      1 );
-}
-
-static void
-emit_coef_dady(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      2 );
-}
-#endif
-
-
-/**
- * Function call helpers.
- */
-
-#if 00
-/**
- * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be 
- * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
- * that the stack pointer is 16 byte aligned, as expected.
- */
-static void
-emit_func_call_dst(
-   struct x86_function *func,
-   unsigned xmm_save,
-   unsigned xmm_dst,
-   void (PIPE_CDECL *code)() )
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-   unsigned i, n, xmm;
-   unsigned xmm_mask;
-   
-   /* Bitmask of the xmm registers to save */
-   xmm_mask = (1 << xmm_save) - 1;
-   xmm_mask &= ~(1 << xmm_dst);
-
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
-
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-   
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i))
-         ++n;
-   
-   x86_sub_imm(
-      func, 
-      x86_make_reg( file_REG32, reg_SP ),
-      n*16);
-
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i)) {
-         sse_movups(
-            func,
-            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
-            make_xmm( xmm ) );
-         ++n;
-      }
-   
-   x86_lea(
-      func,
-      ecx,
-      get_temp( TEMP_R0, 0 ) );
-   
-   x86_push( func, ecx );
-   x86_mov_reg_imm( func, ecx, (unsigned long) code );
-   x86_call( func, ecx );
-   x86_pop(func, ecx );
-   
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i)) {
-         sse_movups(
-            func,
-            make_xmm( xmm ),
-            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
-         ++n;
+   int v0 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, 0, 0, chan_index);   /* v0 = srcreg[0] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ABS:
+         /* turn off the most significant bit of each vector float word */
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
+            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
+            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_FLOOR:
+         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
+         break;
+      case TGSI_OPCODE_FRAC:
+         {
+            int v1 = ppc_allocate_vec_register(gen->f);
+            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
+            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
+            ppc_release_vec_register(gen->f, v1);
+         }
+         break;
+      case TGSI_OPCODE_EXPBASE2:
+         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
+         break;
+      case TGSI_OPCODE_LOGBASE2:
+         /* XXX this may be broken! */
+         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
+         break;
+      case TGSI_OPCODE_MOV:
+         /* nothing */
+         break;
+      default:
+         assert(0);
       }
-   
-   x86_add_imm(
-      func, 
-      x86_make_reg( file_REG32, reg_SP ),
-      n*16);
-
-   /* Restore GP registers in a reverse order.
-    */
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-
-   sse_movaps(
-      func,
-      make_xmm( xmm_dst ),
-      get_temp( TEMP_R0, 0 ) );
-}
-
-static void
-emit_func_call_dst_src(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst,
-   unsigned xmm_src,
-   void (PIPE_CDECL *code)() )
-{
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 1 ),
-      make_xmm( xmm_src ) );
-
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      code );
-}
-
-/*
- * Fast SSE2 implementation of special math functions.
- */
-
-#define POLY0(x, c0) _mm_set1_ps(c0)
-#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
-#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-
-#define EXP_POLY_DEGREE 3
-#define LOG_POLY_DEGREE 5
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128 
-exp2f4(__m128 x)
-{
-   __m128i ipart;
-   __m128 fpart, expipart, expfpart;
-
-   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
-   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
-
-   /* ipart = int(x - 0.5) */
-   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
-
-   /* fpart = x - ipart */
-   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
-
-   /* expipart = (float) (1 << ipart) */
-   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
-
-   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
-#if EXP_POLY_DEGREE == 5
-   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
-#elif EXP_POLY_DEGREE == 4
-   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
-#elif EXP_POLY_DEGREE == 3
-   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
-#elif EXP_POLY_DEGREE == 2
-   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
-#else
-#error
-#endif
-
-   return _mm_mul_ps(expipart, expfpart);
-}
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128 
-log2f4(__m128 x)
-{
-   __m128i expmask = _mm_set1_epi32(0x7f800000);
-   __m128i mantmask = _mm_set1_epi32(0x007fffff);
-   __m128 one = _mm_set1_ps(1.0f);
-
-   __m128i i = _mm_castps_si128(x);
-
-   /* exp = (float) exponent(x) */
-   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
-
-   /* mant = (float) mantissa(x) */
-   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
-
-   __m128 logmant;
-
-   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
-    * These coefficients can be generate with 
-    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
-    */
-#if LOG_POLY_DEGREE == 6
-   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
-#elif LOG_POLY_DEGREE == 5
-   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
-#elif LOG_POLY_DEGREE == 4
-   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
-#elif LOG_POLY_DEGREE == 3
-   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
-#else
-#error
-#endif
-
-   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
-
-   return _mm_add_ps(logmant, exp);
-}
-
-static INLINE __m128
-powf4(__m128 x, __m128 y)
-{
-   return exp2f4(_mm_mul_ps(log2f4(x), y));
-}
-
-
-/**
- * Low-level instruction translators.
- */
-
-static void
-emit_abs(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_andps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_7FFFFFFF_I,
-         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
-}
-
-static void
-emit_add(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_addps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void PIPE_CDECL
-cos4f(
-   float *store )
-{
-   store[0] = cosf( store[0] );
-   store[1] = cosf( store[1] );
-   store[2] = cosf( store[2] );
-   store[3] = cosf( store[3] );
-}
-
-static void
-emit_cos(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save, 
-      xmm_dst,
-      cos4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
-__attribute__((force_align_arg_pointer))
-#endif
-ex24f(
-   float *store )
-{
-   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
-}
-
-static void
-emit_ex2(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      ex24f );
-}
-
-static void
-emit_f2it(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse2_cvttps2dq(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ) );
-}
-
-static void PIPE_CDECL
-flr4f(
-   float *store )
-{
-   store[0] = floorf( store[0] );
-   store[1] = floorf( store[1] );
-   store[2] = floorf( store[2] );
-   store[3] = floorf( store[3] );
-}
-
-static void
-emit_flr(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      flr4f );
-}
-
-static void PIPE_CDECL
-frc4f(
-   float *store )
-{
-   store[0] -= floorf( store[0] );
-   store[1] -= floorf( store[1] );
-   store[2] -= floorf( store[2] );
-   store[3] -= floorf( store[3] );
-}
-
-static void
-emit_frc(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      frc4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
-__attribute__((force_align_arg_pointer))
-#endif
-lg24f(
-   float *store )
-{
-   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
-}
-
-static void
-emit_lg2(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      lg24f );
-}
-
-static void
-emit_MOV(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_mul (struct x86_function *func,
-          unsigned xmm_dst,
-          unsigned xmm_src)
-{
-   sse_mulps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_neg(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_xorps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
-__attribute__((force_align_arg_pointer))
-#endif
-pow4f(
-   float *store )
-{
-#if 1
-   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
-#else
-   store[0] = powf( store[0], store[4] );
-   store[1] = powf( store[1], store[5] );
-   store[2] = powf( store[2], store[6] );
-   store[3] = powf( store[3], store[7] );
-#endif
-}
-
-static void
-emit_pow(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   emit_func_call_dst_src(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_src,
-      pow4f );
-}
-
-static void
-emit_rcp (
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.  Need to either emit a proper divide or use the
-    * iterative technique described below in emit_rsqrt().
-    */
-   sse2_rcpps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_rsqrt(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-#if HIGH_PRECISION
-   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
-    * implementations, it is possible to improve its precision at
-    * fairly low cost, using a newton/raphson step, as below:
-    * 
-    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
-    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
-    *
-    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
-    */
-   {
-      struct x86_reg dst = make_xmm( xmm_dst );
-      struct x86_reg src = make_xmm( xmm_src );
-      struct x86_reg tmp0 = make_xmm( 2 );
-      struct x86_reg tmp1 = make_xmm( 3 );
-
-      assert( xmm_dst != xmm_src );
-      assert( xmm_dst != 2 && xmm_dst != 3 );
-      assert( xmm_src != 2 && xmm_src != 3 );
-
-      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
-      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
-      sse_rsqrtps( func, tmp1, src  );
-      sse_mulps(   func, src,  tmp1 );
-      sse_mulps(   func, dst,  tmp1 );
-      sse_mulps(   func, src,  tmp1 );
-      sse_subps(   func, tmp0, src  );
-      sse_mulps(   func, dst,  tmp0 );
+      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
    }
-#else
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.
-    */
-   sse_rsqrtps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-#endif
-}
-
-static void
-emit_setsign(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_orps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-sin4f(
-   float *store )
-{
-   store[0] = sinf( store[0] );
-   store[1] = sinf( store[1] );
-   store[2] = sinf( store[2] );
-   store[3] = sinf( store[3] );
+   ppc_release_vec_register(gen->f, v0);
 }
 
-static void
-emit_sin (struct x86_function *func,
-          unsigned xmm_save, 
-          unsigned xmm_dst)
-{
-   emit_func_call_dst(
-      func,
-      xmm_save,
-      xmm_dst,
-      sin4f );
-}
 
 static void
-emit_sub(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   sse_subps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         ppc_vaddfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_SUB:
+         ppc_vsubfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MUL:
+         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
+         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */
+         break;
+      case TGSI_OPCODE_MIN:
+         ppc_vminfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MAX:
+         ppc_vmaxfp(gen->f, v2, v0, v1);
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
 }
-#endif
 
 
 /**
- * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
  */
-static int
-gen_one_vec(struct gen_context *gen)
-{
-   if (gen->one_vec < 0) {
-      gen->one_vec = ppc_allocate_vec_register(gen->f);
-      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
-   }
-   return gen->one_vec;
-}
-
-/**
- * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
- */
-static int
-gen_get_bit31_vec(struct gen_context *gen)
-{
-   if (gen->bit31_vec < 0) {
-      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
-      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
-      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
-   }
-   return gen->bit31_vec;
-}
-
-
-
-/**
- * Register fetch.
- */
-static void
-emit_fetch(struct gen_context *gen,
-           unsigned vec_reg,
-           const struct tgsi_full_src_register *reg,
-           const unsigned chan_index)
-{
-   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
-
-   switch (swizzle) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
-      switch (reg->SrcRegister.File) {
-      case TGSI_FILE_INPUT:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      case TGSI_FILE_TEMPORARY:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      case TGSI_FILE_IMMEDIATE:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      case TGSI_FILE_CONSTANT:
-         {
-            int offset_reg = ppc_allocate_register(gen->f);
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
-            ppc_li(gen->f, offset_reg, offset);
-            /* Load 4-byte word into vector register.
-             * The vector slot depends on the effective address we load from.
-             * We know that our constants start at a 16-byte boundary so we
-             * know that 'swizzle' tells us which vector slot will have the
-             * loaded word.  The other vector slots will be undefined.
-             */
-            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
-            /* splat word[swizzle] across the vector reg */
-            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
-            ppc_release_register(gen->f, offset_reg);
-         }
-         break;
-      default:
-         assert( 0 );
-      }
-      break;
-   case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vload_float(gen->f, vec_reg, 0.0f);
-      break;
-   case TGSI_EXTSWIZZLE_ONE:
-      {
-         int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, vec_reg, one_vec);
-      }
-      break;
-   default:
-      assert( 0 );
-   }
-
-   {
-      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
-      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
-         int bit31_vec = gen_get_bit31_vec(gen);
-
-         switch (sign_op) {
-         case TGSI_UTIL_SIGN_CLEAR:
-            /* vec = vec & ~bit31 */
-            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
-            break;
-         case TGSI_UTIL_SIGN_SET:
-            /* vec = vec | bit31 */
-            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
-            break;
-         case TGSI_UTIL_SIGN_TOGGLE:
-            /* vec = vec ^ bit31 */
-            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
-            break;
-         default:
-            assert(0);
-         }
-      }
-   }
-}
-
-#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
-   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
-
-
-
-/**
- * Register store.
- */
-static void
-emit_store(struct gen_context *gen,
-           unsigned vec_reg,
-           const struct tgsi_full_dst_register *reg,
-           const struct tgsi_full_instruction *inst,
-           unsigned chan_index)
-{
-   switch (reg->DstRegister.File) {
-   case TGSI_FILE_OUTPUT:
-      {
-         int offset_reg = ppc_allocate_register(gen->f);
-         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
-         ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
-         ppc_release_register(gen->f, offset_reg);
-      }
-      break;
-   case TGSI_FILE_TEMPORARY:
-      {
-         int offset_reg = ppc_allocate_register(gen->f);
-         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
-         ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
-         ppc_release_register(gen->f, offset_reg);
-      }
-      break;
-#if 0
-   case TGSI_FILE_ADDRESS:
-      emit_addrs(
-         func,
-         xmm,
-         reg->DstRegister.Index,
-         chan_index );
-      break;
-#endif
-   default:
-      assert( 0 );
-   }
-
-#if 0
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      /* assert( 0 ); */
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
-      break;
-   }
-#endif
-}
-
-
-#define STORE( GEN, INST, XMM, INDEX, CHAN )\
-   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
-
-
-
-#if 000
-/**
- * High-level instruction translators.
- */
-
-static void
-emit_kil(
-   struct x86_function *func,
-   const struct tgsi_full_src_register *reg )
-{
-   unsigned uniquemask;
-   unsigned registers[4];
-   unsigned nextregister = 0;
-   unsigned firstchan = ~0;
-   unsigned chan_index;
-
-   /* This mask stores component bits that were already tested. Note that
-    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      unsigned swizzle;
-
-      /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle(
-         reg,
-         chan_index );
-
-      /* check if the component has not been already tested */
-      if( !(uniquemask & (1 << swizzle)) ) {
-         uniquemask |= 1 << swizzle;
-
-         /* allocate register */
-         registers[chan_index] = nextregister;
-         emit_fetch(
-            func,
-            nextregister,
-            reg,
-            chan_index );
-         nextregister++;
-
-         /* mark the first channel used */
-         if( firstchan == ~0 ) {
-            firstchan = chan_index;
-         }
-      }
-   }
-
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      if( uniquemask & (1 << chan_index) ) {
-         sse_cmpps(
-            func,
-            make_xmm( registers[chan_index] ),
-            get_temp(
-               TGSI_EXEC_TEMP_00000000_I,
-               TGSI_EXEC_TEMP_00000000_C ),
-            cc_LessThan );
-
-         if( chan_index == firstchan ) {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               make_xmm( registers[chan_index] ) );
-         }
-         else {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_DX ),
-               make_xmm( registers[chan_index] ) );
-            x86_or(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               x86_make_reg( file_REG32, reg_DX ) );
-         }
-      }
-   }
-
-   x86_or(
-      func,
-      get_temp(
-         TGSI_EXEC_TEMP_KILMASK_I,
-         TGSI_EXEC_TEMP_KILMASK_C ),
-      x86_make_reg( file_REG32, reg_AX ) );
-
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-}
-
-
-static void
-emit_kilp(
-   struct x86_function *func )
-{
-   /* XXX todo / fix me */
-}
-
-
-static void
-emit_setcc(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst,
-   enum sse_cc cc )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ),
-         cc );
-      sse_andps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TEMP_ONE_I,
-            TEMP_ONE_C ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-
-static void
-emit_cmp(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      FETCH( func, *inst, 2, 2, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TGSI_EXEC_TEMP_00000000_I,
-            TGSI_EXEC_TEMP_00000000_C ),
-         cc_LessThan );
-      sse_andps(
-         func,
-         make_xmm( 1 ),
-         make_xmm( 0 ) );
-      sse_andnps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 2 ) );
-      sse_orps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-#endif
-
-
-static void
-emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
    int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-
-   FETCH(gen, *inst, v0, 0, CHAN_X);
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_RSQ:
-      /* v1 = 1.0 / sqrt(v0) */
-      ppc_vrsqrtefp(gen->f, v1, v0);
-      break;
-   case TGSI_OPCODE_RCP:
-      /* v1 = 1.0 / v0 */
-      ppc_vrefp(gen->f, v1, v0);
-      break;
-   default:
-      assert(0);
-   }
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      STORE(gen, *inst, v1, 0, chan_index);
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-}
-
-
-static void
-emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, 0, 0, chan_index);   /* v0 = srcreg[0] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_ABS:
-         /* turn off the most significant bit of each vector float word */
-         {
-            int v1 = ppc_allocate_vec_register(gen->f);
-            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
-            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
-            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
-            ppc_release_vec_register(gen->f, v1);
-         }
-         break;
-      case TGSI_OPCODE_FLOOR:
-         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
-         break;
-      case TGSI_OPCODE_FRAC:
-         {
-            int v1 = ppc_allocate_vec_register(gen->f);
-            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
-            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
-            ppc_release_vec_register(gen->f, v1);
-         }
-         break;
-      case TGSI_OPCODE_EXPBASE2:
-         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
-         break;
-      case TGSI_OPCODE_LOGBASE2:
-         /* XXX this may be broken! */
-         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
-         break;
-      case TGSI_OPCODE_MOV:
-         /* nothing */
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-}
-
-
-static void
-emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_ADD:
-         ppc_vaddfp(gen->f, v2, v0, v1);
-         break;
-      case TGSI_OPCODE_SUB:
-         ppc_vsubfp(gen->f, v2, v0, v1);
-         break;
-      case TGSI_OPCODE_MUL:
-         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
-         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */
-         break;
-      case TGSI_OPCODE_MIN:
-         ppc_vminfp(gen->f, v2, v0, v1);
-         break;
-      case TGSI_OPCODE_MAX:
-         ppc_vmaxfp(gen->f, v2, v0, v1);
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
-
-
-/**
- * Vector comparisons, resulting in 1.0 or 0.0 values.
- */
-static void
-emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   boolean complement = FALSE;
-   int one_vec = gen_one_vec(gen);
-
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_SNE:
-         complement = TRUE;
-         /* fall-through */
-      case TGSI_OPCODE_SEQ:
-         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
-         break;
-
-      case TGSI_OPCODE_SGE:
-         complement = TRUE;
-         /* fall-through */
-      case TGSI_OPCODE_SLT:
-         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
-         break;
-
-      case TGSI_OPCODE_SLE:
-         complement = TRUE;
-         /* fall-through */
-      case TGSI_OPCODE_SGT:
-         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
-         break;
-      default:
-         assert(0);
-      }
-
-      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
-
-      if (complement)
-         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
-      else
-         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
-
-      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
-   }
-
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
-
-
-static void
-emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-
-   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
-
-   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
-   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
-   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
-
-   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
-   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
-   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
-
-   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
-   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
-   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
-
-   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
-      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
-      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
-      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
-   }
-   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
-      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
-      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
-   }
-
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
-
-
-static void
-emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   int v3 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_MAD:
-         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
-         break;
-      case TGSI_OPCODE_LRP:
-         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
-         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-   ppc_release_vec_register(gen->f, v3);
-}
-
-
-/*
-static void
-emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-}
-*/
-
-
-static int
-emit_instruction(struct gen_context *gen,
-                 struct tgsi_full_instruction *inst)
-{
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_ABS:
-   case TGSI_OPCODE_FLOOR:
-   case TGSI_OPCODE_FRAC:
-   case TGSI_OPCODE_EXPBASE2:
-   case TGSI_OPCODE_LOGBASE2:
-      emit_unaryop(gen, inst);
-      break;
-   case TGSI_OPCODE_RSQ:
-   case TGSI_OPCODE_RCP:
-      emit_scalar_unaryop(gen, inst);
-      break;
-   case TGSI_OPCODE_ADD:
-   case TGSI_OPCODE_SUB:
-   case TGSI_OPCODE_MUL:
-   case TGSI_OPCODE_MIN:
-   case TGSI_OPCODE_MAX:
-      emit_binop(gen, inst);
-      break;
-   case TGSI_OPCODE_SEQ:
-   case TGSI_OPCODE_SNE:
-   case TGSI_OPCODE_SLT:
-   case TGSI_OPCODE_SGT:
-   case TGSI_OPCODE_SLE:
-   case TGSI_OPCODE_SGE:
-      emit_inequality(gen, inst);
-      break;
-   case TGSI_OPCODE_MAD:
-   case TGSI_OPCODE_LRP:
-      emit_triop(gen, inst);
-      break;
-   case TGSI_OPCODE_DP3:
-   case TGSI_OPCODE_DP4:
-   case TGSI_OPCODE_DPH:
-      emit_dotprod(gen, inst);
-      break;
-      /*
-   case TGSI_OPCODE_LIT:
-      emit_lit(gen, inst);
-      break;
-      */
-   case TGSI_OPCODE_END:
-      /* normal end */
-      return 1;
-   default:
-      return 0;
-   }
-
-#if 0
-   unsigned chan_index;
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ARL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_f2it( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LIT:
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-         emit_tempf(
-            func,
-            0,
-            TEMP_ONE_I,
-            TEMP_ONE_C);
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-            STORE( func, *inst, 0, 0, CHAN_X );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-            STORE( func, *inst, 0, 0, CHAN_W );
-         }
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_maxps(
-               func,
-               make_xmm( 0 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            STORE( func, *inst, 0, 0, CHAN_Y );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-            /* XMM[1] = SrcReg[0].yyyy */
-            FETCH( func, *inst, 1, 0, CHAN_Y );
-            /* XMM[1] = max(XMM[1], 0) */
-            sse_maxps(
-               func,
-               make_xmm( 1 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            /* XMM[2] = SrcReg[0].wwww */
-            FETCH( func, *inst, 2, 0, CHAN_W );
-            /* XMM[2] = min(XMM[2], 128.0) */
-            sse_minps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_128_I,
-                  TGSI_EXEC_TEMP_128_C ) );
-            /* XMM[2] = max(XMM[2], -128.0) */
-            sse_maxps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_MINUS_128_I,
-                  TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 3, 1, 2 );
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_xorps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 2 ) );
-            sse_cmpps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 0 ),
-               cc_LessThanEqual );
-            sse_andps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 1 ) );
-            STORE( func, *inst, 2, 0, CHAN_Z );
-         }
-      }
-      break;
-
-   case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rcp( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rsqrt( func, 1, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 1, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EXP:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_MOV( func, 1, 0 );
-            emit_flr( func, 2, 1 );
-            /* dst.x = ex2(floor(src.x)) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               emit_MOV( func, 2, 1 );
-               emit_ex2( func, 3, 2 );
-               STORE( func, *inst, 2, 0, CHAN_X );
-            }
-            /* dst.y = src.x - floor(src.x) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_MOV( func, 2, 0 );
-               emit_sub( func, 2, 1 );
-               STORE( func, *inst, 2, 0, CHAN_Y );
-            }
-         }
-         /* dst.z = ex2(src.x) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 3, 0 );
-            STORE( func, *inst, 0, 0, CHAN_Z );
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_LOG:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_abs( func, 0 );
-         emit_MOV( func, 1, 0 );
-         emit_lg2( func, 2, 1 );
-         /* dst.z = lg2(abs(src.x)) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            STORE( func, *inst, 1, 0, CHAN_Z );
-         }
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 2, 1 );
-            /* dst.x = floor(lg2(abs(src.x))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               STORE( func, *inst, 1, 0, CHAN_X );
-            }
-            /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 2, 1 );
-               emit_rcp( func, 1, 1 );
-               emit_mul( func, 0, 1 );
-               STORE( func, *inst, 0, 0, CHAN_Y );
-            }
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MUL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ADD:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_add( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP4:
-   /* TGSI_OPCODE_DOT4 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul(func, 1, 2 );
-      emit_add(func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_W );
-      FETCH( func, *inst, 2, 1, CHAN_W );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DST:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_tempf(
-            func,
-            0,
-            TEMP_ONE_I,
-            TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 1, 1, CHAN_Y );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         FETCH( func, *inst, 0, 0, CHAN_Z );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-         FETCH( func, *inst, 0, 1, CHAN_W );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MIN:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_minps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MAX:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_maxps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
-      emit_setcc( func, inst, cc_LessThan );
-      break;
-
-   case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
-      emit_setcc( func, inst, cc_NotLessThan );
-      break;
-
-   case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SUB:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_sub( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LERP:
-   /* TGSI_OPCODE_LRP */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_sub( func, 1, 2 );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CND0:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DOT2ADD:
-   /* TGSI_OPCODE_DP2A */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_INDEX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NEGATE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_FRAC:
-   /* TGSI_OPCODE_FRC */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CLAMP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_FLOOR:
-   /* TGSI_OPCODE_FLR */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ROUND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_EXPBASE2:
-   /* TGSI_OPCODE_EX2 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LOGBASE2:
-   /* TGSI_OPCODE_LG2 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_POWER:
-   /* TGSI_OPCODE_POW */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CROSSPRODUCT:
-   /* TGSI_OPCODE_XPD */
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( func, *inst, 1, 1, CHAN_Z );
-         FETCH( func, *inst, 3, 0, CHAN_Z );
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 4, 1, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_MOV( func, 2, 0 );
-         emit_mul( func, 2, 1 );
-         emit_MOV( func, 5, 3 );
-         emit_mul( func, 5, 4 );
-         emit_sub( func, 2, 5 );
-         STORE( func, *inst, 2, 0, CHAN_X );
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 2, 1, CHAN_X );
-         FETCH( func, *inst, 5, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         emit_mul( func, 3, 2 );
-         emit_mul( func, 1, 5 );
-         emit_sub( func, 3, 1 );
-         STORE( func, *inst, 3, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_mul( func, 5, 4 );
-         emit_mul( func, 0, 2 );
-         emit_sub( func, 5, 0 );
-         STORE( func, *inst, 5, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MULTIPLYMATRIX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ABS:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_abs( func, 0) ;
-
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RCC:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DPH:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 1, CHAN_W );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_COS:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DDX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DDY:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_KILP:
-      /* predicated kill */
-      emit_kilp( func );
-      return 0; /* XXX fix me */
-      break;
-
-   case TGSI_OPCODE_KIL:
-      /* conditional kill */
-      emit_kil( func, &inst->FullSrcRegisters[0] );
-      break;
-
-   case TGSI_OPCODE_PK2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SEQ:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SGT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SIN:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SNE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_STR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TEX:
-      if (0) {
-	 /* Disable dummy texture code: 
-	  */
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-	    STORE( func, *inst, 0, 0, chan_index );
-	 }
-      }
-      else {
-	 return 0;
-      }
-      break;
-
-   case TGSI_OPCODE_TXD:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_X2D:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_BRA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CAL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RET:
-      emit_ret( func );
-      break;
-
-   case TGSI_OPCODE_END:
-      break;
-
-   case TGSI_OPCODE_SSG:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CMP:
-      emit_cmp (func, inst);
-      break;
-
-   case TGSI_OPCODE_SCS:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TGSI_EXEC_TEMP_00000000_I,
-	    TGSI_EXEC_TEMP_00000000_C );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_TXB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NRM:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DIV:
-      return 0;
-      break;
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   boolean complement = FALSE;
+   int one_vec = gen_one_vec(gen);
 
-   case TGSI_OPCODE_DP2:
-      return 0;
-      break;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
 
-   case TGSI_OPCODE_TXL:
-      return 0;
-      break;
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SNE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SEQ:
+         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+         break;
 
-   case TGSI_OPCODE_BRK:
-      return 0;
-      break;
+      case TGSI_OPCODE_SGE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SLT:
+         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+         break;
 
-   case TGSI_OPCODE_IF:
-      return 0;
-      break;
+      case TGSI_OPCODE_SLE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SGT:
+         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+         break;
+      default:
+         assert(0);
+      }
 
-   case TGSI_OPCODE_LOOP:
-      return 0;
-      break;
+      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
 
-   case TGSI_OPCODE_REP:
-      return 0;
-      break;
+      if (complement)
+         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
+      else
+         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
 
-   case TGSI_OPCODE_ELSE:
-      return 0;
-      break;
+      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+   }
 
-   case TGSI_OPCODE_ENDIF:
-      return 0;
-      break;
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
 
-   case TGSI_OPCODE_ENDLOOP:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_ENDREP:
-      return 0;
-      break;
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
 
-   case TGSI_OPCODE_PUSHA:
-      return 0;
-      break;
+   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
 
-   case TGSI_OPCODE_POPA:
-      return 0;
-      break;
+   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
+   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   case TGSI_OPCODE_CEIL:
-      return 0;
-      break;
+   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
+   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   case TGSI_OPCODE_I2F:
-      return 0;
-      break;
+   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
+   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   case TGSI_OPCODE_NOT:
-      return 0;
-      break;
+   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
+      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+   }
+   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
+      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
+      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
+   }
 
-   case TGSI_OPCODE_TRUNC:
-      return 0;
-      break;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+}
 
-   case TGSI_OPCODE_SHL:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_SHR:
-      return 0;
-      break;
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0 = ppc_allocate_vec_register(gen->f);
+   int v1 = ppc_allocate_vec_register(gen->f);
+   int v2 = ppc_allocate_vec_register(gen->f);
+   int v3 = ppc_allocate_vec_register(gen->f);
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
+      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:
+         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_LRP:
+         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
+         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
+         break;
+      default:
+         assert(0);
+      }
+      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
+   }
+   ppc_release_vec_register(gen->f, v0);
+   ppc_release_vec_register(gen->f, v1);
+   ppc_release_vec_register(gen->f, v2);
+   ppc_release_vec_register(gen->f, v3);
+}
 
-   case TGSI_OPCODE_AND:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_OR:
-      return 0;
-      break;
+/*
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+}
+*/
 
-   case TGSI_OPCODE_MOD:
-      return 0;
-      break;
 
-   case TGSI_OPCODE_XOR:
-      return 0;
+static int
+emit_instruction(struct gen_context *gen,
+                 struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_FLOOR:
+   case TGSI_OPCODE_FRAC:
+   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_LOGBASE2:
+      emit_unaryop(gen, inst);
       break;
-
-   case TGSI_OPCODE_SAD:
-      return 0;
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_RCP:
+      emit_scalar_unaryop(gen, inst);
       break;
-
-   case TGSI_OPCODE_TXF:
-      return 0;
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      emit_binop(gen, inst);
       break;
-
-   case TGSI_OPCODE_TXQ:
-      return 0;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SGE:
+      emit_inequality(gen, inst);
       break;
-
-   case TGSI_OPCODE_CONT:
-      return 0;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_LRP:
+      emit_triop(gen, inst);
       break;
-
-   case TGSI_OPCODE_EMIT:
-      return 0;
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+      emit_dotprod(gen, inst);
       break;
-
-   case TGSI_OPCODE_ENDPRIM:
-      return 0;
+      /*
+   case TGSI_OPCODE_LIT:
+      emit_lit(gen, inst);
       break;
-
+      */
+   case TGSI_OPCODE_END:
+      /* normal end */
+      return 1;
    default:
       return 0;
    }
-#endif
+
    
    return 1;
 }
@@ -2608,133 +663,6 @@ emit_declaration(
    }
 }
 
-#if 0
-static void aos_to_soa( struct x86_function *func, 
-                        uint arg_aos,
-                        uint arg_soa, 
-                        uint arg_num, 
-                        uint arg_stride )
-{
-   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
-   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
-   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
-   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
-   int inner_loop;
-
-
-   /* Save EBX */
-   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
-
-   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
-   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
-   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
-
-   /* do */
-   inner_loop = x86_get_label( func );
-   {
-      x86_push( func, aos_input );
-      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_pop( func, aos_input );
-
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
-      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
-      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
-      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
-
-      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
-      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
-      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
-      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
-
-      /* Advance to next input */
-      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
-      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
-   }
-   /* while --num_inputs */
-   x86_dec( func, num_inputs );
-   x86_jcc( func, cc_NE, inner_loop );
-
-   /* Restore EBX */
-   x86_pop( func, aos_input );
-}
-#endif
-
-#if 0
-static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
-{
-   struct x86_reg soa_output;
-   struct x86_reg aos_output;
-   struct x86_reg num_outputs;
-   struct x86_reg temp;
-   int inner_loop;
-
-   soa_output = x86_make_reg( file_REG32, reg_AX );
-   aos_output = x86_make_reg( file_REG32, reg_BX );
-   num_outputs = x86_make_reg( file_REG32, reg_CX );
-   temp = x86_make_reg( file_REG32, reg_DX );
-
-   /* Save EBX */
-   x86_push( func, aos_output );
-
-   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
-   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
-   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
-
-   /* do */
-   inner_loop = x86_get_label( func );
-   {
-      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
-      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
-      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
-      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
-
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
-      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
-      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
-      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
-
-      x86_mov( func, temp, x86_fn_arg( func, stride ) );
-      x86_push( func, aos_output );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_pop( func, aos_output );
-
-      /* Advance to next output */
-      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
-      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
-   }
-   /* while --num_outputs */
-   x86_dec( func, num_outputs );
-   x86_jcc( func, cc_NE, inner_loop );
-
-   /* Restore EBX */
-   x86_pop( func, aos_output );
-}
-#endif
 
 
 static void
@@ -2788,67 +716,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    emit_prologue(func);
 
-   /*
-    * Different function args for vertex/fragment shaders:
-    */
-#if 0
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-      /* DECLARATION phase, do not load output argument. */
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      /* skipping outputs argument here */
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_coef_base(),
-         x86_fn_arg( func, 5 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 6 ) );
-   }
-   else {
-      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
-
-      if (do_swizzles)
-         aos_to_soa( func, 
-                     6,         /* aos_input */
-                     1,         /* machine->input */
-                     7,         /* num_inputs */
-                     8 );       /* input_stride */
-
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      x86_mov(
-         func,
-         get_output_base(),
-         x86_fn_arg( func, 2 ) );
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 5 ) );
-   }
-#endif
-
    while (!tgsi_parse_end_of_tokens(&parse) && ok) {
       tgsi_parse_token(&parse);
 
@@ -2860,19 +727,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-#if 0
-            if( !instruction_phase ) {
-               /* INSTRUCTION phase, overwrite coeff with output. */
-               instruction_phase = TRUE;
-               x86_mov(
-                  func,
-                  get_output_base(),
-                  x86_fn_arg( func, 2 ) );
-            }
-#endif
-         }
-
          ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
 
 	 if (!ok) {
@@ -2909,13 +763,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
       }
    }
 
-#if 0
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
-      if (do_swizzles)
-         soa_to_aos( func, 9, 2, 10, 11 );
-   }
-#endif
-
    emit_epilogue(func);
 
    tgsi_parse_free( &parse );
-- 
cgit v1.2.3


From 77160cd97b7f2181b7953bcc8d13e86055b819e3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 15:34:16 -0600
Subject: gallium: var renaming in tgsi_ppc.c

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 36 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 432ec7459b..c1e707657b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -117,11 +117,11 @@ gen_get_bit31_vec(struct gen_context *gen)
 
 
 /**
- * Register fetch.
+ * Register fetch, put result in 'dst_vec'.
  */
 static void
 emit_fetch(struct gen_context *gen,
-           unsigned vec_reg,
+           unsigned dst_vec,
            const struct tgsi_full_src_register *reg,
            const unsigned chan_index)
 {
@@ -138,7 +138,7 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->inputs_reg, offset_reg);
+            ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -147,7 +147,7 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+            ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -156,7 +156,7 @@ emit_fetch(struct gen_context *gen,
             int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             ppc_li(gen->f, offset_reg, offset);
-            ppc_lvx(gen->f, vec_reg, gen->immed_reg, offset_reg);
+            ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -171,9 +171,9 @@ emit_fetch(struct gen_context *gen,
              * know that 'swizzle' tells us which vector slot will have the
              * loaded word.  The other vector slots will be undefined.
              */
-            ppc_lvewx(gen->f, vec_reg, gen->const_reg, offset_reg);
+            ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
             /* splat word[swizzle] across the vector reg */
-            ppc_vspltw(gen->f, vec_reg, vec_reg, swizzle);
+            ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
             ppc_release_register(gen->f, offset_reg);
          }
          break;
@@ -182,12 +182,12 @@ emit_fetch(struct gen_context *gen,
       }
       break;
    case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vload_float(gen->f, vec_reg, 0.0f);
+      ppc_vload_float(gen->f, dst_vec, 0.0f);
       break;
    case TGSI_EXTSWIZZLE_ONE:
       {
          int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, vec_reg, one_vec);
+         ppc_vecmove(gen->f, dst_vec, one_vec);
       }
       break;
    default:
@@ -202,15 +202,15 @@ emit_fetch(struct gen_context *gen,
          switch (sign_op) {
          case TGSI_UTIL_SIGN_CLEAR:
             /* vec = vec & ~bit31 */
-            ppc_vandc(gen->f, vec_reg, vec_reg, bit31_vec);
+            ppc_vandc(gen->f, dst_vec, dst_vec, bit31_vec);
             break;
          case TGSI_UTIL_SIGN_SET:
             /* vec = vec | bit31 */
-            ppc_vor(gen->f, vec_reg, vec_reg, bit31_vec);
+            ppc_vor(gen->f, dst_vec, dst_vec, bit31_vec);
             break;
          case TGSI_UTIL_SIGN_TOGGLE:
             /* vec = vec ^ bit31 */
-            ppc_vxor(gen->f, vec_reg, vec_reg, bit31_vec);
+            ppc_vxor(gen->f, dst_vec, dst_vec, bit31_vec);
             break;
          default:
             assert(0);
@@ -219,17 +219,17 @@ emit_fetch(struct gen_context *gen,
    }
 }
 
-#define FETCH( GEN, INST, VEC_REG, SRC_REG, CHAN ) \
-   emit_fetch( GEN, VEC_REG, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \
+   emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN )
 
 
 /**
- * Register store.
+ * Register store.  Store 'src_vec' at location indicated by 'reg'.
  */
 static void
 emit_store(struct gen_context *gen,
-           unsigned vec_reg,
+           unsigned src_vec,
            const struct tgsi_full_dst_register *reg,
            const struct tgsi_full_instruction *inst,
            unsigned chan_index)
@@ -240,7 +240,7 @@ emit_store(struct gen_context *gen,
          int offset_reg = ppc_allocate_register(gen->f);
          int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
          ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->outputs_reg, offset_reg);
+         ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
          ppc_release_register(gen->f, offset_reg);
       }
       break;
@@ -249,7 +249,7 @@ emit_store(struct gen_context *gen,
          int offset_reg = ppc_allocate_register(gen->f);
          int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
          ppc_li(gen->f, offset_reg, offset);
-         ppc_stvx(gen->f, vec_reg, gen->temps_reg, offset_reg);
+         ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
          ppc_release_register(gen->f, offset_reg);
       }
       break;
-- 
cgit v1.2.3


From 9e3ee82305b4602feca0253dc0e0c27f9bc9b05e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 16:57:22 -0600
Subject: gallium: PPC LIT instruction (not quite complete yet)

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 89 +++++++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 4 deletions(-)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index c1e707657b..edd535a884 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -535,12 +535,95 @@ emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
-/*
+
+/** Approximation for vr = pow(va, vb) */
+static void
+ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
+{
+   /* pow(a,b) ~= exp2(log2(a) * b) */
+   int t_vec = ppc_allocate_vec_register(f);
+   int zero_vec = ppc_allocate_vec_register(f);
+
+   ppc_vload_float(f, zero_vec, 0.0f);
+
+   ppc_vlogefp(f, t_vec, va);                   /* t = log2(va) */
+   ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec);  /* t = t * vb */
+   ppc_vexptefp(f, vr, t_vec);                  /* vr = 2^t */
+
+   ppc_release_vec_register(f, t_vec);
+   ppc_release_vec_register(f, zero_vec);
+}
+
+
 static void
 emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
+   int one_vec = gen_one_vec(gen);
+
+   /* Compute X */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_X);
+   }
+
+   /* Compute Y, Z */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int x_vec = ppc_allocate_vec_register(gen->f);
+      int zero_vec = ppc_allocate_vec_register(gen->f);
+
+      FETCH(gen, *inst, x_vec, 0, CHAN_X);        /* x_vec = src[0].x */
+
+      ppc_vload_float(gen->f, zero_vec, 0.0f);    /* zero = {0,0,0,0} */
+      ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+         STORE(gen, *inst, x_vec, 0, CHAN_Y);        /* store Y */
+      }
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+         int y_vec = ppc_allocate_vec_register(gen->f);
+         int z_vec = ppc_allocate_vec_register(gen->f);
+         int w_vec = ppc_allocate_vec_register(gen->f);
+         int pow_vec = ppc_allocate_vec_register(gen->f);
+         int pos_vec = ppc_allocate_vec_register(gen->f);
+         int c128_vec = ppc_allocate_vec_register(gen->f);
+
+         FETCH(gen, *inst, y_vec, 0, CHAN_Y);        /* y_vec = src[0].y */
+         ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
+
+         FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
+
+         /* XXX clamp Y to [-128, 128] */
+         ppc_vload_float(gen->f, c128_vec, 128.0f);
+
+         /* if temp.x > 0
+          *    pow(tmp.y, tmp.w)
+          * else
+          *   0.0
+          */
+
+         ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec);      /* pow = pow(y, w) */
+         ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
+         ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
+
+         STORE(gen, *inst, z_vec, 0, CHAN_Z);             /* store Z */
+
+         ppc_release_vec_register(gen->f, y_vec);
+         ppc_release_vec_register(gen->f, z_vec);
+         ppc_release_vec_register(gen->f, w_vec);
+         ppc_release_vec_register(gen->f, pow_vec);
+         ppc_release_vec_register(gen->f, pos_vec);
+      }
+
+      ppc_release_vec_register(gen->f, x_vec);
+      ppc_release_vec_register(gen->f, zero_vec);
+   }
+
+   /* Compute W */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      STORE(gen, *inst, one_vec, 0, CHAN_W);
+   }
 }
-*/
 
 
 static int
@@ -584,11 +667,9 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_DPH:
       emit_dotprod(gen, inst);
       break;
-      /*
    case TGSI_OPCODE_LIT:
       emit_lit(gen, inst);
       break;
-      */
    case TGSI_OPCODE_END:
       /* normal end */
       return 1;
-- 
cgit v1.2.3


From ae81aeb12868db219cbdc02437c481714cfed3f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 16:58:05 -0600
Subject: gallium: GALLIUM_NOPPC debug var to disable PPC codegen

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index edd535a884..9d7de41fe7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -776,15 +776,21 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
               float (*immediates)[4],
               boolean do_swizzles )
 {
+   static int use_ppc_asm = -1;
    struct tgsi_parse_context parse;
    /*boolean instruction_phase = FALSE;*/
    unsigned ok = 1;
    uint num_immediates = 0;
    struct gen_context gen;
 
-   util_init_math();
+   if (use_ppc_asm < 0) {
+      /* If GALLIUM_NOPPC is set, don't use PPC codegen */
+      use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE);
+   }
+   if (!use_ppc_asm)
+      return FALSE;
 
-   tgsi_parse_init( &parse, tokens );
+   util_init_math();
 
    gen.f = func;
    gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
@@ -797,6 +803,8 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    emit_prologue(func);
 
+   tgsi_parse_init( &parse, tokens );
+
    while (!tgsi_parse_end_of_tokens(&parse) && ok) {
       tgsi_parse_token(&parse);
 
-- 
cgit v1.2.3


From abbbe876ac98596b143da295abf6887e0a4e50d2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:19:12 -0600
Subject: gallium: new PPC built-in constants array

It's hard to form PPC vector immediates so load them from an array.
---
 src/gallium/auxiliary/draw/draw_vs_ppc.c |  8 +++--
 src/gallium/auxiliary/tgsi/tgsi_ppc.c    | 61 ++++++++++++++++++++++++++++----
 src/gallium/auxiliary/tgsi/tgsi_ppc.h    |  3 ++
 3 files changed, 63 insertions(+), 9 deletions(-)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index fcc9cbfec5..8eff6d4fda 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -55,7 +55,8 @@ typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
                                              float (*outputs)[4][4],
                                              float (*temps)[4][4],
                                              float (*immeds)[4][4],
-                                             float (*consts)[4]);
+                                             float (*consts)[4],
+                                             const float *builtins);
 
 #if 0
    const struct tgsi_exec_vector *input,
@@ -151,7 +152,8 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
 #else
       shader->func(inputs_soa, outputs_soa, temps_soa,
 		   (float (*)[4][4]) shader->base.immediates,
-		   (float (*)[4]) constants);
+		   (float (*)[4]) constants,
+                   ppc_builtin_constants);
 
       /*output[0][0] = input[0][0] * 0.5;*/
 #endif
@@ -246,7 +248,9 @@ draw_create_vs_ppc(struct draw_context *draw,
    return &vs->base;
 
 fail:
+   /*
    debug_error("tgsi_emit_ppc() failed, falling back to interpreter\n");
+   */
 
    ppc_release_func( &vs->ppc_program );
    
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 9d7de41fe7..6b05fd16cf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -36,6 +36,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_sse.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
@@ -44,6 +45,14 @@
 #include "rtasm/rtasm_ppc.h"
 
 
+/**
+ * Since it's pretty much impossible to form PPC vector immediates, load
+ * them from memory here:
+ */
+const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+   1.0f, -128.0f, 128.0, 0.0
+};
+
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
@@ -81,12 +90,46 @@ struct gen_context
    int temps_reg;     /**< GP register pointing to temporary "registers" */
    int immed_reg;     /**< GP register pointing to immediates buffer */
    int const_reg;     /**< GP register pointing to constants buffer */
+   int builtins_reg;  /**< GP register pointint to built-in constants */
 
    int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
    int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
 };
 
 
+/**
+ * Load the given vector register with {value, value, value, value}.
+ * The value must be in the ppu_builtin_constants[] array.
+ * We wouldn't need this if there was a simple way to load PPC vector
+ * registers with immediate values!
+ */
+static void
+load_constant_vec(struct gen_context *gen, int dst_vec, float value)
+{
+   uint pos;
+   for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
+      if (ppc_builtin_constants[pos] == value) {
+         int offset_reg = ppc_allocate_register(gen->f);
+         int offset = pos * 4;
+
+         ppc_li(gen->f, offset_reg, offset);
+         /* Load 4-byte word into vector register.
+          * The vector slot depends on the effective address we load from.
+          * We know that our builtins start at a 16-byte boundary so we
+          * know that 'swizzle' tells us which vector slot will have the
+          * loaded word.  The other vector slots will be undefined.
+          */
+         ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
+         /* splat word[pos % 4] across the vector reg */
+         ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
+         ppc_release_register(gen->f, offset_reg);
+         return;
+      }
+   }
+   assert(0 && "Need to add new constant to ppc_builtin_constants array");
+}
+
+
 /**
  * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
  */
@@ -95,7 +138,7 @@ gen_one_vec(struct gen_context *gen)
 {
    if (gen->one_vec < 0) {
       gen->one_vec = ppc_allocate_vec_register(gen->f);
-      ppc_vload_float(gen->f, gen->one_vec, 1.0f);
+      load_constant_vec(gen, gen->one_vec, 1.0f);
    }
    return gen->one_vec;
 }
@@ -115,7 +158,6 @@ gen_get_bit31_vec(struct gen_context *gen)
 }
 
 
-
 /**
  * Register fetch, put result in 'dst_vec'.
  */
@@ -182,7 +224,7 @@ emit_fetch(struct gen_context *gen,
       }
       break;
    case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vload_float(gen->f, dst_vec, 0.0f);
+      ppc_vzero(gen->f, dst_vec);
       break;
    case TGSI_EXTSWIZZLE_ONE:
       {
@@ -544,7 +586,7 @@ ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
    int t_vec = ppc_allocate_vec_register(f);
    int zero_vec = ppc_allocate_vec_register(f);
 
-   ppc_vload_float(f, zero_vec, 0.0f);
+   ppc_vzero(f, zero_vec);
 
    ppc_vlogefp(f, t_vec, va);                   /* t = log2(va) */
    ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec);  /* t = t * vb */
@@ -573,7 +615,7 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
       FETCH(gen, *inst, x_vec, 0, CHAN_X);        /* x_vec = src[0].x */
 
-      ppc_vload_float(gen->f, zero_vec, 0.0f);    /* zero = {0,0,0,0} */
+      ppc_vzero(gen->f, zero_vec);                /* zero = {0,0,0,0} */
       ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
 
       if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
@@ -586,7 +628,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          int w_vec = ppc_allocate_vec_register(gen->f);
          int pow_vec = ppc_allocate_vec_register(gen->f);
          int pos_vec = ppc_allocate_vec_register(gen->f);
-         int c128_vec = ppc_allocate_vec_register(gen->f);
+         int p128_vec = ppc_allocate_vec_register(gen->f);
+         int n128_vec = ppc_allocate_vec_register(gen->f);
 
          FETCH(gen, *inst, y_vec, 0, CHAN_Y);        /* y_vec = src[0].y */
          ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
@@ -594,7 +637,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
 
          /* XXX clamp Y to [-128, 128] */
-         ppc_vload_float(gen->f, c128_vec, 128.0f);
+         load_constant_vec(gen, p128_vec, 128.0f);
+         load_constant_vec(gen, n128_vec, -128.0f);
 
          /* if temp.x > 0
           *    pow(tmp.y, tmp.w)
@@ -613,6 +657,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          ppc_release_vec_register(gen->f, w_vec);
          ppc_release_vec_register(gen->f, pow_vec);
          ppc_release_vec_register(gen->f, pos_vec);
+         ppc_release_vec_register(gen->f, p128_vec);
+         ppc_release_vec_register(gen->f, n128_vec);
       }
 
       ppc_release_vec_register(gen->f, x_vec);
@@ -798,6 +844,7 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
    gen.immed_reg = ppc_reserve_register(func, 6);
    gen.const_reg = ppc_reserve_register(func, 7);
+   gen.builtins_reg = ppc_reserve_register(func, 8);
    gen.one_vec = -1;
    gen.bit31_vec = -1;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
index 7cd2bf9aff..829ec075e7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
@@ -35,6 +35,9 @@ extern "C" {
 struct tgsi_token;
 struct ppc_function;
 
+extern const float ppc_builtin_constants[];
+
+
 boolean
 tgsi_emit_ppc(const struct tgsi_token *tokens,
               struct ppc_function *function,
-- 
cgit v1.2.3


From f8ab4feb75f4a592e23859813c093dcdbd4b8988 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:21:43 -0600
Subject: gallium: remove ppc_vload_float(), rename ppc_vecmove() ->
 ppc_vmove().

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 19 +------------------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  6 +-----
 src/gallium/auxiliary/tgsi/tgsi_ppc.c   |  2 +-
 3 files changed, 3 insertions(+), 24 deletions(-)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 51d9b53657..7dd8263749 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -603,23 +603,6 @@ ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
    emit_x(p, 31, vr, ra, rb, 71);
 }
 
-/** vector load float: vr = splats(imm) */
-void
-ppc_vload_float(struct ppc_function *p, uint vr, float imm)
-{
-   if (imm == 0.0f) {
-      ppc_vxor(p, vr, vr, vr);
-   }
-   else if (imm == 1.0f) {
-      /* use 2^0=1 to get 1.0 */
-      ppc_vxor(p, vr, vr, vr);  /* vr = {0,0,0,0} */
-      ppc_vexptefp(p, vr, vr);  /* vr = 0^0 */
-   }
-   else {
-      assert(0);
-   }
-}
-
 
 
@@ -664,7 +647,7 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
 
 /** Pseudo-instruction: vector move */
 void
-ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
+ppc_vmove(struct ppc_function *p, uint vD, uint vA)
 {
    ppc_vor(p, vD, vA, vA);
 }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index f194d3be13..f938d8d759 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -158,10 +158,6 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
-/** vector load float: vr = splats(imm) */
-extern void
-ppc_vload_float(struct ppc_function *p, uint vr, float imm);
-
 
 
 /**
@@ -191,7 +187,7 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 
 /** Pseudo-instruction: vector move */
 extern void
-ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
+ppc_vmove(struct ppc_function *p, uint vD, uint vA);
 
 /** Set vector register to {0,0,0,0} */
 extern void
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 6b05fd16cf..96beec0cc6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -229,7 +229,7 @@ emit_fetch(struct gen_context *gen,
    case TGSI_EXTSWIZZLE_ONE:
       {
          int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, dst_vec, one_vec);
+         ppc_vmove(gen->f, dst_vec, one_vec);
       }
       break;
    default:
-- 
cgit v1.2.3


From 0ac99457811eb766e9bdd3903857b5c0fdef7694 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:29:37 -0600
Subject: gallium: PPC: clamp y to [-128,128] for LIT

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/gallium/auxiliary/tgsi/tgsi_ppc.c')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 96beec0cc6..9ad7ecd7cf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -636,16 +636,17 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
          FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
 
-         /* XXX clamp Y to [-128, 128] */
+         /* clamp Y to [-128, 128] */
          load_constant_vec(gen, p128_vec, 128.0f);
          load_constant_vec(gen, n128_vec, -128.0f);
+         ppc_vmaxfp(gen->f, y_vec, y_vec, n128_vec); /* y = max(y, -128) */
+         ppc_vminfp(gen->f, y_vec, y_vec, p128_vec); /* y = min(y, 128) */
 
          /* if temp.x > 0
-          *    pow(tmp.y, tmp.w)
+          *    z = pow(tmp.y, tmp.w)
           * else
-          *   0.0
+          *    z = 0.0
           */
-
          ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec);      /* pow = pow(y, w) */
          ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
          ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
-- 
cgit v1.2.3