7 files changed, 243 insertions, 805 deletions
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
index 78e7dec569..29e104bbd1 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@ -287,10 +287,10 @@ micro_abs(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) fabs( (double) src->f[0] );
-   dst->f[1] = (float) fabs( (double) src->f[1] );
-   dst->f[2] = (float) fabs( (double) src->f[2] );
-   dst->f[3] = (float) fabs( (double) src->f[3] );
+   dst->f[0] = fabsf( src->f[0] );
+   dst->f[1] = fabsf( src->f[1] );
+   dst->f[2] = fabsf( src->f[2] );
+   dst->f[3] = fabsf( src->f[3] );
 }
 
 static void
@@ -334,10 +334,10 @@ micro_ceil(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) ceil( (double) src->f[0] );
-   dst->f[1] = (float) ceil( (double) src->f[1] );
-   dst->f[2] = (float) ceil( (double) src->f[2] );
-   dst->f[3] = (float) ceil( (double) src->f[3] );
+   dst->f[0] = ceilf( src->f[0] );
+   dst->f[1] = ceilf( src->f[1] );
+   dst->f[2] = ceilf( src->f[2] );
+   dst->f[3] = ceilf( src->f[3] );
 }
 
 static void
@@ -345,10 +345,10 @@ micro_cos(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) cos( (double) src->f[0] );
-   dst->f[1] = (float) cos( (double) src->f[1] );
-   dst->f[2] = (float) cos( (double) src->f[2] );
-   dst->f[3] = (float) cos( (double) src->f[3] );
+   dst->f[0] = cosf( src->f[0] );
+   dst->f[1] = cosf( src->f[1] );
+   dst->f[2] = cosf( src->f[2] );
+   dst->f[3] = cosf( src->f[3] );
 }
 
 static void
@@ -430,10 +430,10 @@ micro_exp2(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src)
 {
-   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
-   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
-   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
-   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+   dst->f[0] = powf( 2.0f, src->f[0] );
+   dst->f[1] = powf( 2.0f, src->f[1] );
+   dst->f[2] = powf( 2.0f, src->f[2] );
+   dst->f[3] = powf( 2.0f, src->f[3] );
 }
 
 static void
@@ -463,10 +463,10 @@ micro_flr(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) floor( (double) src->f[0] );
-   dst->f[1] = (float) floor( (double) src->f[1] );
-   dst->f[2] = (float) floor( (double) src->f[2] );
-   dst->f[3] = (float) floor( (double) src->f[3] );
+   dst->f[0] = floorf( src->f[0] );
+   dst->f[1] = floorf( src->f[1] );
+   dst->f[2] = floorf( src->f[2] );
+   dst->f[3] = floorf( src->f[3] );
 }
 
 static void
@@ -474,10 +474,10 @@ micro_frc(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
-   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
-   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
-   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+   dst->f[0] = src->f[0] - floorf( src->f[0] );
+   dst->f[1] = src->f[1] - floorf( src->f[1] );
+   dst->f[2] = src->f[2] - floorf( src->f[2] );
+   dst->f[3] = src->f[3] - floorf( src->f[3] );
 }
 
 static void
@@ -510,10 +510,10 @@ micro_lg2(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
-   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
-   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
-   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+   dst->f[0] = logf( src->f[0] ) * 1.442695f;
+   dst->f[1] = logf( src->f[1] ) * 1.442695f;
+   dst->f[2] = logf( src->f[2] ) * 1.442695f;
+   dst->f[3] = logf( src->f[3] ) * 1.442695f;
 }
 
 static void
@@ -764,10 +764,10 @@ micro_pow(
    const union tgsi_exec_channel *src0,
    const union tgsi_exec_channel *src1 )
 {
-   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
-   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
-   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
-   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+   dst->f[0] = powf( src0->f[0], src1->f[0] );
+   dst->f[1] = powf( src0->f[1], src1->f[1] );
+   dst->f[2] = powf( src0->f[2], src1->f[2] );
+   dst->f[3] = powf( src0->f[3], src1->f[3] );
 }
 
 static void
@@ -775,10 +775,10 @@ micro_rnd(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
-   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
-   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
-   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+   dst->f[0] = floorf( src->f[0] + 0.5f );
+   dst->f[1] = floorf( src->f[1] + 0.5f );
+   dst->f[2] = floorf( src->f[2] + 0.5f );
+   dst->f[3] = floorf( src->f[3] + 0.5f );
 }
 
 static void
@@ -833,20 +833,20 @@ micro_sin(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) sin( (double) src->f[0] );
-   dst->f[1] = (float) sin( (double) src->f[1] );
-   dst->f[2] = (float) sin( (double) src->f[2] );
-   dst->f[3] = (float) sin( (double) src->f[3] );
+   dst->f[0] = sinf( src->f[0] );
+   dst->f[1] = sinf( src->f[1] );
+   dst->f[2] = sinf( src->f[2] );
+   dst->f[3] = sinf( src->f[3] );
 }
 
 static void
 micro_sqrt( union tgsi_exec_channel *dst,
             const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) sqrt( (double) src->f[0] );
-   dst->f[1] = (float) sqrt( (double) src->f[1] );
-   dst->f[2] = (float) sqrt( (double) src->f[2] );
-   dst->f[3] = (float) sqrt( (double) src->f[3] );
+   dst->f[0] = sqrtf( src->f[0] );
+   dst->f[1] = sqrtf( src->f[1] );
+   dst->f[2] = sqrtf( src->f[2] );
+   dst->f[3] = sqrtf( src->f[3] );
 }
 
 static void
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
index 45c49dd007..92e2e5e985 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
@@ -166,7 +166,7 @@ struct tgsi_exec_machine
 
    float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
    unsigned                      ImmLimit;
-   float                         (*Consts)[4];
+   const float                   (*Consts)[4];
    struct tgsi_exec_vector       *Inputs;
    struct tgsi_exec_vector       *Outputs;
    const struct tgsi_token       *Tokens;
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index c37e201b2b..c3295a27ff 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -36,113 +36,8 @@
 
 #if defined(__i386__) || defined(__386__)
 
-#define DUMP_SSE  0
+#define HIGH_PRECISION 1  /* for 1/sqrt() */
 
-#if DUMP_SSE
-
-static void
-_print_reg(
-   struct x86_reg reg )
-{
-   if (reg.mod != mod_REG) 
-      debug_printf( "[" );
-      
-   switch( reg.file ) {
-   case file_REG32:
-      switch( reg.idx ) {
-      case reg_AX:
-         debug_printf( "EAX" );
-         break;
-      case reg_CX:
-         debug_printf( "ECX" );
-         break;
-      case reg_DX:
-         debug_printf( "EDX" );
-         break;
-      case reg_BX:
-         debug_printf( "EBX" );
-         break;
-      case reg_SP:
-         debug_printf( "ESP" );
-         break;
-      case reg_BP:
-         debug_printf( "EBP" );
-         break;
-      case reg_SI:
-         debug_printf( "ESI" );
-         break;
-      case reg_DI:
-         debug_printf( "EDI" );
-         break;
-      }
-      break;
-   case file_MMX:
-      assert( 0 );
-      break;
-   case file_XMM:
-      debug_printf( "XMM%u", reg.idx );
-      break;
-   case file_x87:
-      assert( 0 );
-      break;
-   }
-
-   if (reg.mod == mod_DISP8 ||
-       reg.mod == mod_DISP32)
-      debug_printf("+%d", reg.disp);
-
-   if (reg.mod != mod_REG) 
-      debug_printf( "]" );
-}
-
-static void
-_fill(
-   const char  *op )
-{
-   unsigned count = 10 - strlen( op );
-
-   while( count-- ) {
-      debug_printf( " " );
-   }
-}
-
-#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
-#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) debug_printf( "\n%s", OP )
-#define DUMP_I( OP, I ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   debug_printf( "%u", I ); } while( 0 )
-#define DUMP_R( OP, R0 ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 ); } while( 0 )
-#define DUMP_RR( OP, R0, R1 ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 );\
-   debug_printf( ", " );\
-   _print_reg( R1 ); } while( 0 )
-#define DUMP_RRI( OP, R0, R1, I ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 );\
-   debug_printf( ", " );\
-   _print_reg( R1 );\
-   debug_printf( ", " );\
-   debug_printf( "%u", I ); } while( 0 )
-
-#else
-
-#define DUMP_START()
-#define DUMP_END()
-#define DUMP( OP )
-#define DUMP_I( OP, I )
-#define DUMP_R( OP, R0 )
-#define DUMP_RR( OP, R0, R1 )
-#define DUMP_RRI( OP, R0, R1, I )
-
-#endif
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for( CHAN = 0; CHAN < 4; CHAN++ )
@@ -308,200 +203,6 @@ get_coef(
       ((vec * 3 + member) * 4 + chan) * 4 );
 }
 
-/**
- * X86 rtasm wrappers.
- */
-
-static void
-emit_addps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ADDPS", dst, src );
-   sse_addps( func, dst, src );
-}
-
-static void
-emit_andnps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ANDNPS", dst, src );
-   sse_andnps( func, dst, src );
-}
-
-static void
-emit_andps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ANDPS", dst, src );
-   sse_andps( func, dst, src );
-}
-
-static void
-emit_call(
-   struct x86_function  *func,
-   void                 (* addr)() )
-{
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-   DUMP_I( "CALL", addr );
-   x86_mov_reg_imm( func, ecx, (unsigned long) addr );
-   x86_call( func, ecx );
-}
-
-static void
-emit_cmpps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src,
-   enum sse_cc          cc )
-{
-   DUMP_RRI( "CMPPS", dst, src, cc );
-   sse_cmpps( func, dst, src, cc );
-}
-
-static void
-emit_cvttps2dq(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "CVTTPS2DQ", dst, src );
-   sse2_cvttps2dq( func, dst, src );
-}
-
-static void
-emit_maxps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MAXPS", dst, src );
-   sse_maxps( func, dst, src );
-}
-
-static void
-emit_minps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MINPS", dst, src );
-   sse_minps( func, dst, src );
-}
-
-static void
-emit_mov(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOV", dst, src );
-   x86_mov( func, dst, src );
-}
-
-static void
-emit_movaps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVAPS", dst, src );
-   sse_movaps( func, dst, src );
-}
-
-static void
-emit_movss(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVSS", dst, src );
-   sse_movss( func, dst, src );
-}
-
-static void
-emit_movups(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVUPS", dst, src );
-   sse_movups( func, dst, src );
-}
-
-static void
-emit_mulps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MULPS", dst, src );
-   sse_mulps( func, dst, src );
-}
-
-static void
-emit_or(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "OR", dst, src );
-   x86_or( func, dst, src );
-}
-
-static void
-emit_orps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ORPS", dst, src );
-   sse_orps( func, dst, src );
-}
-
-static void
-emit_pmovmskb(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "PMOVMSKB", dst, src );
-   sse_pmovmskb( func, dst, src );
-}
-
-static void
-emit_pop(
-   struct x86_function  *func,
-   struct x86_reg       dst )
-{
-   DUMP_R( "POP", dst );
-   x86_pop( func, dst );
-}
-
-static void
-emit_push(
-   struct x86_function  *func,
-   struct x86_reg       dst )
-{
-   DUMP_R( "PUSH", dst );
-   x86_push( func, dst );
-}
-
-static void
-emit_rcpps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "RCPPS", dst, src );
-   sse2_rcpps( func, dst, src );
-}
 
 #ifdef WIN32
 static void
@@ -509,7 +210,6 @@ emit_retw(
    struct x86_function  *func,
    unsigned             size )
 {
-   DUMP_I( "RET", size );
    x86_retw( func, size );
 }
 #else
@@ -517,51 +217,10 @@ static void
 emit_ret(
    struct x86_function  *func )
 {
-   DUMP( "RET" );
    x86_ret( func );
 }
 #endif
 
-static void
-emit_rsqrtps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "RSQRTPS", dst, src );
-   sse_rsqrtps( func, dst, src );
-}
-
-static void
-emit_shufps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src,
-   unsigned char        shuf )
-{
-   DUMP_RRI( "SHUFPS", dst, src, shuf );
-   sse_shufps( func, dst, src, shuf );
-}
-
-static void
-emit_subps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "SUBPS", dst, src );
-   sse_subps( func, dst, src );
-}
-
-static void
-emit_xorps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "XORPS", dst, src );
-   sse_xorps( func, dst, src );
-}
 
 /**
  * Data fetch helpers.
@@ -580,11 +239,11 @@ emit_const(
    unsigned vec,
    unsigned chan )
 {
-   emit_movss(
+   sse_movss(
       func,
       make_xmm( xmm ),
       get_const( vec, chan ) );
-   emit_shufps(
+   sse_shufps(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ),
@@ -598,11 +257,11 @@ emit_immediate(
    unsigned vec,
    unsigned chan )
 {
-   emit_movss(
+   sse_movss(
       func,
       make_xmm( xmm ),
       get_immediate( vec, chan ) );
-   emit_shufps(
+   sse_shufps(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ),
@@ -623,7 +282,7 @@ emit_inputf(
    unsigned vec,
    unsigned chan )
 {
-   emit_movups(
+   sse_movups(
       func,
       make_xmm( xmm ),
       get_input( vec, chan ) );
@@ -642,7 +301,7 @@ emit_output(
    unsigned vec,
    unsigned chan )
 {
-   emit_movups(
+   sse_movups(
       func,
       get_output( vec, chan ),
       make_xmm( xmm ) );
@@ -661,7 +320,7 @@ emit_tempf(
    unsigned vec,
    unsigned chan )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       make_xmm( xmm ),
       get_temp( vec, chan ) );
@@ -682,11 +341,11 @@ emit_coef(
    unsigned chan,
    unsigned member )
 {
-   emit_movss(
+   sse_movss(
       func,
       make_xmm( xmm ),
       get_coef( vec, chan, member ) );
-   emit_shufps(
+   sse_shufps(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ),
@@ -704,7 +363,7 @@ emit_inputs(
    unsigned vec,
    unsigned chan )
 {
-   emit_movups(
+   sse_movups(
       func,
       get_input( vec, chan ),
       make_xmm( xmm ) );
@@ -717,7 +376,7 @@ emit_temps(
    unsigned vec,
    unsigned chan )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       get_temp( vec, chan ),
       make_xmm( xmm ) );
@@ -794,39 +453,39 @@ static void
 emit_push_gp(
    struct x86_function *func )
 {
-   emit_push(
+   x86_push(
       func,
       get_const_base() );
-   emit_push(
+   x86_push(
       func,
       get_input_base() );
-   emit_push(
+   x86_push(
       func,
       get_output_base() );
 
    /* It is important on non-win32 platforms that temp base is pushed last.
     */
-   emit_push(
+   x86_push(
       func,
       get_temp_base() );
 }
 
 static void
-emit_pop_gp(
+x86_pop_gp(
    struct x86_function *func )
 {
    /* Restore GP registers in a reverse order.
     */
-   emit_pop(
+   x86_pop(
       func,
       get_temp_base() );
-   emit_pop(
+   x86_pop(
       func,
       get_output_base() );
-   emit_pop(
+   x86_pop(
       func,
       get_input_base() );
-   emit_pop(
+   x86_pop(
       func,
       get_const_base() );
 }
@@ -837,7 +496,7 @@ emit_func_call_dst(
    unsigned xmm_dst,
    void (*code)() )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       get_temp( TEMP_R0, 0 ),
       make_xmm( xmm_dst ) );
@@ -846,19 +505,22 @@ emit_func_call_dst(
       func );
 
 #ifdef WIN32
-   emit_push(
+   x86_push(
       func,
       get_temp( TEMP_R0, 0 ) );
 #endif
 
-   emit_call(
-      func,
-      code );
+   {
+      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+
+      x86_mov_reg_imm( func, ecx, (unsigned long) code );
+      x86_call( func, ecx );
+   }
 
-   emit_pop_gp(
+   x86_pop_gp(
       func );
 
-   emit_movaps(
+   sse_movaps(
       func,
       make_xmm( xmm_dst ),
       get_temp( TEMP_R0, 0 ) );
@@ -871,7 +533,7 @@ emit_func_call_dst_src(
    unsigned xmm_src,
    void (*code)() )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       get_temp( TEMP_R0, 1 ),
       make_xmm( xmm_src ) );
@@ -891,7 +553,7 @@ emit_abs(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_andps(
+   sse_andps(
       func,
       make_xmm( xmm ),
       get_temp(
@@ -905,7 +567,7 @@ emit_add(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_addps(
+   sse_addps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -916,17 +578,15 @@ cos4f(
    float *store )
 {
 #ifdef WIN32
-   store[0] = (float) cos( (double) store[0] );
-   store[1] = (float) cos( (double) store[1] );
-   store[2] = (float) cos( (double) store[2] );
-   store[3] = (float) cos( (double) store[3] );
+   const unsigned X = 0;
 #else
    const unsigned X = TEMP_R0 * 16;
+#endif
+
    store[X + 0] = cosf( store[X + 0] );
    store[X + 1] = cosf( store[X + 1] );
    store[X + 2] = cosf( store[X + 2] );
    store[X + 3] = cosf( store[X + 3] );
-#endif
 }
 
 static void
@@ -945,17 +605,14 @@ ex24f(
    float *store )
 {
 #ifdef WIN32
-   store[0] = (float) pow( 2.0, (double) store[0] );
-   store[1] = (float) pow( 2.0, (double) store[1] );
-   store[2] = (float) pow( 2.0, (double) store[2] );
-   store[3] = (float) pow( 2.0, (double) store[3] );
+   const unsigned X = 0;
 #else
    const unsigned X = TEMP_R0 * 16;
+#endif
    store[X + 0] = powf( 2.0f, store[X + 0] );
    store[X + 1] = powf( 2.0f, store[X + 1] );
    store[X + 2] = powf( 2.0f, store[X + 2] );
    store[X + 3] = powf( 2.0f, store[X + 3] );
-#endif
 }
 
 static void
@@ -974,7 +631,7 @@ emit_f2it(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_cvttps2dq(
+   sse2_cvttps2dq(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ) );
@@ -989,10 +646,10 @@ flr4f(
 #else
    const unsigned X = TEMP_R0 * 16;
 #endif
-   store[X + 0] = (float) floor( (double) store[X + 0] );
-   store[X + 1] = (float) floor( (double) store[X + 1] );
-   store[X + 2] = (float) floor( (double) store[X + 2] );
-   store[X + 3] = (float) floor( (double) store[X + 3] );
+   store[X + 0] = floorf( store[X + 0] );
+   store[X + 1] = floorf( store[X + 1] );
+   store[X + 2] = floorf( store[X + 2] );
+   store[X + 3] = floorf( store[X + 3] );
 }
 
 static void
@@ -1015,10 +672,10 @@ frc4f(
 #else
    const unsigned X = TEMP_R0 * 16;
 #endif
-   store[X + 0] -= (float) floor( (double) store[X + 0] );
-   store[X + 1] -= (float) floor( (double) store[X + 1] );
-   store[X + 2] -= (float) floor( (double) store[X + 2] );
-   store[X + 3] -= (float) floor( (double) store[X + 3] );
+   store[X + 0] -= floorf( store[X + 0] );
+   store[X + 1] -= floorf( store[X + 1] );
+   store[X + 2] -= floorf( store[X + 2] );
+   store[X + 3] -= floorf( store[X + 3] );
 }
 
 static void
@@ -1064,7 +721,7 @@ emit_MOV(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_movups(
+   sse_movups(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1075,7 +732,7 @@ emit_mul (struct x86_function *func,
           unsigned xmm_dst,
           unsigned xmm_src)
 {
-   emit_mulps(
+   sse_mulps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1086,7 +743,7 @@ emit_neg(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_xorps(
+   sse_xorps(
       func,
       make_xmm( xmm ),
       get_temp(
@@ -1099,17 +756,14 @@ pow4f(
    float *store )
 {
 #ifdef WIN32
-   store[0] = (float) pow( (double) store[0], (double) store[4] );
-   store[1] = (float) pow( (double) store[1], (double) store[5] );
-   store[2] = (float) pow( (double) store[2], (double) store[6] );
-   store[3] = (float) pow( (double) store[3], (double) store[7] );
+   const unsigned X = 0;
 #else
    const unsigned X = TEMP_R0 * 16;
+#endif
    store[X + 0] = powf( store[X + 0], store[X + 4] );
    store[X + 1] = powf( store[X + 1], store[X + 5] );
    store[X + 2] = powf( store[X + 2], store[X + 6] );
    store[X + 3] = powf( store[X + 3], store[X + 7] );
-#endif
 }
 
 static void
@@ -1131,22 +785,80 @@ emit_rcp (
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_rcpps(
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.  Need to either emit a proper divide or use the
+    * iterative technique described below in emit_rsqrt().
+    */
+   sse2_rcpps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
 }
 
+#if HIGH_PRECISION  /* C helper called from generated code by emit_rsqrt() */
+static void XSTDCALL
+rsqrt4f(
+   float *store )
+{
+#ifdef WIN32
+   const unsigned X = 0;
+#else
+   const unsigned X = TEMP_R0 * 16;
+#endif
+   store[X + 0] = 1.0F / sqrtf( store[X + 4] );  /* operand: xmm_src, saved by      */
+   store[X + 1] = 1.0F / sqrtf( store[X + 5] );  /* emit_func_call_dst_src() in     */
+   store[X + 2] = 1.0F / sqrtf( store[X + 6] );  /* TEMP_R0 chan 1; result goes to  */
+   store[X + 3] = 1.0F / sqrtf( store[X + 7] );  /* chan 0, reloaded into xmm_dst.  */
+}
+#endif
+
 static void
 emit_rsqrt(
    struct x86_function *func,
    unsigned xmm_dst,
    unsigned xmm_src )
 {
+#if HIGH_PRECISION  /* full-precision 1/sqrt() computed by the rsqrt4f() C helper */
+#if 1  /* the Newton-Raphson alternative in the #else branch is notes only, no code yet */
+   emit_func_call_dst_src(
+      func,
+      xmm_dst,
+      xmm_src,
+      rsqrt4f );
+#else
+   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
+    * implementations, it is possible to improve its precision at
+    * fairly low cost, using a newton/raphson step, as below:
+    * 
+    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+    *
+    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+    */
+   /* This is some code that would do the above for a scalar 'a'.  We
+    * obviously are interested in a vector version:
+    *
+    * movss   xmm3, a;
+    * movss   xmm1, half;
+    * movss   xmm2, three;
+    * rsqrtss xmm0, xmm3;
+    * mulss   xmm3, xmm0;
+    * mulss   xmm1, xmm0;
+    * mulss   xmm3, xmm0;
+    * subss   xmm2, xmm3;
+    * mulss   xmm1, xmm2;
+    * movss   x,    xmm1;
+    */
+#endif
+#else
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.
+    */
-   emit_rsqrtps(
+   sse_rsqrtps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
+#endif
 }
 
 static void
@@ -1154,7 +866,7 @@ emit_setsign(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_orps(
+   sse_orps(
       func,
       make_xmm( xmm ),
       get_temp(
@@ -1167,17 +879,14 @@ sin4f(
    float *store )
 {
 #ifdef WIN32
-   store[0] = (float) sin( (double) store[0] );
-   store[1] = (float) sin( (double) store[1] );
-   store[2] = (float) sin( (double) store[2] );
-   store[3] = (float) sin( (double) store[3] );
+   const unsigned X = 0;
 #else
    const unsigned X = TEMP_R0 * 16;
+#endif
    store[X + 0] = sinf( store[X + 0] );
    store[X + 1] = sinf( store[X + 1] );
    store[X + 2] = sinf( store[X + 2] );
    store[X + 3] = sinf( store[X + 3] );
-#endif
 }
 
 static void
@@ -1196,7 +905,7 @@ emit_sub(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_subps(
+   sse_subps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1405,16 +1114,16 @@ emit_kil(
       }
    }
 
-   emit_push(
+   x86_push(
       func,
       x86_make_reg( file_REG32, reg_AX ) );
-   emit_push(
+   x86_push(
       func,
       x86_make_reg( file_REG32, reg_DX ) );
 
    FOR_EACH_CHANNEL( chan_index ) {
       if( uniquemask & (1 << chan_index) ) {
-         emit_cmpps(
+         sse_cmpps(
             func,
             make_xmm( registers[chan_index] ),
             get_temp(
@@ -1423,17 +1132,17 @@ emit_kil(
             cc_LessThan );
 
          if( chan_index == firstchan ) {
-            emit_pmovmskb(
+            sse_pmovmskb(
                func,
                x86_make_reg( file_REG32, reg_AX ),
                make_xmm( registers[chan_index] ) );
          }
          else {
-            emit_pmovmskb(
+            sse_pmovmskb(
                func,
                x86_make_reg( file_REG32, reg_DX ),
                make_xmm( registers[chan_index] ) );
-            emit_or(
+            x86_or(
                func,
                x86_make_reg( file_REG32, reg_AX ),
                x86_make_reg( file_REG32, reg_DX ) );
@@ -1441,17 +1150,17 @@ emit_kil(
       }
    }
 
-   emit_or(
+   x86_or(
       func,
       get_temp(
          TGSI_EXEC_TEMP_KILMASK_I,
          TGSI_EXEC_TEMP_KILMASK_C ),
       x86_make_reg( file_REG32, reg_AX ) );
 
-   emit_pop(
+   x86_pop(
       func,
       x86_make_reg( file_REG32, reg_DX ) );
-   emit_pop(
+   x86_pop(
       func,
       x86_make_reg( file_REG32, reg_AX ) );
 }
@@ -1467,12 +1176,12 @@ emit_setcc(
    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
       FETCH( func, *inst, 0, 0, chan_index );
       FETCH( func, *inst, 1, 1, chan_index );
-      emit_cmpps(
+      sse_cmpps(
          func,
          make_xmm( 0 ),
          make_xmm( 1 ),
          cc );
-      emit_andps(
+      sse_andps(
          func,
          make_xmm( 0 ),
          get_temp(
@@ -1493,22 +1202,22 @@ emit_cmp(
       FETCH( func, *inst, 0, 0, chan_index );
       FETCH( func, *inst, 1, 1, chan_index );
       FETCH( func, *inst, 2, 2, chan_index );
-      emit_cmpps(
+      sse_cmpps(
          func,
          make_xmm( 0 ),
          get_temp(
             TGSI_EXEC_TEMP_00000000_I,
             TGSI_EXEC_TEMP_00000000_C ),
          cc_LessThan );
-      emit_andps(
+      sse_andps(
          func,
          make_xmm( 1 ),
          make_xmm( 0 ) );
-      emit_andnps(
+      sse_andnps(
          func,
          make_xmm( 0 ),
          make_xmm( 2 ) );
-      emit_orps(
+      sse_orps(
          func,
          make_xmm( 0 ),
          make_xmm( 1 ) );
@@ -1559,7 +1268,7 @@ emit_instruction(
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
             FETCH( func, *inst, 0, 0, CHAN_X );
-            emit_maxps(
+            sse_maxps(
                func,
                make_xmm( 0 ),
                get_temp(
@@ -1568,21 +1277,26 @@ emit_instruction(
             STORE( func, *inst, 0, 0, CHAN_Y );
          }
          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+            /* XMM[1] = SrcReg[0].yyyy */
             FETCH( func, *inst, 1, 0, CHAN_Y );
-            emit_maxps(
+            /* XMM[1] = max(XMM[1], 0) */
+            sse_maxps(
                func,
                make_xmm( 1 ),
                get_temp(
                   TGSI_EXEC_TEMP_00000000_I,
                   TGSI_EXEC_TEMP_00000000_C ) );
+            /* XMM[2] = SrcReg[0].wwww */
             FETCH( func, *inst, 2, 0, CHAN_W );
-            emit_minps(
+            /* XMM[2] = min(XMM[2], 128.0) */
+            sse_minps(
                func,
                make_xmm( 2 ),
                get_temp(
                   TGSI_EXEC_TEMP_128_I,
                   TGSI_EXEC_TEMP_128_C ) );
-            emit_maxps(
+            /* XMM[2] = max(XMM[2], -128.0) */
+            sse_maxps(
                func,
                make_xmm( 2 ),
                get_temp(
@@ -1590,16 +1304,16 @@ emit_instruction(
                   TGSI_EXEC_TEMP_MINUS_128_C ) );
             emit_pow( func, 1, 2 );
             FETCH( func, *inst, 0, 0, CHAN_X );
-            emit_xorps(
+            sse_xorps(
                func,
                make_xmm( 2 ),
                make_xmm( 2 ) );
-            emit_cmpps(
+            sse_cmpps(
                func,
                make_xmm( 2 ),
                make_xmm( 0 ),
                cc_LessThanEqual );
-            emit_andps(
+            sse_andps(
                func,
                make_xmm( 2 ),
                make_xmm( 1 ) );
@@ -1721,7 +1435,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          FETCH( func, *inst, 1, 1, chan_index );
-         emit_minps(
+         sse_minps(
             func,
             make_xmm( 0 ),
             make_xmm( 1 ) );
@@ -1733,7 +1447,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          FETCH( func, *inst, 1, 1, chan_index );
-         emit_maxps(
+         sse_maxps(
             func,
             make_xmm( 0 ),
             make_xmm( 1 ) );
@@ -2332,7 +2046,7 @@ emit_declaration(
  */
 unsigned
 tgsi_emit_sse2(
-   struct tgsi_token *tokens,
+   const struct tgsi_token *tokens,
    struct x86_function *func,
    float (*immediates)[4])
 {
@@ -2341,8 +2055,6 @@ tgsi_emit_sse2(
    unsigned ok = 1;
    uint num_immediates = 0;
 
-   DUMP_START();
-
    func->csr = func->store;
 
    tgsi_parse_init( &parse, tokens );
@@ -2352,24 +2064,24 @@ tgsi_emit_sse2(
     */
    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
       /* DECLARATION phase, do not load output argument. */
-      emit_mov(
+      x86_mov(
          func,
          get_input_base(),
          get_argument( 0 ) );
       /* skipping outputs argument here */
-      emit_mov(
+      x86_mov(
          func,
          get_const_base(),
          get_argument( 2 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_temp_base(),
          get_argument( 3 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_coef_base(),
          get_argument( 4 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_immediate_base(),
          get_argument( 5 ) );
@@ -2377,23 +2089,23 @@ tgsi_emit_sse2(
    else {
       assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
 
-      emit_mov(
+      x86_mov(
          func,
          get_input_base(),
          get_argument( 0 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_output_base(),
          get_argument( 1 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_const_base(),
          get_argument( 2 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_temp_base(),
          get_argument( 3 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_immediate_base(),
          get_argument( 4 ) );
@@ -2416,7 +2128,7 @@ tgsi_emit_sse2(
             if( !instruction_phase ) {
                /* INSTRUCTION phase, overwrite coeff with output. */
                instruction_phase = TRUE;
-               emit_mov(
+               x86_mov(
                   func,
                   get_output_base(),
                   get_argument( 1 ) );
@@ -2428,8 +2140,10 @@ tgsi_emit_sse2(
             &parse.FullToken.FullInstruction );
 
 	 if (!ok) {
-	    debug_printf("failed to translate tgsi opcode %d to SSE\n", 
-			 parse.FullToken.FullInstruction.Instruction.Opcode );
+	    debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
 	 }
          break;
 
@@ -2464,8 +2178,6 @@ tgsi_emit_sse2(
 
    tgsi_parse_free( &parse );
 
-   DUMP_END();
-
    return ok;
 }
 
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
index d56bf7f98a..063287dc5e 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
@@ -10,7 +10,7 @@ struct x86_function;
 
 unsigned
 tgsi_emit_sse2(
-   struct tgsi_token *tokens,
+   const struct tgsi_token *tokens,
    struct x86_function *function,
    float (*immediates)[4]
  );
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
index ff6a2c4194..26bfc2051f 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
@@ -25,8 +25,6 @@
  * 
  **************************************************************************/
 
-#include <stdio.h> 
-
 #include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
@@ -35,196 +33,28 @@
 #include "tgsi_parse.h"
 #include "tgsi_build.h"
 
-struct gen_dump
-{
-   unsigned tabs;
-   void  (* write)(
-               struct gen_dump   *dump,
-               const void        *data,
-               unsigned          size );
-};
-
-struct text_dump
-{
-   struct gen_dump   base;
-   char              *text;
-   unsigned          length;
-   unsigned          capacity;
-};
-
-static void
-_text_dump_write(
-   struct gen_dump   *dump,
-   const void        *data,
-   unsigned          size )
-{
-   struct text_dump  *td = (struct text_dump *) dump;
-   unsigned          new_length = td->length + size;
-
-   if( new_length >= td->capacity ) {
-      unsigned new_capacity = td->capacity;
-
-      do {
-         if( new_capacity == 0 ) {
-            new_capacity = 256;
-         }
-         else {
-            new_capacity *= 2;
-         }
-      } while( new_length >= new_capacity );
-      td->text = (char *) REALLOC(
-         td->text,
-         td->capacity,
-         new_capacity );
-      td->capacity = new_capacity;
-   }
-   memcpy(
-      &td->text[td->length],
-      data,
-      size );
-   td->length = new_length;
-   td->text[td->length] = '\0';
-}
-
-struct file_dump
-{
-   struct gen_dump   base;
-   FILE              *file;
-};
-
-static void
-_file_dump_write(
-   struct gen_dump   *dump,
-   const void        *data,
-   unsigned          size )
-{
-   struct file_dump  *fd = (struct file_dump *) dump;
-
-#if 0
-   fwrite( data, 1, size, fd->file );
-#else
-   {
-      unsigned i;
-
-      for (i = 0; i < size; i++ ) {
-         fprintf( fd->file, "%c", ((const char *) data)[i] );
-      }
-   }
-#endif
-}
-
-static void
-gen_dump_str(
-   struct gen_dump   *dump,
-   const char        *str )
-{
-   unsigned i;
-   size_t   len = strlen( str );
-
-   for (i = 0; i < len; i++) {
-      dump->write( dump, &str[i], 1 );
-      if (str[i] == '\n') {
-         unsigned i;
-
-         for (i = 0; i < dump->tabs; i++) {
-            dump->write( dump, "    ", 4 );
-         }
-      }
-   }
-}
-
-static void
-gen_dump_chr(
-   struct gen_dump   *dump,
-   const char        chr )
-{
-   dump->write( dump, &chr, 1 );
-}
-
-static void
-gen_dump_uix(
-   struct gen_dump   *dump,
-   const unsigned    ui )
-{
-   char  str[36];
-
-   util_snprintf( str, sizeof(str), "0x%x", ui );
-   gen_dump_str( dump, str );
-}
-
-static void
-gen_dump_uid(
-   struct gen_dump   *dump,
-   const unsigned    ui )
-{
-   char  str[16];
-
-   util_snprintf( str, sizeof(str), "%u", ui );
-   gen_dump_str( dump, str );
-}
-
-static void
-gen_dump_sid(
-   struct gen_dump   *dump,
-   const int         si )
-{
-   char  str[16];
-
-   util_snprintf( str, sizeof(str), "%d", si );
-   gen_dump_str( dump, str );
-}
-
 static void
-gen_dump_flt(
-   struct gen_dump   *dump,
-   const float       flt )
-{
-   char  str[48];
-
-   util_snprintf( str, sizeof(str), "%10.4f", flt );
-   gen_dump_str( dump, str );
-}
-
-static void
-gen_dump_enum(
-   struct gen_dump   *dump,
+dump_enum(   /* print the name enums[e], or the raw number if e is outside the table */
    const unsigned    e,
    const char        **enums,
    const unsigned    enums_count )
 {
    if (e >= enums_count) {
-      gen_dump_uid( dump, e );
+      debug_printf( "%u", e );   /* no name for this value: fall back to unsigned decimal */
    }
    else {
-      gen_dump_str( dump, enums[e] );
+      debug_printf( "%s", enums[e] );   /* in range: print the table entry */
    }
 }
 
-static void
-gen_dump_tab(
-   struct gen_dump   *dump )
-{
-   ++dump->tabs;
-}
-
-static void
-gen_dump_untab(
-   struct gen_dump   *dump )
-{
-   assert( dump->tabs > 0 );
-
-   --dump->tabs;
-}
-
-#define TXT(S)          gen_dump_str( dump, S )
-#define CHR(C)          gen_dump_chr( dump, C )
-#define UIX(I)          gen_dump_uix( dump, I )
-#define UID(I)          gen_dump_uid( dump, I )
-#define SID(I)          gen_dump_sid( dump, I )
-#define FLT(F)          gen_dump_flt( dump, F )
-#define TAB()           gen_dump_tab( dump )
-#define UNT()           gen_dump_untab( dump )
-#define ENM(E,ENUMS)    gen_dump_enum( dump, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
+#define EOL()           debug_printf( "\n" )   /* line break */
+#define TXT(S)          debug_printf( "%s", S )   /* string, passed as data rather than as the format */
+#define CHR(C)          debug_printf( "%c", C )   /* single character */
+#define UIX(I)          debug_printf( "0x%x", I )   /* unsigned, hexadecimal with 0x prefix */
+#define UID(I)          debug_printf( "%u", I )   /* unsigned, decimal */
+#define SID(I)          debug_printf( "%d", I )   /* signed, decimal */
+#define FLT(F)          debug_printf( "%10.4f", F )   /* float, width 10, 4 decimals */
+#define ENM(E,ENUMS)    dump_enum( E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )   /* ENUMS must be an array, not a pointer, for the element count to be right */
 
 static const char *TGSI_PROCESSOR_TYPES[] =
 {
@@ -711,7 +541,6 @@ static const char *TGSI_MODULATES[] =
 
 static void
 dump_declaration_short(
-   struct gen_dump               *dump,
    struct tgsi_full_declaration  *decl )
 {
    TXT( "\nDCL " );
@@ -765,7 +594,6 @@ dump_declaration_short(
 
 static void
 dump_declaration_verbose(
-   struct gen_dump               *dump,
    struct tgsi_full_declaration  *decl,
    unsigned                      ignored,
    unsigned                      deflt,
@@ -803,7 +631,7 @@ dump_declaration_verbose(
       UIX( decl->Declaration.Padding );
    }
 
-   CHR( '\n' );
+   EOL();
    switch( decl->Declaration.Declare ) {
    case TGSI_DECLARE_RANGE:
       TXT( "\nFirst: " );
@@ -822,7 +650,7 @@ dump_declaration_verbose(
    }
 
    if( decl->Declaration.Interpolate ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nInterpolate: " );
       ENM( decl->Interpolation.Interpolate, TGSI_INTERPOLATES );
       if( ignored ) {
@@ -832,7 +660,7 @@ dump_declaration_verbose(
    }
 
    if( decl->Declaration.Semantic ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nSemanticName : " );
       ENM( decl->Semantic.SemanticName, TGSI_SEMANTICS );
       TXT( "\nSemanticIndex: " );
@@ -846,7 +674,6 @@ dump_declaration_verbose(
 
 static void
 dump_immediate_short(
-   struct gen_dump            *dump,
    struct tgsi_full_immediate *imm )
 {
    unsigned i;
@@ -874,7 +701,6 @@ dump_immediate_short(
 
 static void
 dump_immediate_verbose(
-   struct gen_dump            *dump,
    struct tgsi_full_immediate *imm,
    unsigned                   ignored )
 {
@@ -888,7 +714,7 @@ dump_immediate_verbose(
    }
 
    for( i = 0; i < imm->Immediate.Size - 1; i++ ) {
-      CHR( '\n' );
+      EOL();
       switch( imm->Immediate.DataType ) {
       case TGSI_IMM_FLOAT32:
          TXT( "\nFloat: " );
@@ -903,14 +729,13 @@ dump_immediate_verbose(
 
 static void
 dump_instruction_short(
-   struct gen_dump               *dump,
    struct tgsi_full_instruction  *inst,
    unsigned                      instno )
 {
    unsigned i;
    boolean  first_reg = TRUE;
 
-   CHR( '\n' );
+   EOL();
    UID( instno );
    CHR( ':' );
    ENM( inst->Instruction.Opcode, TGSI_OPCODES_SHORT );
@@ -1042,7 +867,6 @@ dump_instruction_short(
 
 static void
 dump_instruction_verbose(
-   struct gen_dump               *dump,
    struct tgsi_full_instruction  *inst,
    unsigned                      ignored,
    unsigned                      deflt,
@@ -1070,7 +894,7 @@ dump_instruction_verbose(
    }
 
    if( deflt || tgsi_compare_instruction_ext_nv( inst->InstructionExtNv, fi->InstructionExtNv ) ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nType          : " );
       ENM( inst->InstructionExtNv.Type, TGSI_INSTRUCTION_EXTS );
       if( deflt || fi->InstructionExtNv.Precision != inst->InstructionExtNv.Precision ) {
@@ -1124,7 +948,7 @@ dump_instruction_verbose(
    }
 
    if( deflt || tgsi_compare_instruction_ext_label( inst->InstructionExtLabel, fi->InstructionExtLabel ) ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nType    : " );
       ENM( inst->InstructionExtLabel.Type, TGSI_INSTRUCTION_EXTS );
       if( deflt || fi->InstructionExtLabel.Label != inst->InstructionExtLabel.Label ) {
@@ -1142,7 +966,7 @@ dump_instruction_verbose(
    }
 
    if( deflt || tgsi_compare_instruction_ext_texture( inst->InstructionExtTexture, fi->InstructionExtTexture ) ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nType    : " );
       ENM( inst->InstructionExtTexture.Type, TGSI_INSTRUCTION_EXTS );
       if( deflt || fi->InstructionExtTexture.Texture != inst->InstructionExtTexture.Texture ) {
@@ -1163,7 +987,7 @@ dump_instruction_verbose(
       struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
       struct tgsi_full_dst_register *fd = &fi->FullDstRegisters[i];
 
-      CHR( '\n' );
+      EOL();
       TXT( "\nFile     : " );
       ENM( dst->DstRegister.File, TGSI_FILES );
       if( deflt || fd->DstRegister.WriteMask != dst->DstRegister.WriteMask ) {
@@ -1194,7 +1018,7 @@ dump_instruction_verbose(
       }
 
       if( deflt || tgsi_compare_dst_register_ext_concode( dst->DstRegisterExtConcode, fd->DstRegisterExtConcode ) ) {
-         CHR( '\n' );
+         EOL();
          TXT( "\nType        : " );
          ENM( dst->DstRegisterExtConcode.Type, TGSI_DST_REGISTER_EXTS );
          if( deflt || fd->DstRegisterExtConcode.CondMask != dst->DstRegisterExtConcode.CondMask ) {
@@ -1232,7 +1056,7 @@ dump_instruction_verbose(
       }
 
       if( deflt || tgsi_compare_dst_register_ext_modulate( dst->DstRegisterExtModulate, fd->DstRegisterExtModulate ) ) {
-         CHR( '\n' );
+         EOL();
          TXT( "\nType    : " );
          ENM( dst->DstRegisterExtModulate.Type, TGSI_DST_REGISTER_EXTS );
          if( deflt || fd->DstRegisterExtModulate.Modulate != dst->DstRegisterExtModulate.Modulate ) {
@@ -1254,7 +1078,7 @@ dump_instruction_verbose(
       struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
       struct tgsi_full_src_register *fs = &fi->FullSrcRegisters[i];
 
-      CHR( '\n' );
+      EOL();
       TXT( "\nFile     : ");
       ENM( src->SrcRegister.File, TGSI_FILES );
       if( deflt || fs->SrcRegister.SwizzleX != src->SrcRegister.SwizzleX ) {
@@ -1299,7 +1123,7 @@ dump_instruction_verbose(
       }
 
       if( deflt || tgsi_compare_src_register_ext_swz( src->SrcRegisterExtSwz, fs->SrcRegisterExtSwz ) ) {
-         CHR( '\n' );
+         EOL();
          TXT( "\nType       : " );
          ENM( src->SrcRegisterExtSwz.Type, TGSI_SRC_REGISTER_EXTS );
          if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleX != src->SrcRegisterExtSwz.ExtSwizzleX ) {
@@ -1345,7 +1169,7 @@ dump_instruction_verbose(
       }
 
       if( deflt || tgsi_compare_src_register_ext_mod( src->SrcRegisterExtMod, fs->SrcRegisterExtMod ) ) {
-         CHR( '\n' );
+         EOL();
          TXT( "\nType     : " );
          ENM( src->SrcRegisterExtMod.Type, TGSI_SRC_REGISTER_EXTS );
          if( deflt || fs->SrcRegisterExtMod.Complement != src->SrcRegisterExtMod.Complement ) {
@@ -1380,9 +1204,8 @@ dump_instruction_verbose(
    }
 }
 
-static void
-dump_gen(
-   struct gen_dump         *dump,
+void
+tgsi_dump(
    const struct tgsi_token *tokens,
    unsigned                flags )
 {
@@ -1394,16 +1217,16 @@ dump_gen(
    unsigned deflt = !(flags & TGSI_DUMP_NO_DEFAULT);
    unsigned instno = 0;
 
-   dump->tabs = 0;
-
-   /* sanity check */
+   /* sanity checks: the opcode name tables must line up with the opcode enum */
    assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_CONT], "OPCODE_CONT") == 0);
+   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_END], "OPCODE_END") == 0);
+   assert(strcmp(TGSI_OPCODES_SHORT[TGSI_OPCODE_END], "END") == 0);
 
    tgsi_parse_init( &parse, tokens );
 
    TXT( "tgsi-dump begin -----------------" );
 
-   CHR( '\n' );
+   EOL();
    ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES_SHORT );
    UID( parse.FullVersion.Version.MajorVersion );
    CHR( '.' );
@@ -1414,7 +1237,7 @@ dump_gen(
       UID( parse.FullVersion.Version.MajorVersion );
       TXT( "\nMinorVersion: " );
       UID( parse.FullVersion.Version.MinorVersion );
-      CHR( '\n' );
+      EOL();
 
       TXT( "\nHeaderSize: " );
       UID( parse.FullHeader.Header.HeaderSize );
@@ -1422,7 +1245,7 @@ dump_gen(
       UID( parse.FullHeader.Header.BodySize );
       TXT( "\nProcessor : " );
       ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES );
-      CHR( '\n' );
+      EOL();
    }
 
    fi = tgsi_default_full_instruction();
@@ -1434,19 +1257,16 @@ dump_gen(
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
          dump_declaration_short(
-            dump,
             &parse.FullToken.FullDeclaration );
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
          dump_immediate_short(
-            dump,
             &parse.FullToken.FullImmediate );
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
          dump_instruction_short(
-            dump,
             &parse.FullToken.FullInstruction,
             instno );
          instno++;
@@ -1471,7 +1291,6 @@ dump_gen(
          switch( parse.FullToken.Token.Type ) {
          case TGSI_TOKEN_TYPE_DECLARATION:
             dump_declaration_verbose(
-               dump,
                &parse.FullToken.FullDeclaration,
                ignored,
                deflt,
@@ -1480,14 +1299,12 @@ dump_gen(
 
          case TGSI_TOKEN_TYPE_IMMEDIATE:
             dump_immediate_verbose(
-               dump,
                &parse.FullToken.FullImmediate,
                ignored );
             break;
 
          case TGSI_TOKEN_TYPE_INSTRUCTION:
             dump_instruction_verbose(
-               dump,
                &parse.FullToken.FullInstruction,
                ignored,
                deflt,
@@ -1498,7 +1315,7 @@ dump_gen(
             assert( 0 );
          }
 
-         CHR( '\n' );
+         EOL();
       }
    }
 
@@ -1506,86 +1323,3 @@ dump_gen(
 
    tgsi_parse_free( &parse );
 }
-
-
-static void
-sanity_checks(void)
-{
-   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_END], "OPCODE_END") == 0);
-   assert(strcmp(TGSI_OPCODES_SHORT[TGSI_OPCODE_END], "END") == 0);
-}
-
-
-void
-tgsi_dump(
-   const struct tgsi_token *tokens,
-   unsigned                flags )
-{
-   struct file_dump  dump;
-
-   sanity_checks();
-
-   dump.base.write = _file_dump_write;
-#if 0
-   {
-      static unsigned   counter = 0;
-      char              buffer[64];
-      sprintf( buffer, "tgsi-dump-%.4u.txt", counter++ );
-      dump.file = fopen( buffer, "wt" );
-   }
-#else
-   dump.file = stderr;
-#endif
-
-   dump_gen(
-      &dump.base,
-      tokens,
-      flags );
-
-#if 0
-   fclose( dump.file );
-#endif
-}
-
-void
-tgsi_dump_str(
-   char                    **str,
-   const struct tgsi_token *tokens,
-   unsigned                flags )
-{
-   struct text_dump  dump;
-
-   dump.base.write = _text_dump_write;
-   dump.text = NULL;
-   dump.length = 0;
-   dump.capacity = 0;
-
-   dump_gen(
-      &dump.base,
-      tokens,
-      flags );
-
-   *str = dump.text;
-}
-
-
-void tgsi_debug_dump( struct tgsi_token *tokens )
-{
-   char *str, *p;
-
-   tgsi_dump_str( &str, tokens, 0 );
-
-   p = str;
-   while (p != NULL)
-   {
-      char *end = strchr( p, '\n' );
-      if (end != NULL)
-      {
-         *end++ = '\0';
-      }
-      debug_printf( "%s\n", p );
-      p = end;
-   }
-
-   FREE( str );
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
index 51d79a0362..beb0155d56 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
@@ -14,16 +14,6 @@ tgsi_dump(
    const struct tgsi_token *tokens,
    unsigned                flags );
 
-void
-tgsi_dump_str(
-   char                    **str,
-   const struct tgsi_token *tokens,
-   unsigned                flags );
-
-/* Dump to debug_printf()
- */
-void tgsi_debug_dump( struct tgsi_token *tokens );
-
 #if defined __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_parse.h b/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
index a98e88e343..da0121c482 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
@@ -1,6 +1,8 @@
 #if !defined TGSI_PARSE_H
 #define TGSI_PARSE_H
 
+#include "pipe/p_shader_tokens.h"
+
 #if defined __cplusplus
 extern "C" {
 #endif