From b642730be93149baa7556e5791393168ab396175 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Fri, 15 Feb 2008 17:35:24 +0900
Subject: Code reorganization: move files into their places.

This is in a separate commit to ensure renames are properly preserved.
---
 src/gallium/drivers/cell/spu/Makefile            |   72 +
 src/gallium/drivers/cell/spu/spu_blend.c         |   62 +
 src/gallium/drivers/cell/spu/spu_blend.h         |   37 +
 src/gallium/drivers/cell/spu/spu_colorpack.h     |  110 ++
 src/gallium/drivers/cell/spu/spu_exec.c          | 1948 ++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_exec.h          |  172 ++
 src/gallium/drivers/cell/spu/spu_main.c          |  567 +++++++
 src/gallium/drivers/cell/spu/spu_main.h          |  177 ++
 src/gallium/drivers/cell/spu/spu_render.c        |  301 ++++
 src/gallium/drivers/cell/spu/spu_render.h        |   38 +
 src/gallium/drivers/cell/spu/spu_texture.c       |  217 +++
 src/gallium/drivers/cell/spu/spu_texture.h       |   47 +
 src/gallium/drivers/cell/spu/spu_tile.c          |   83 +
 src/gallium/drivers/cell/spu/spu_tile.h          |   73 +
 src/gallium/drivers/cell/spu/spu_tri.c           |  926 ++++++++++
 src/gallium/drivers/cell/spu/spu_tri.h           |   37 +
 src/gallium/drivers/cell/spu/spu_util.c          |  165 ++
 src/gallium/drivers/cell/spu/spu_vertex_fetch.c  |  673 ++++++++
 src/gallium/drivers/cell/spu/spu_vertex_shader.c |  231 +++
 src/gallium/drivers/cell/spu/spu_vertex_shader.h |   63 +
 src/gallium/drivers/cell/spu/spu_ztest.h         |  135 ++
 21 files changed, 6134 insertions(+)
 create mode 100644 src/gallium/drivers/cell/spu/Makefile
 create mode 100644 src/gallium/drivers/cell/spu/spu_blend.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_blend.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_colorpack.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_exec.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_exec.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_main.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_main.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_render.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_render.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_texture.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_texture.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_tile.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_tile.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_tri.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_tri.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_util.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_vertex_fetch.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_vertex_shader.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_vertex_shader.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_ztest.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
new file mode 100644
index 0000000000..f202971d73
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -0,0 +1,72 @@
+# Gallium3D Cell driver: SPU code
+
+# This makefile builds the g3d_spu.a file that's linked into the
+# PPU code/library.
+
+
+TOP = ../../../../..
+include $(TOP)/configs/linux-cell
+
+
+PROG = g3d
+
+PROG_SPU = $(PROG)_spu
+PROG_SPU_A = $(PROG)_spu.a
+PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
+
+
+SOURCES = \
+	spu_main.c \
+	spu_blend.c \
+	spu_render.c \
+	spu_texture.c \
+	spu_tile.c \
+	spu_tri.c \
+	spu_exec.c \
+	spu_util.c \
+	spu_vertex_fetch.c \
+	spu_vertex_shader.c
+
+SPU_OBJECTS = $(SOURCES:.c=.o) \
+
+SPU_ASM_OUT = $(SOURCES:.c=.s) \
+
+INCLUDE_DIRS = -I$(TOP)/src/mesa
+
+
+.c.o:
+	$(SPU_CC) $(SPU_CFLAGS) -c $<
+
+.c.s:
+	$(SPU_CC) $(SPU_CFLAGS) -S $<
+
+
+# The .a file will be linked into the main/PPU executable
+default: $(PROG_SPU_A)
+
+$(PROG_SPU_A): $(PROG_SPU_EMBED_O)
+	$(SPU_AR) $(SPU_AR_FLAGS) $(PROG_SPU_A) $(PROG_SPU_EMBED_O)
+
+$(PROG_SPU_EMBED_O): $(PROG_SPU)
+	$(SPU_EMBED) $(SPU_EMBED_FLAGS) $(PROG_SPU) $(PROG_SPU) $(PROG_SPU_EMBED_O)
+
+$(PROG_SPU): $(SPU_OBJECTS)
+	$(SPU_CC) -o $(PROG_SPU) $(SPU_OBJECTS) $(SPU_LFLAGS)
+
+
+
+asmfiles: $(SPU_ASM_OUT)
+
+
+clean:
+	rm -f *~ *.o *.a *.d *.s $(PROG_SPU)
+
+
+
+depend: $(SOURCES)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null
+
+include depend
+
diff --git a/src/gallium/drivers/cell/spu/spu_blend.c b/src/gallium/drivers/cell/spu/spu_blend.c
new file mode 100644
index 0000000000..23ec0eeb45
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_blend.c
@@ -0,0 +1,62 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "spu_main.h"
+#include "spu_blend.h"
+#include "spu_colorpack.h"
+
+
+void
+blend_quad(uint itx, uint ity, vector float colors[4])
+{
+   /* simple SRC_ALPHA, ONE_MINUS_SRC_ALPHA blending */
+   vector float fbc00 = spu_unpack_color(spu.ctile.ui[ity][itx]);
+   vector float fbc01 = spu_unpack_color(spu.ctile.ui[ity][itx+1]);
+   vector float fbc10 = spu_unpack_color(spu.ctile.ui[ity+1][itx]);
+   vector float fbc11 = spu_unpack_color(spu.ctile.ui[ity+1][itx+1]);
+
+   vector float alpha00 = spu_splats(spu_extract(colors[0], 3));
+   vector float alpha01 = spu_splats(spu_extract(colors[1], 3));
+   vector float alpha10 = spu_splats(spu_extract(colors[2], 3));
+   vector float alpha11 = spu_splats(spu_extract(colors[3], 3));
+
+   vector float one_minus_alpha00 = spu_sub(spu_splats(1.0f), alpha00);
+   vector float one_minus_alpha01 = spu_sub(spu_splats(1.0f), alpha01);
+   vector float one_minus_alpha10 = spu_sub(spu_splats(1.0f), alpha10);
+   vector float one_minus_alpha11 = spu_sub(spu_splats(1.0f), alpha11);
+
+   colors[0] = spu_add(spu_mul(colors[0], alpha00),
+                       spu_mul(fbc00, one_minus_alpha00));
+   colors[1] = spu_add(spu_mul(colors[1], alpha01),
+                       spu_mul(fbc01, one_minus_alpha01));
+   colors[2] = spu_add(spu_mul(colors[2], alpha10),
+                       spu_mul(fbc10, one_minus_alpha10));
+   colors[3] = spu_add(spu_mul(colors[3], alpha11),
+                       spu_mul(fbc11, one_minus_alpha11));
+}
+
diff --git a/src/gallium/drivers/cell/spu/spu_blend.h b/src/gallium/drivers/cell/spu/spu_blend.h
new file mode 100644
index 0000000000..2b594b578b
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_blend.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_BLEND_H
+#define SPU_BLEND_H
+
+
+extern void
+blend_quad(uint itx, uint ity, vector float colors[4]);
+
+
+#endif /* SPU_BLEND_H */
diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
new file mode 100644
index 0000000000..e9fee8a3a6
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -0,0 +1,110 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef SPU_COLORPACK_H
+#define SPU_COLORPACK_H
+
+
+#include <spu_intrinsics.h>
+
+
+static INLINE unsigned int
+spu_pack_R8G8B8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  0, 4, 8, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0 }) );
+
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_A8R8G8B8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  12, 0, 4, 8, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_B8G8R8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  8, 4, 0, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, shuffle);
+  return spu_extract(out, 0);
+}
+
+
+static INLINE vector float
+spu_unpack_color(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          ((vector unsigned char) {
+                             0, 0, 0, 0,
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15}) );
+   return spu_convtf(color_u4, 32);
+}
+
+
+static INLINE vector float
+spu_unpack_A8R8G8B8(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          ((vector unsigned char) {
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15,
+                             0, 0, 0, 0}) );
+
+   return spu_convtf(color_u4, 32);
+}
+
+
+#endif /* SPU_COLORPACK_H */
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
new file mode 100644
index 0000000000..e51008b9b3
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -0,0 +1,1948 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI interpretor/executor.
+ *
+ * Flow control information:
+ *
+ * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+ * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+ * care since a condition may be true for some quad components but false
+ * for other components.
+ *
+ * We basically execute all statements (even if they're in the part of
+ * an IF/ELSE clause that's "not taken") and use a special mask to
+ * control writing to destination registers.  This is the ExecMask.
+ * See store_dest().
+ *
+ * The ExecMask is computed from three other masks (CondMask, LoopMask and
+ * ContMask) which are controlled by the flow control instructions (namely:
+ * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
+ *
+ *
+ * Authors:
+ *   Michal Krol
+ *   Brian Paul
+ */
+
+#include <libmisc.h>
+#include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
+#include <simdmath/ceilf4.h>
+#include <simdmath/cosf4.h>
+#include <simdmath/divf4.h>
+#include <simdmath/floorf4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
+#include <simdmath/sinf4.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/truncf4.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+#include "spu_exec.h"
+#include "spu_main.h"
+#include "spu_vertex_shader.h"
+
+#define TILE_TOP_LEFT     0
+#define TILE_TOP_RIGHT    1
+#define TILE_BOTTOM_LEFT  2
+#define TILE_BOTTOM_RIGHT 3
+
+/*
+ * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_R0            TGSI_EXEC_TEMP_R0
+
+#define FOR_EACH_CHANNEL(CHAN)\
+   for (CHAN = 0; CHAN < 4; CHAN++)
+
+#define IS_CHANNEL_ENABLED(INST, CHAN)\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IS_CHANNEL_ENABLED2(INST, CHAN)\
+   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+
+
+/** The execution mask depends on the conditional mask and the loop mask */
+#define UPDATE_EXEC_MASK(MACH) \
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+
+
+/**
+ * Initialize machine state by expanding tokens to full instructions,
+ * allocating temporary storage, setting up constants, etc.
+ * After this, we can call spu_exec_machine_run() many times.
+ */
+void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor)
+{
+   qword zero;
+   qword not_zero;
+   uint i;
+
+   mach->Samplers = samplers;
+   mach->Processor = processor;
+   mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
+
+   zero = si_xor(zero, zero);
+   not_zero = si_xori(zero, 0xff);
+
+   /* Setup constants. */
+   mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
+   mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
+   mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1);
+   mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31);
+
+   mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f);
+   mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f);
+   mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f);
+   mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f);
+}
+
+
+static INLINE qword
+micro_abs(qword src)
+{
+   return si_rotmi(si_shli(src, 1), -1);
+}
+
+static INLINE qword
+micro_ceil(qword src)
+{
+   return (qword) _ceilf4((vec_float4) src);
+}
+
+static INLINE qword
+micro_cos(qword src)
+{
+   return (qword) _cosf4((vec_float4) src);
+}
+
+static const qword br_shuf = {
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+};
+
+static const qword bl_shuf = {
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+};
+
+static const qword tl_shuf = {
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+};
+
+static qword
+micro_ddx(qword src)
+{
+   qword bottom_right = si_shufb(src, src, br_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
+
+   return si_fs(bottom_right, bottom_left);
+}
+
+static qword
+micro_ddy(qword src)
+{
+   qword top_left = si_shufb(src, src, tl_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
+
+   return si_fs(top_left, bottom_left);
+}
+
+static INLINE qword
+micro_div(qword src0, qword src1)
+{
+   return (qword) _divf4((vec_float4) src0, (vec_float4) src1);
+}
+
+static qword
+micro_flr(qword src)
+{
+   return (qword) _floorf4((vec_float4) src);
+}
+
+static qword
+micro_frc(qword src)
+{
+   return si_fs(src, (qword) _floorf4((vec_float4) src));
+}
+
+static INLINE qword
+micro_ge(qword src0, qword src1)
+{
+   return si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
+}
+
+static qword
+micro_lg2(qword src)
+{
+   return (qword) _log2f4((vec_float4) src);
+}
+
+static INLINE qword
+micro_lt(qword src0, qword src1)
+{
+   const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
+
+   return si_xori(tmp, 0xff);
+}
+
+static INLINE qword
+micro_max(qword src0, qword src1)
+{
+   return si_selb(src1, src0, si_fcgt(src0, src1));
+}
+
+static INLINE qword
+micro_min(qword src0, qword src1)
+{
+   return si_selb(src0, src1, si_fcgt(src0, src1));
+}
+
+static qword
+micro_neg(qword src)
+{
+   return si_xor(src, (qword) spu_splats(0x80000000));
+}
+
+static qword
+micro_set_sign(qword src)
+{
+   return si_or(src, (qword) spu_splats(0x80000000));
+}
+
+static qword
+micro_pow(qword src0, qword src1)
+{
+   return (qword) _powf4((vec_float4) src0, (vec_float4) src1);
+}
+
+static qword
+micro_rnd(qword src)
+{
+   const qword half = (qword) spu_splats(0.5f);
+
+   /* May be able to use _roundf4.  There may be some difference, though.
+    */
+   return (qword) _floorf4((vec_float4) si_fa(src, half));
+}
+
+static INLINE qword
+micro_ishr(qword src0, qword src1)
+{
+   return si_rotma(src0, si_sfi(src1, 0));
+}
+
+static qword
+micro_trunc(qword src)
+{
+   return (qword) _truncf4((vec_float4) src);
+}
+
+static qword
+micro_sin(qword src)
+{
+   return (qword) _sinf4((vec_float4) src);
+}
+
+static INLINE qword
+micro_sqrt(qword src)
+{
+   return (qword) _sqrtf4((vec_float4) src);
+}
+
+static void
+fetch_src_file_channel(
+   const struct spu_exec_machine *mach,
+   const uint file,
+   const uint swizzle,
+   const union spu_exec_channel *index,
+   union spu_exec_channel *chan )
+{
+   switch( swizzle ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch( file ) {
+      case TGSI_FILE_CONSTANT: {
+         unsigned char buffer[32] ALIGN16_ATTRIB;
+         unsigned i;
+
+         for (i = 0; i < 4; i++) {
+            const float *ptr = mach->Consts[index->i[i]];
+            const uint64_t addr = (uint64_t)(uintptr_t) ptr;
+            const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
+
+            mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+            wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+            (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) 
+                + (sizeof(float) * swizzle)], sizeof(float));
+         }
+         break;
+      }
+
+      case TGSI_FILE_INPUT:
+         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_TEMPORARY:
+         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_IMMEDIATE:
+         assert( index->i[0] < (int) mach->ImmLimit );
+         assert( index->i[1] < (int) mach->ImmLimit );
+         assert( index->i[2] < (int) mach->ImmLimit );
+         assert( index->i[3] < (int) mach->ImmLimit );
+
+         chan->f[0] = mach->Imms[index->i[0]][swizzle];
+         chan->f[1] = mach->Imms[index->i[1]][swizzle];
+         chan->f[2] = mach->Imms[index->i[2]][swizzle];
+         chan->f[3] = mach->Imms[index->i[3]][swizzle];
+         break;
+
+      case TGSI_FILE_ADDRESS:
+         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         /* vertex/fragment output vars can be read too */
+         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+static void
+fetch_source(
+   const struct spu_exec_machine *mach,
+   union spu_exec_channel *chan,
+   const struct tgsi_full_src_register *reg,
+   const uint chan_index )
+{
+   union spu_exec_channel index;
+   uint swizzle;
+
+   index.i[0] =
+   index.i[1] =
+   index.i[2] =
+   index.i[3] = reg->SrcRegister.Index;
+
+   if (reg->SrcRegister.Indirect) {
+      union spu_exec_channel index2;
+      union spu_exec_channel indir_index;
+
+      index2.i[0] =
+      index2.i[1] =
+      index2.i[2] =
+      index2.i[3] = reg->SrcRegisterInd.Index;
+
+      swizzle = tgsi_util_get_src_register_swizzle(&reg->SrcRegisterInd,
+                                                   CHAN_X);
+      fetch_src_file_channel(
+         mach,
+         reg->SrcRegisterInd.File,
+         swizzle,
+         &index2,
+         &indir_index );
+
+      index.q = si_a(index.q, indir_index.q);
+   }
+
+   if( reg->SrcRegister.Dimension ) {
+      switch( reg->SrcRegister.File ) {
+      case TGSI_FILE_INPUT:
+         index.q = si_mpyi(index.q, 17);
+         break;
+      case TGSI_FILE_CONSTANT:
+         index.q = si_shli(index.q, 12);
+         break;
+      default:
+         assert( 0 );
+      }
+
+      index.i[0] += reg->SrcRegisterDim.Index;
+      index.i[1] += reg->SrcRegisterDim.Index;
+      index.i[2] += reg->SrcRegisterDim.Index;
+      index.i[3] += reg->SrcRegisterDim.Index;
+
+      if (reg->SrcRegisterDim.Indirect) {
+         union spu_exec_channel index2;
+         union spu_exec_channel indir_index;
+
+         index2.i[0] =
+         index2.i[1] =
+         index2.i[2] =
+         index2.i[3] = reg->SrcRegisterDimInd.Index;
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
+         fetch_src_file_channel(
+            mach,
+            reg->SrcRegisterDimInd.File,
+            swizzle,
+            &index2,
+            &indir_index );
+
+         index.q = si_a(index.q, indir_index.q);
+      }
+   }
+
+   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   fetch_src_file_channel(
+      mach,
+      reg->SrcRegister.File,
+      swizzle,
+      &index,
+      chan );
+
+   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      chan->q = micro_abs(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      chan->q = micro_set_sign(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      chan->q = micro_neg(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+
+   if (reg->SrcRegisterExtMod.Complement) {
+      chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q);
+   }
+}
+
+static void
+store_dest(
+   struct spu_exec_machine *mach,
+   const union spu_exec_channel *chan,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   uint chan_index )
+{
+   union spu_exec_channel *dst;
+
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_NULL:
+      return;
+
+   case TGSI_FILE_OUTPUT:
+      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+                           + reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   switch (inst->Instruction.Saturate)
+   {
+   case TGSI_SAT_NONE:
+      if (mach->ExecMask & 0x1)
+         dst->i[0] = chan->i[0];
+      if (mach->ExecMask & 0x2)
+         dst->i[1] = chan->i[1];
+      if (mach->ExecMask & 0x4)
+         dst->i[2] = chan->i[2];
+      if (mach->ExecMask & 0x8)
+         dst->i[3] = chan->i[3];
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* XXX need to obey ExecMask here */
+      dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+      dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+#define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+
+#define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+
+
+/**
+ * Execute ARB-style KIL which is predicated by a src register.
+ * Kill fragment if any of the four values is less than zero.
+ */
+static void
+exec_kilp(struct spu_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint uniquemask;
+   uint chan_index;
+   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+   union spu_exec_channel r[1];
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   for (chan_index = 0; chan_index < 4; chan_index++)
+   {
+      uint swizzle;
+      uint i;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle (
+                        &inst->FullSrcRegisters[0],
+                        chan_index);
+
+      /* check if the component has not been already tested */
+      if (uniquemask & (1 << swizzle))
+         continue;
+      uniquemask |= 1 << swizzle;
+
+      FETCH(&r[0], 0, chan_index);
+      for (i = 0; i < 4; i++)
+         if (r[0].f[i] < 0.0f)
+            kilmask |= 1 << i;
+   }
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
+
+
+/*
+ * Fetch a texel using STR texture coordinates.
+ */
+static void
+fetch_texel( struct spu_sampler *sampler,
+             const union spu_exec_channel *s,
+             const union spu_exec_channel *t,
+             const union spu_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union spu_exec_channel *r,
+             union spu_exec_channel *g,
+             union spu_exec_channel *b,
+             union spu_exec_channel *a )
+{
+   qword rgba[4];
+   qword out[4];
+
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
+
+   _transpose_matrix4x4(out, rgba);
+   r->q = out[0];
+   g->q = out[1];
+   b->q = out[2];
+   a->q = out[3];
+}
+
+
+static void
+exec_tex(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst,
+         boolean biasLod)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   union spu_exec_channel r[8];
+   uint chan_index;
+   float lodBias;
+
+   /*   printf("Sampler %u unit %u\n", sampler, unit); */
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[1], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[1].q);
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[2].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,
+                  &r[0], &r[1], &r[2], &r[3]);
+      break;
+
+   default:
+      assert (0);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+
+static void
+constant_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   unsigned i;
+
+   for( i = 0; i < QUAD_SIZE; i++ ) {
+      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
+   }
+}
+
+static void
+linear_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
+   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
+   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
+   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
+}
+
+static void
+perspective_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   const float *w = mach->QuadPos.xyzw[3].f;
+   /* divide by W here */
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
+   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
+   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
+   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
+}
+
+
+typedef void (* interpolation_func)(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan );
+
+static void
+exec_declaration(struct spu_exec_machine *mach,
+                 const struct tgsi_full_declaration *decl)
+{
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+         unsigned first, last, mask;
+         interpolation_func interp;
+
+         assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+
+         first = decl->u.DeclarationRange.First;
+         last = decl->u.DeclarationRange.Last;
+         mask = decl->Declaration.UsageMask;
+
+         switch( decl->Interpolation.Interpolate ) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = constant_interpolation;
+            break;
+
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = linear_interpolation;
+            break;
+
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = perspective_interpolation;
+            break;
+
+         default:
+            assert( 0 );
+         }
+
+         if( mask == TGSI_WRITEMASK_XYZW ) {
+            unsigned i, j;
+
+            for( i = first; i <= last; i++ ) {
+               for( j = 0; j < NUM_CHANNELS; j++ ) {
+                  interp( mach, i, j );
+               }
+            }
+         }
+         else {
+            unsigned i, j;
+
+            for( j = 0; j < NUM_CHANNELS; j++ ) {
+               if( mask & (1 << j) ) {
+                  for( i = first; i <= last; i++ ) {
+                     interp( mach, i, j );
+                  }
+               }
+            }
+         }
+      }
+   }
+}
+
+static void
+exec_instruction(
+   struct spu_exec_machine *mach,
+   const struct tgsi_full_instruction *inst,
+   int *pc )
+{
+   uint chan_index;
+   union spu_exec_channel r[8];
+
+   (*pc)++;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 FETCH( &r[0], 0, chan_index );
+         r[0].q = si_cflts(r[0].q, 0);
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   /* TGSI_OPCODE_SWZ */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_X );
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+	    STORE( &r[0], 0, CHAN_Y );
+	 }
+
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            FETCH( &r[1], 0, CHAN_Y );
+            r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+
+            FETCH( &r[2], 0, CHAN_W );
+            r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q);
+            r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q);
+            r[1].q = micro_pow(r[1].q, r[2].q);
+
+            /* r0 = (r0 > 0.0) ? r1 : 0.0
+             */
+            r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+            r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q,
+                             r[0].q);
+            STORE( &r[0], 0, CHAN_Z );
+         }
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_sqrt(r[0].q);
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_LOG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
+      {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = si_fm(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_fa(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+
+      FETCH( &r[1], 0, CHAN_Z );
+      FETCH( &r[2], 1, CHAN_Z );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+    case TGSI_OPCODE_DP4:
+    /* TGSI_OPCODE_DOT4 */
+       FETCH(&r[0], 0, CHAN_X);
+       FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+       FETCH(&r[1], 0, CHAN_Y);
+       FETCH(&r[2], 1, CHAN_Y);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+       FETCH(&r[1], 0, CHAN_Z);
+       FETCH(&r[2], 1, CHAN_Z);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+       FETCH(&r[1], 0, CHAN_W);
+       FETCH(&r[2], 1, CHAN_W);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	 FETCH( &r[0], 0, CHAN_Y );
+	 FETCH( &r[1], 1, CHAN_Y);
+      r[0].q = si_fm(r[0].q, r[1].q);
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_Z );
+	 STORE( &r[0], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 FETCH( &r[0], 1, CHAN_W );
+	 STORE( &r[0], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = micro_min(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = micro_max(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = micro_ge(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = micro_ge(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         FETCH( &r[2], 2, chan_index );
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = si_fs(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         r[1].q = si_fs(r[1].q, r[2].q);
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CND0:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+      /* TGSI_OPCODE_DP2A */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_frc(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_flr(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_rnd(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+    /* TGSI_OPCODE_EX2 */
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_lg2(r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+      /* TGSI_OPCODE_POW */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = micro_pow(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      /* r2 = (r0 * r1) - (r3 * r5)
+       */
+      r[2].q = si_fm(r[3].q, r[5].q);
+      r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+      FETCH(&r[5], 0, CHAN_X);
+
+      /* r3 = (r3 * r2) - (r1 * r5)
+       */
+      r[1].q = si_fm(r[1].q, r[5].q);
+      r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      /* r5 = (r5 * r4) - (r0 * r2)
+       */
+      r[0].q = si_fm(r[0].q, r[2].q);
+      r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+    case TGSI_OPCODE_MULTIPLYMATRIX:
+       assert (0);
+       break;
+
+    case TGSI_OPCODE_ABS:
+       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+          FETCH(&r[0], 0, chan_index);
+
+          r[0].q = micro_abs(r[0].q);
+
+          STORE(&r[0], 0, chan_index);
+       }
+       break;
+
+   case TGSI_OPCODE_RCC:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      r[0].q = si_fa(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_cos(r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ddx(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ddy(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      exec_kilp (mach, inst);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* for enabled ExecMask bits, set the killed bit */
+      mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask;
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_RFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_sin(r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* simple texture lookup */
+      /* src[0] = texcoord */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* Texture lookup with lod bias */
+      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* Texture lookup with explict partial derivatives */
+      /* src[0] = texcoord */
+      /* src[1] = d[strq]/dx */
+      /* src[2] = d[strq]/dy */
+      /* src[3] = sampler unit */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      /* Texture lookup with explit LOD */
+      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_X2D:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_BRA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* skip the call if no execution channels are enabled */
+      if (mach->ExecMask) {
+         /* do the call */
+
+         /* push the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+         /* note that PC was already incremented above */
+         mach->CallStack[mach->CallStackTop++] = *pc;
+         *pc = inst->InstructionExtLabel.Label;
+      }
+      break;
+
+   case TGSI_OPCODE_RET:
+      mach->FuncMask &= ~mach->ExecMask;
+      UPDATE_EXEC_MASK(mach);
+
+      if (mach->ExecMask == 0x0) {
+         /* really return now (otherwise, keep executing */
+
+         if (mach->CallStackTop == 0) {
+            /* returning from main() */
+            *pc = -1;
+            return;
+         }
+         *pc = mach->CallStack[--mach->CallStackTop];
+
+         /* pop the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop > 0);
+         mach->CondMask = mach->CondStack[--mach->CondStackTop];
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+         assert(mach->FuncStackTop > 0);
+         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+         UPDATE_EXEC_MASK(mach);
+      }
+      break;
+
+   case TGSI_OPCODE_SSG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         /* r0 = (r0 < 0.0) ? r1 : r2
+          */
+         r[3].q = si_xor(r[3].q, r[3].q);
+         r[0].q = micro_lt(r[0].q, r[3].q);
+         r[0].q = si_selb(r[1].q, r[2].q, r[0].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( &r[0], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+         r[1].q = micro_cos(r[0].q);
+         STORE( &r[1], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         r[1].q = micro_sin(r[0].q);
+         STORE( &r[1], 0, CHAN_Y );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_NRM:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DIV:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_DP2:
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* push CondMask */
+      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+      FETCH( &r[0], 0, CHAN_X );
+      /* update CondMask */
+      if( ! r[0].u[0] ) {
+         mach->CondMask &= ~0x1;
+      }
+      if( ! r[0].u[1] ) {
+         mach->CondMask &= ~0x2;
+      }
+      if( ! r[0].u[2] ) {
+         mach->CondMask &= ~0x4;
+      }
+      if( ! r[0].u[3] ) {
+         mach->CondMask &= ~0x8;
+      }
+      UPDATE_EXEC_MASK(mach);
+      /* Todo: If CondMask==0, jump to ELSE */
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* invert CondMask wrt previous mask */
+      {
+         uint prevMask;
+         assert(mach->CondStackTop > 0);
+         prevMask = mach->CondStack[mach->CondStackTop - 1];
+         mach->CondMask = ~mach->CondMask & prevMask;
+         UPDATE_EXEC_MASK(mach);
+         /* Todo: If CondMask==0, jump to ENDIF */
+      }
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* pop CondMask */
+      assert(mach->CondStackTop > 0);
+      mach->CondMask = mach->CondStack[--mach->CondStackTop];
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* halt execution */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_REP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+       assert (0);
+       break;
+
+   case TGSI_OPCODE_PUSHA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_POPA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ceil(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = si_csflt(r[0].q, 0);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_NOT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = si_xorbi(r[0].q, 0xff);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_trunc(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_shl(r[0].q, r[1].q);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = micro_ishr(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_AND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_and(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_OR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_or(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_XOR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_xor(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SAD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXF:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      /* fall-through (for now) */
+   case TGSI_OPCODE_BGNLOOP2:
+      /* push LoopMask and ContMasks */
+      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      /* fall-through (for now at least) */
+   case TGSI_OPCODE_ENDLOOP2:
+      /* Restore ContMask, but don't pop */
+      assert(mach->ContStackTop > 0);
+      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
+      if (mach->LoopMask) {
+         /* repeat loop: jump to instruction just past BGNLOOP */
+         *pc = inst->InstructionExtLabel.Label + 1;
+      }
+      else {
+         /* exit loop: pop LoopMask */
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         /* pop ContMask */
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+      }
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BRK:
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* turn off cont channels for each enabled exec channel */
+      mach->ContMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BGNSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_NOISE1:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE2:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE3:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE4:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+
+/**
+ * Run TGSI interpreter.
+ * \return bitmask of "alive" quad components
+ */
+uint
+spu_exec_machine_run( struct spu_exec_machine *mach )
+{
+   uint i;
+   int pc = 0;
+
+   mach->CondMask = 0xf;
+   mach->LoopMask = 0xf;
+   mach->ContMask = 0xf;
+   mach->FuncMask = 0xf;
+   mach->ExecMask = 0xf;
+
+   mach->CondStackTop = 0; /* temporarily subvert this assertion */
+   assert(mach->CondStackTop == 0);
+   assert(mach->LoopStackTop == 0);
+   assert(mach->ContStackTop == 0);
+   assert(mach->CallStackTop == 0);
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
+   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
+
+   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
+      mach->Primitives[0] = 0;
+   }
+
+
+   /* execute declarations (interpolants) */
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      for (i = 0; i < mach->NumDeclarations; i++) {
+	 uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
+	 struct tgsi_full_declaration decl;
+	 unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
+	 unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+
+	 mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
+	 exec_declaration( mach, &decl );
+      }
+   }
+
+   /* execute instructions, until pc is set to -1 */
+   while (pc != -1) {
+      uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB;
+      struct tgsi_full_instruction inst;
+      unsigned long inst_addr = (unsigned long) (mach->Instructions + pc);
+      unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f);
+
+      assert(pc < mach->NumInstructions);
+      mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+      memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
+      exec_instruction( mach, & inst, &pc );
+   }
+
+#if 0
+   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
+   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /*
+       * Scale back depth component.
+       */
+      for (i = 0; i < 4; i++)
+         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
+   }
+#endif
+
+   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+}
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
new file mode 100644
index 0000000000..b4c7661ef6
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -0,0 +1,172 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#if !defined SPU_EXEC_H
+#define SPU_EXEC_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/tgsi/exec/tgsi_exec.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/**
+  * Registers may be treated as float, signed int or unsigned int.
+  */
+union spu_exec_channel
+{
+   float    f[QUAD_SIZE];
+   int      i[QUAD_SIZE];
+   unsigned u[QUAD_SIZE];
+   qword    q;
+};
+
+/**
+  * A vector[RGBA] of channels[4 pixels]
+  */
+struct spu_exec_vector
+{
+   union spu_exec_channel xyzw[NUM_CHANNELS];
+};
+
+/**
+ * For fragment programs, information for computing fragment input
+ * values from plane equation of the triangle/line.
+ */
+struct spu_interp_coef
+{
+   float a0[NUM_CHANNELS];	/* in an xyzw layout */
+   float dadx[NUM_CHANNELS];
+   float dady[NUM_CHANNELS];
+};
+
+
+struct softpipe_tile_cache;  /**< Opaque to TGSI */
+
+/**
+ * Information for sampling textures, which must be implemented
+ * by code outside the TGSI executor.
+ */
+struct spu_sampler
+{
+   const struct pipe_sampler_state *state;
+   struct pipe_texture *texture;
+   /** Get samples for four fragments in a quad */
+   void (*get_samples)(struct spu_sampler *sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE]);
+   void *pipe; /*XXX temporary*/
+   struct softpipe_tile_cache *cache;
+};
+
+
+/**
+ * Run-time virtual machine state for executing TGSI shader.
+ */
+struct spu_exec_machine
+{
+   /*
+    * 32 program temporaries
+    * 4  internal temporaries
+    * 1  address
+    */
+   struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
+				      + TGSI_EXEC_NUM_ADDRS + 1]
+       ALIGN16_ATTRIB;
+
+   struct spu_exec_vector       *Addrs;
+
+   struct spu_sampler           *Samplers;
+
+   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
+   unsigned                      ImmLimit;
+   float                         (*Consts)[4];
+   struct spu_exec_vector       *Inputs;
+   struct spu_exec_vector       *Outputs;
+   unsigned                      Processor;
+
+   /* GEOMETRY processor only. */
+   unsigned                      *Primitives;
+
+   /* FRAGMENT processor only. */
+   const struct spu_interp_coef *InterpCoefs;
+   struct spu_exec_vector       QuadPos;
+
+   /* Conditional execution masks */
+   uint CondMask;  /**< For IF/ELSE/ENDIF */
+   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
+   uint ContMask;  /**< For loop CONT statements */
+   uint FuncMask;  /**< For function calls */
+   uint ExecMask;  /**< = CondMask & LoopMask */
+
+   /** Condition mask stack (for nested conditionals) */
+   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
+   int CondStackTop;
+
+   /** Loop mask stack (for nested loops) */
+   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int LoopStackTop;
+
+   /** Loop continue mask stack (see comments in tgsi_exec.c) */
+   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int ContStackTop;
+
+   /** Function execution mask stack (for executing subroutine code) */
+   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int FuncStackTop;
+
+   /** Function call stack for saving/restoring the program counter */
+   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int CallStackTop;
+
+   struct tgsi_full_instruction *Instructions;
+   uint NumInstructions;
+
+   struct tgsi_full_declaration *Declarations;
+   uint NumDeclarations;
+};
+
+
+extern void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor);
+
+extern uint
+spu_exec_machine_run( struct spu_exec_machine *mach );
+
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SPU_EXEC_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
new file mode 100644
index 0000000000..e375197fe6
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -0,0 +1,567 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/* main() for Cell SPU code */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+//#include "spu_test.h"
+#include "spu_vertex_shader.h"
+#include "pipe/cell/common.h"
+#include "pipe/p_defines.h"
+
+
+/*
+helpful headers:
+/usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h
+/opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
+*/
+
+boolean Debug = FALSE;
+
+struct spu_global spu;
+
+struct spu_vs_context draw;
+
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writting a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const uint status[4] ALIGN16_ATTRIB
+      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
+
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+/**
+ * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
+ * tiles back to the main framebuffer.
+ */
+static void
+really_clear_tiles(uint surfaceIndex)
+{
+   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   uint i;
+
+   if (surfaceIndex == 0) {
+      clear_c_tile(&spu.ctile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         }
+      }
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+#if 0
+   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+}
+
+
+static void
+cmd_clear_surface(const struct cell_command_clear_surface *clear)
+{
+   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   uint i;
+
+   if (Debug)
+      printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
+             clear->surface, clear->value);
+
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+   /* set all tile's status to CLEAR */
+   if (clear->surface == 0) {
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
+      spu.fb.color_clear_value = clear->value;
+   }
+   else {
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
+      spu.fb.depth_clear_value = clear->value;
+   }
+   return;
+#endif
+
+   if (clear->surface == 0) {
+      spu.fb.color_clear_value = clear->value;
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      spu.fb.depth_clear_value = clear->value;
+      clear_z_tile(&spu.ztile);
+   }
+
+   /*
+   printf("SPU: %s num=%d w=%d h=%d\n",
+          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
+   */
+
+   for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+      uint tx = i % spu.fb.width_tiles;
+      uint ty = i / spu.fb.width_tiles;
+      if (clear->surface == 0)
+         put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+      else
+         put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      /* XXX we don't want this here, but it fixes bad tile results */
+   }
+
+#if 0
+   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+
+   if (Debug)
+      printf("SPU %u: CLEAR SURF done\n", spu.init.id);
+}
+
+
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   if (Debug)
+      printf("SPU %u: RELEASE VERTS %u\n",
+             spu.init.id, release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
+   release_buffer(release->vertex_buf);
+}
+
+
+static void
+cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
+{
+   if (Debug)
+      printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+             spu.init.id,
+             cmd->width,
+             cmd->height,
+             cmd->color_start,
+             cmd->color_format,
+             cmd->depth_format);
+
+   ASSERT_ALIGN16(cmd->color_start);
+   ASSERT_ALIGN16(cmd->depth_start);
+
+   spu.fb.color_start = cmd->color_start;
+   spu.fb.depth_start = cmd->depth_start;
+   spu.fb.color_format = cmd->color_format;
+   spu.fb.depth_format = cmd->depth_format;
+   spu.fb.width = cmd->width;
+   spu.fb.height = cmd->height;
+   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
+   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
+
+   if (spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM)
+      spu.fb.zsize = 4;
+   else if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM)
+      spu.fb.zsize = 2;
+   else
+      spu.fb.zsize = 0;
+
+   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
+      spu.color_shuffle = ((vector unsigned char) {
+                              12, 0, 4, 8, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
+   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
+      spu.color_shuffle = ((vector unsigned char) {
+                              8, 4, 0, 12, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
+   else
+      ASSERT(0);
+}
+
+
+static void
+cmd_state_blend(const struct pipe_blend_state *state)
+{
+   if (Debug)
+      printf("SPU %u: BLEND: enabled %d\n",
+             spu.init.id,
+             state->blend_enable);
+
+   memcpy(&spu.blend, state, sizeof(*state));
+}
+
+
+static void
+cmd_state_depth_stencil(const struct pipe_depth_stencil_alpha_state *state)
+{
+   if (Debug)
+      printf("SPU %u: DEPTH_STENCIL: ztest %d\n",
+             spu.init.id,
+             state->depth.enabled);
+
+   memcpy(&spu.depth_stencil, state, sizeof(*state));
+}
+
+
+static void
+cmd_state_sampler(const struct pipe_sampler_state *state)
+{
+   if (Debug)
+      printf("SPU %u: SAMPLER\n",
+             spu.init.id);
+
+   memcpy(&spu.sampler[0], state, sizeof(*state));
+   if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture = sample_texture_bilinear;
+   else
+      spu.sample_texture = sample_texture_nearest;
+}
+
+
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   if (Debug)
+      printf("SPU %u: TEXTURE at %p  size %u x %u\n",
+             spu.init.id, texture->start, texture->width, texture->height);
+
+   memcpy(&spu.texture, texture, sizeof(*texture));
+   spu.tex_size = (vector float)
+      { spu.texture.width, spu.texture.height, 0.0, 0.0};
+   spu.tex_size_mask = (vector unsigned int)
+      { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
+}
+
+
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+   if (Debug) {
+      printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
+             vinfo->num_attribs);
+   }
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
+   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
+}
+
+
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned attr = vs_info->attr;
+
+   ASSERT(attr < PIPE_ATTRIB_MAX);
+   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+   draw.vertex_fetch.format[attr] = vs_info->format;
+   draw.vertex_fetch.dirty = 1;
+}
+
+
+static void
+cmd_finish(void)
+{
+   if (Debug)
+      printf("SPU %u: FINISH\n", spu.init.id);
+   really_clear_tiles(0);
+   /* wait for all outstanding DMAs to finish */
+   mfc_write_tag_mask(~0);
+   mfc_read_tag_status_all();
+   /* send mbox message to PPU */
+   spu_write_out_mbox(CELL_CMD_FINISH);
+}
+
+
+/**
+ * Execute a batch of commands
+ * The opcode param encodes the location of the buffer and its size.
+ */
+static void
+cmd_batch(uint opcode)
+{
+   const uint buf = (opcode >> 8) & 0xff;
+   uint size = (opcode >> 16);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
+   uint pos;
+
+   if (Debug)
+      printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
+             spu.init.id, buf, size, spu.init.buffers[buf]);
+
+   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   size = ROUNDUP16(size);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   mfc_get(buffer,  /* dest */
+           (unsigned int) spu.init.buffers[buf],  /* src */
+           size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   /* Tell PPU we're done copying the buffer to local store */
+   if (Debug)
+      printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
+   release_buffer(buf);
+
+   for (pos = 0; pos < usize; /* no incr */) {
+      switch (buffer[pos]) {
+      case CELL_CMD_STATE_FRAMEBUFFER:
+         {
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
+         }
+         break;
+      case CELL_CMD_CLEAR_SURFACE:
+         {
+            struct cell_command_clear_surface *clr
+               = (struct cell_command_clear_surface *) &buffer[pos];
+            cmd_clear_surface(clr);
+            pos += sizeof(*clr) / 8;
+         }
+         break;
+      case CELL_CMD_RENDER:
+         {
+            struct cell_command_render *render
+               = (struct cell_command_render *) &buffer[pos];
+            uint pos_incr;
+            cmd_render(render, &pos_incr);
+            pos += pos_incr;
+         }
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
+         break;
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_STATE_BLEND:
+         cmd_state_blend((struct pipe_blend_state *)
+                                 &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
+         break;
+      case CELL_CMD_STATE_DEPTH_STENCIL:
+         cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
+                                 &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8);
+         break;
+      case CELL_CMD_STATE_SAMPLER:
+         cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8);
+         break;
+      case CELL_CMD_STATE_TEXTURE:
+         cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8);
+         break;
+      case CELL_CMD_STATE_VERTEX_INFO:
+         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+         break;
+      default:
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+         ASSERT(0);
+         break;
+      }
+   }
+
+   if (Debug)
+      printf("SPU %u: BATCH complete\n", spu.init.id);
+}
+
+
+/**
+ * Temporary/simple main loop for SPEs: Get a command, execute it, repeat.
+ */
+static void
+main_loop(void)
+{
+   struct cell_command cmd;
+   int exitFlag = 0;
+
+   if (Debug)
+      printf("SPU %u: Enter main loop\n", spu.init.id);
+
+   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
+   ASSERT_ALIGN16(&cmd);
+
+   while (!exitFlag) {
+      unsigned opcode;
+      int tag = 0;
+
+      if (Debug)
+         printf("SPU %u: Wait for cmd...\n", spu.init.id);
+
+      /* read/wait from mailbox */
+      opcode = (unsigned int) spu_read_in_mbox();
+
+      if (Debug)
+         printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode);
+
+      /* command payload */
+      mfc_get(&cmd,  /* dest */
+              (unsigned int) spu.init.cmd, /* src */
+              sizeof(struct cell_command), /* bytes */
+              tag,
+              0, /* tid */
+              0  /* rid */);
+      wait_on_mask( 1 << tag );
+
+      /*
+       * NOTE: most commands should be contained in a batch buffer
+       */
+
+      switch (opcode & CELL_CMD_OPCODE_MASK) {
+      case CELL_CMD_EXIT:
+         if (Debug)
+            printf("SPU %u: EXIT\n", spu.init.id);
+         exitFlag = 1;
+         break;
+      case CELL_CMD_VS_EXECUTE:
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+         break;
+      case CELL_CMD_BATCH:
+         cmd_batch(opcode);
+         break;
+      default:
+         printf("Bad opcode!\n");
+      }
+
+   }
+
+   if (Debug)
+      printf("SPU %u: Exit main loop\n", spu.init.id);
+}
+
+
+
+static void
+one_time_init(void)
+{
+   memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
+   memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
+   invalidate_tex_cache();
+}
+
+
+
+/* In some versions of the SDK the SPE main takes 'unsigned long' as a
+ * parameter.  In others it takes 'unsigned long long'.  Use a define to
+ * select between the two.
+ */
+#ifdef SPU_MAIN_PARAM_LONG_LONG
+typedef unsigned long long main_param_t;
+#else
+typedef unsigned long main_param_t;
+#endif
+
+/**
+ * SPE entrypoint.
+ */
+int
+main(main_param_t speid, main_param_t argp)
+{
+   int tag = 0;
+
+   (void) speid;
+
+   ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+   ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+
+   one_time_init();
+
+   if (Debug)
+      printf("SPU: main() speid=%lu\n", speid);
+
+   mfc_get(&spu.init,  /* dest */
+           (unsigned int) argp, /* src */
+           sizeof(struct cell_init_info), /* bytes */
+           tag,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask( 1 << tag );
+
+#if 0
+   if (spu.init.id==0)
+      spu_test_misc();
+#endif
+
+   main_loop();
+
+   return 0;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
new file mode 100644
index 0000000000..1710a17512
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -0,0 +1,177 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_MAIN_H
+#define SPU_MAIN_H
+
+
+#include <spu_mfcio.h>
+
+#include "pipe/cell/common.h"
+#include "pipe/draw/draw_vertex.h"
+#include "pipe/p_state.h"
+
+
+
+#define MAX_WIDTH 1024
+#define MAX_HEIGHT 1024
+
+
+typedef union {
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
+} tile_t;
+
+
+#define TILE_STATUS_CLEAR   1
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
+
+
+struct spu_framebuffer {
+   void *color_start;              /**< addr of color surface in main memory */
+   void *depth_start;              /**< addr of depth surface in main memory */
+   enum pipe_format color_format;
+   enum pipe_format depth_format;
+   uint width, height;             /**< size in pixels */
+   uint width_tiles, height_tiles; /**< width and height in tiles */
+
+   uint color_clear_value;
+   uint depth_clear_value;
+
+   uint zsize;                     /**< 0, 2 or 4 bytes per Z */
+} ALIGN16_ATTRIB;
+
+
+/**
+ * All SPU global/context state will be in singleton object of this type:
+ */
+struct spu_global
+{
+   struct cell_init_info init;
+
+   struct spu_framebuffer fb;
+   struct pipe_blend_state blend_stencil;
+   struct pipe_depth_stencil_alpha_state depth_stencil;
+   struct pipe_blend_state blend;
+   struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct cell_command_texture texture;
+
+   struct vertex_info vertex_info;
+
+   /* XXX more state to come */
+
+
+   /** current color and Z tiles */
+   tile_t ctile ALIGN16_ATTRIB;
+   tile_t ztile ALIGN16_ATTRIB;
+
+   /** Current tiles' status */
+   ubyte cur_ctile_status, cur_ztile_status;
+
+   /** Status of all tiles in framebuffer */
+   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+
+
+   /** for converting RGBA to PIPE_FORMAT_x colors */
+   vector unsigned char color_shuffle;
+
+   vector float tex_size;
+   vector unsigned int tex_size_mask; /**< == int(size - 1) */
+
+   vector float (*sample_texture)(vector float texcoord);
+
+} ALIGN16_ATTRIB;
+
+
+extern struct spu_global spu;
+extern boolean Debug;
+
+
+
+
+/* DMA TAGS */
+
+#define TAG_SURFACE_CLEAR     10
+#define TAG_VERTEX_BUFFER     11
+#define TAG_READ_TILE_COLOR   12
+#define TAG_READ_TILE_Z       13
+#define TAG_WRITE_TILE_COLOR  14
+#define TAG_WRITE_TILE_Z      15
+#define TAG_INDEX_BUFFER      16
+#define TAG_BATCH_BUFFER      17
+#define TAG_MISC              18
+#define TAG_TEXTURE_TILE      19
+#define TAG_INSTRUCTION_FETCH 20
+
+
+
+static INLINE void
+wait_on_mask(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_any();
+}
+
+
+static INLINE void
+wait_on_mask_all(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_all();
+}
+
+
+
+
+
+static INLINE void
+memset16(ushort *d, ushort value, uint count)
+{
+   uint i;
+   for (i = 0; i < count; i++)
+      d[i] = value;
+}
+
+
+static INLINE void
+memset32(uint *d, uint value, uint count)
+{
+   uint i;
+   for (i = 0; i < count; i++)
+      d[i] = value;
+}
+
+
+#endif /* SPU_MAIN_H */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
new file mode 100644
index 0000000000..932fb500b3
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -0,0 +1,301 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdio.h>
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_tri.h"
+#include "spu_tile.h"
+#include "pipe/cell/common.h"
+
+
+
+/**
+ * Given a rendering command's bounding box (in pixels) compute the
+ * location of the corresponding screen tile bounding box.
+ */
+static INLINE void
+tile_bounding_box(const struct cell_command_render *render,
+                  uint *txmin, uint *tymin,
+                  uint *box_num_tiles, uint *box_width_tiles)
+{
+#if 0
+   /* Debug: full-window bounding box */
+   uint txmax = spu.fb.width_tiles - 1;
+   uint tymax = spu.fb.height_tiles - 1;
+   *txmin = 0;
+   *tymin = 0;
+   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   *box_width_tiles = spu.fb.width_tiles;
+   (void) render;
+   (void) txmax;
+   (void) tymax;
+#else
+   uint txmax, tymax, box_height_tiles;
+
+   *txmin = (uint) render->xmin / TILE_SIZE;
+   *tymin = (uint) render->ymin / TILE_SIZE;
+   txmax = (uint) render->xmax / TILE_SIZE;
+   tymax = (uint) render->ymax / TILE_SIZE;
+   if (txmax >= spu.fb.width_tiles)
+      txmax = spu.fb.width_tiles-1;
+   if (tymax >= spu.fb.height_tiles)
+      tymax = spu.fb.height_tiles-1;
+   *box_width_tiles = txmax - *txmin + 1;
+   box_height_tiles = tymax - *tymin + 1;
+   *box_num_tiles = *box_width_tiles * box_height_tiles;
+#endif
+#if 0
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
+          render->xmin, render->ymin, render->xmax, render->ymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
+#endif
+}
+
+
+/** Check if the tile at (tx,ty) belongs to this SPU */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
+}
+
+
+/**
+ * Start fetching non-clear color/Z tiles from main memory
+ */
+static INLINE void
+get_cz_tiles(uint tx, uint ty)
+{
+   if (spu.depth_stencil.depth.enabled) {
+      if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+         //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
+         get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
+         spu.cur_ztile_status = TILE_STATUS_GETTING;
+      }
+   }
+
+   if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
+      //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
+      get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_GETTING;
+   }
+}
+
+
+/**
+ * Start putting dirty color/Z tiles back to main memory
+ */
+static INLINE void
+put_cz_tiles(uint tx, uint ty)
+{
+   if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
+      put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
+   }
+   else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty);
+   }
+
+   if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
+      put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty);
+   }
+}
+
+
+/**
+ * Wait for 'put' of color/z tiles to complete.
+ */
+static INLINE void
+wait_put_cz_tiles(void)
+{
+   wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+   if (spu.depth_stencil.depth.enabled) {
+      wait_on_mask(1 << TAG_WRITE_TILE_Z);
+   }
+}
+
+
+/**
+ * Render primitives
+ * \param pos_incr  returns value indicating how may words to skip after
+ *                  this command in the batch buffer
+ */
+void
+cmd_render(const struct cell_command_render *render, uint *pos_incr)
+{
+   /* we'll DMA into these buffers */
+   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   const uint vertex_size = render->vertex_size; /* in bytes */
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   uint index_bytes;
+   const ubyte *vertices;
+   const ushort *indexes;
+   uint i, j;
+
+
+   if (Debug) {
+      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
+             "inline_vert=%u\n",
+             spu.init.id,
+             render->prim_type,
+             render->num_verts,
+             render->num_indexes,
+             render->inline_verts);
+
+      /*
+      printf("       bound: %g, %g .. %g, %g\n",
+             render->xmin, render->ymin, render->xmax, render->ymax);
+      */
+   }
+
+   ASSERT(sizeof(*render) % 4 == 0);
+   ASSERT(total_vertex_bytes % 16 == 0);
+   ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
+   ASSERT(render->num_indexes % 3 == 0);
+
+
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   index_bytes = ROUNDUP8(render->num_indexes * 2);
+   *pos_incr = index_bytes / 8 + sizeof(*render) / 8;
+
+
+   if (render->inline_verts) {
+      /* Vertices are after indexes in batch buffer at next 16-byte addr */
+      vertices = (const ubyte *) render + (*pos_incr * 8);
+      vertices = (const ubyte *) align_pointer((void *) vertices, 16);
+      ASSERT_ALIGN16(vertices);
+      *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8;
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data we won't use */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
+              total_vertex_bytes,  /* size */
+              TAG_VERTEX_BUFFER,
+              0, /* tid */
+              0  /* rid */);
+
+      vertices = vertex_data;
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+   }
+
+
+   /**
+    ** find tiles which intersect the prim bounding box
+    **/
+   uint txmin, tymin, box_width_tiles, box_num_tiles;
+   tile_bounding_box(render, &txmin, &tymin,
+                     &box_num_tiles, &box_width_tiles);
+
+
+   /* make sure any pending clears have completed */
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
+
+
+   /**
+    ** loop over tiles, rendering tris
+    **/
+   for (i = 0; i < box_num_tiles; i++) {
+      const uint tx = txmin + i % box_width_tiles;
+      const uint ty = tymin + i / box_width_tiles;
+
+      ASSERT(tx < spu.fb.width_tiles);
+      ASSERT(ty < spu.fb.height_tiles);
+
+      if (!my_tile(tx, ty))
+         continue;
+
+      spu.cur_ctile_status = spu.ctile_status[ty][tx];
+      spu.cur_ztile_status = spu.ztile_status[ty][tx];
+
+      get_cz_tiles(tx, ty);
+
+      uint drawn = 0;
+
+      /* loop over tris */
+      for (j = 0; j < render->num_indexes; j += 3) {
+         const float *v0, *v1, *v2;
+
+         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
+         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
+         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
+
+         drawn += tri_draw(v0, v1, v2, tx, ty);
+      }
+
+      //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
+
+      /* write color/z tiles back to main framebuffer, if dirtied */
+      put_cz_tiles(tx, ty);
+
+      wait_put_cz_tiles(); /* XXX seems unnecessary... */
+
+      spu.ctile_status[ty][tx] = spu.cur_ctile_status;
+      spu.ztile_status[ty][tx] = spu.cur_ztile_status;
+   }
+
+   if (Debug)
+      printf("SPU %u: RENDER done\n",
+             spu.init.id);
+}
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_render.h b/src/gallium/drivers/cell/spu/spu_render.h
new file mode 100644
index 0000000000..fbcdc5ec31
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_render.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_RENDER_H
+#define SPU_RENDER_H
+
+#include "pipe/cell/common.h"
+
+extern void
+cmd_render(const struct cell_command_render *render, uint *pos_incr);
+
+#endif /* SPU_RENDER_H */
+
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
new file mode 100644
index 0000000000..3962aaa4a9
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -0,0 +1,217 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "spu_main.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_colorpack.h"
+
+
+/**
+ * Number of texture tiles to cache.
+ * Note that this will probably be the largest consumer of SPU local store/
+ * memory for this driver!
+ */
+#define CACHE_SIZE 16
+
+static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
+
+static vector unsigned int tex_tile_xy[CACHE_SIZE];
+
+
+
+/**
+ * Mark all tex cache entries as invalid.
+ */
+void
+invalidate_tex_cache(void)
+{
+   /* XXX memset? */
+   uint i;
+   for (i = 0; i < CACHE_SIZE; i++) {
+      tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U });
+   }
+}
+
+
+/**
+ * Return the cache pos/index which corresponds to tile (tx,ty)
+ */
+static INLINE uint
+cache_pos(vector unsigned int txty)
+{
+   uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE;
+   return pos;
+}
+
+
+/**
+ * Make sure the tile for texel (i,j) is present, return its position/index
+ * in the cache.
+ */
+static uint
+get_tex_tile(vector unsigned int ij)
+{
+   /* tile address: tx,ty */
+   const vector unsigned int txty = spu_rlmask(ij, -5);  /* divide by 32 */
+   const uint pos = cache_pos(txty);
+
+   if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) ||
+       (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) {
+
+      /* texture cache miss, fetch tile from main memory */
+      const uint tiles_per_row = spu.texture.width / TILE_SIZE;
+      const uint bytes_per_tile = sizeof(tile_t);
+      const void *src = (const ubyte *) spu.texture.start
+         + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile;
+
+      printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
+             spu.init.id,
+             spu_extract(txty,0),
+             spu_extract(txty,1),
+             pos,
+             spu_extract(tex_tile_xy[pos],0),
+             spu_extract(tex_tile_xy[pos],1));
+
+      ASSERT_ALIGN16(tex_tiles[pos].ui);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(tex_tiles[pos].ui,  /* dest */
+              (unsigned int) src,
+              bytes_per_tile,      /* size */
+              TAG_TEXTURE_TILE,
+              0, /* tid */
+              0  /* rid */);
+
+      wait_on_mask(1 << TAG_TEXTURE_TILE);
+
+      tex_tile_xy[pos] = txty;
+   }
+   else {
+#if 0
+      printf("SPU %u: tex cache HIT at %d, %d\n",
+             spu.init.id, tx, ty);
+#endif
+   }
+
+   return pos;
+}
+
+
+/**
+ * Get texture sample at texcoord.
+ * XXX this is extremely primitive for now.
+ */
+vector float
+sample_texture_nearest(vector float texcoord)
+{
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
+   itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
+   uint pos = get_tex_tile(itc);
+   uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
+   return spu_unpack_A8R8G8B8(texel);
+}
+
+
+vector float
+sample_texture_bilinear(vector float texcoord)
+{
+   static const vector unsigned int offset10 = {1, 0, 0, 0};
+   static const vector unsigned int offset01 = {0, 1, 0, 0};
+
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
+
+   /* integer texcoords S,T: */
+   vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
+   vector unsigned int itc01 = spu_add(itc00, offset01);
+   vector unsigned int itc10 = spu_add(itc00, offset10);
+   vector unsigned int itc11 = spu_add(itc10, offset01);
+
+   /* mask (GL_REPEAT) */
+   itc00 = spu_and(itc00, spu.tex_size_mask);
+   itc01 = spu_and(itc01, spu.tex_size_mask);
+   itc10 = spu_and(itc10, spu.tex_size_mask);
+   itc11 = spu_and(itc11, spu.tex_size_mask);
+
+   /* intra tile addr */
+   vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
+   vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1);
+   vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
+   vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
+
+   /* get tile cache positions */
+   uint pos00 = get_tex_tile(itc00);
+   uint pos01, pos10, pos11;
+   if ((spu_extract(ij00, 0) < TILE_SIZE-1) &&
+       (spu_extract(ij00, 1) < TILE_SIZE-1)) {
+      /* all texels are in the same tile */
+      pos01 = pos10 = pos11 = pos00;
+   }
+   else {
+      pos01 = get_tex_tile(itc01);
+      pos10 = get_tex_tile(itc10);
+      pos11 = get_tex_tile(itc11);
+   }
+
+   /* get texels from tiles and convert to float[4] */
+   vector float texel00 = spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
+   vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
+   vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
+   vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
+   vector signed int itc1024 = spu_convts(tc1024, 0);
+   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
+   vector float weight = spu_convtf(itc1024, 10);
+
+   /* smeared frac and 1-frac */
+   vector float sfrac = spu_splats(spu_extract(weight, 0));
+   vector float tfrac = spu_splats(spu_extract(weight, 1));
+   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
+   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
+
+   /* multiply the samples (colors) by the S/T weights */
+   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
+   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
+   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
+   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
+
+   /* compute sum of weighted samples */
+   vector float texel_sum = spu_add(texel00, texel01);
+   texel_sum = spu_add(texel_sum, texel10);
+   texel_sum = spu_add(texel_sum, texel11);
+
+   return texel_sum;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
new file mode 100644
index 0000000000..95eb87080f
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -0,0 +1,47 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TEXTURE_H
+#define SPU_TEXTURE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+extern void
+invalidate_tex_cache(void);
+
+
+extern vector float
+sample_texture_nearest(vector float texcoord);
+
+
+extern vector float
+sample_texture_bilinear(vector float texcoord);
+
+
+#endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
new file mode 100644
index 0000000000..12dc246328
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -0,0 +1,83 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#include "spu_tile.h"
+#include "spu_main.h"
+
+
+void
+get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
+{
+   const uint offset = ty * spu.fb.width_tiles + tx;
+   const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4);
+   const ubyte *src = zBuf ? spu.fb.depth_start : spu.fb.color_start;
+
+   src += offset * bytesPerTile;
+
+   ASSERT(tx < spu.fb.width_tiles);
+   ASSERT(ty < spu.fb.height_tiles);
+   ASSERT_ALIGN16(tile);
+   /*
+   printf("get_tile:  dest: %p  src: 0x%x  size: %d\n",
+          tile, (unsigned int) src, bytesPerTile);
+   */
+   mfc_get(tile->ui,  /* dest in local memory */
+           (unsigned int) src, /* src in main memory */
+           bytesPerTile,
+           tag,
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+void
+put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
+{
+   const uint offset = ty * spu.fb.width_tiles + tx;
+   const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4);
+   ubyte *dst = zBuf ? spu.fb.depth_start : spu.fb.color_start;
+
+   dst += offset * bytesPerTile;
+
+   ASSERT(tx < spu.fb.width_tiles);
+   ASSERT(ty < spu.fb.height_tiles);
+   ASSERT_ALIGN16(tile);
+   /*
+   printf("SPU %u: put_tile:  src: %p  dst: 0x%x  size: %d\n",
+          spu.init.id,
+          tile, (unsigned int) dst, bytesPerTile);
+   */
+   mfc_put((void *) tile->ui,  /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           bytesPerTile,
+           tag,
+           0, /* tid */
+           0  /* rid */);
+}
+
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
new file mode 100644
index 0000000000..e53340a55a
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -0,0 +1,73 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TILE_H
+#define SPU_TILE_H
+
+
+#include <libmisc.h>
+#include <spu_mfcio.h>
+#include "spu_main.h"
+#include "pipe/cell/common.h"
+
+
+
+void
+get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
+
+void
+put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
+
+
+
+static INLINE void
+clear_c_tile(tile_t *ctile)
+{
+   memset32((uint*) ctile->ui,
+            spu.fb.color_clear_value,
+            TILE_SIZE * TILE_SIZE);
+}
+
+
+static INLINE void
+clear_z_tile(tile_t *ztile)
+{
+   if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
+      memset16((ushort*) ztile->us,
+               spu.fb.depth_clear_value,
+               TILE_SIZE * TILE_SIZE);
+   }
+   else {
+      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
+      memset32((uint*) ztile->ui,
+               spu.fb.depth_clear_value,
+               TILE_SIZE * TILE_SIZE);
+   }
+}
+
+
+#endif /* SPU_TILE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
new file mode 100644
index 0000000000..be9624cf7d
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -0,0 +1,926 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Triangle rendering within a tile.
+ */
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "pipe/p_util.h"
+#include "spu_blend.h"
+#include "spu_colorpack.h"
+#include "spu_main.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_tri.h"
+
+#include "spu_ztest.h"
+
+
+/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
+typedef vector unsigned int mask_t;
+
+typedef union
+{
+   vector float v;
+   float f[4];
+} float4;
+
+
+/**
+ * Simplified types taken from other parts of Gallium
+ */
+struct vertex_header {
+   vector float data[1];
+};
+
+
+
+/* XXX fix this */
+#undef CEILF
+#define CEILF(X) ((float) (int) ((X) + 0.99999))
+
+
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
+#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
+#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
+#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
+#define MASK_ALL          0xf
+
+
+#define DEBUG_VERTS 0
+
+/**
+ * Triangle edge info
+ */
+struct edge {
+   float dx;		/**< X(v1) - X(v0), used only during setup */
+   float dy;		/**< Y(v1) - Y(v0), used only during setup */
+   float dxdy;		/**< dx/dy */
+   float sx, sy;	/**< first sample point coord */
+   int lines;		/**< number of lines on this edge */
+};
+
+
+struct interp_coef
+{
+   float4 a0;
+   float4 dadx;
+   float4 dady;
+};
+
+
+/**
+ * Triangle setup info (derived from draw_stage).
+ * Also used for line drawing (taking some liberties).
+ */
+struct setup_stage {
+
+   /* Vertices are just an array of floats making up each attribute in
+    * turn.  Currently fixed at 4 floats, but should change in time.
+    * Codegen will help cope with this.
+    */
+   const struct vertex_header *vmax;
+   const struct vertex_header *vmid;
+   const struct vertex_header *vmin;
+   const struct vertex_header *vprovoke;
+
+   struct edge ebot;
+   struct edge etop;
+   struct edge emaj;
+
+   float oneoverarea;
+
+   uint tx, ty;
+
+   int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
+
+#if 0
+   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
+#else
+   struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
+#endif
+
+#if 0
+   struct quad_header quad; 
+#endif
+
+   struct {
+      int left[2];   /**< [0] = row0, [1] = row1 */
+      int right[2];
+      int y;
+      unsigned y_flags;
+      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
+   } span;
+};
+
+
+
+static struct setup_stage setup;
+
+
+
+
+#if 0
+/**
+ * Basically a cast wrapper.
+ */
+static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
+{
+   return (struct setup_stage *)stage;
+}
+#endif
+
+#if 0
+/**
+ * Clip setup.quad against the scissor/surface bounds.
+ */
+static INLINE void
+quad_clip(struct setup_stage *setup)
+{
+   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
+   const int minx = (int) cliprect->minx;
+   const int maxx = (int) cliprect->maxx;
+   const int miny = (int) cliprect->miny;
+   const int maxy = (int) cliprect->maxy;
+
+   if (setup.quad.x0 >= maxx ||
+       setup.quad.y0 >= maxy ||
+       setup.quad.x0 + 1 < minx ||
+       setup.quad.y0 + 1 < miny) {
+      /* totally clipped */
+      setup.quad.mask = 0x0;
+      return;
+   }
+   if (setup.quad.x0 < minx)
+      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+   if (setup.quad.y0 < miny)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+   if (setup.quad.x0 == maxx - 1)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+   if (setup.quad.y0 == maxy - 1)
+      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+}
+#endif
+
+#if 0
+/**
+ * Emit a quad (pass to next stage) with clipping.
+ */
+static INLINE void
+clip_emit_quad(struct setup_stage *setup)
+{
+   quad_clip(setup);
+   if (setup.quad.mask) {
+      struct softpipe_context *sp = setup.softpipe;
+      sp->quad.first->run(sp->quad.first, &setup.quad);
+   }
+}
+#endif
+
+/**
+ * Evaluate attribute coefficients (plane equations) to compute
+ * attribute values for the four fragments in a quad.
+ * Eg: four colors will be compute.
+ */
+static INLINE void
+eval_coeff(uint slot, float x, float y, vector float result[4])
+{
+   switch (spu.vertex_info.interp_mode[slot]) {
+   case INTERP_CONSTANT:
+      result[QUAD_TOP_LEFT] =
+      result[QUAD_TOP_RIGHT] =
+      result[QUAD_BOTTOM_LEFT] =
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
+      break;
+
+   case INTERP_LINEAR:
+      /* fall-through, for now */
+   default:
+      {
+         register vector float dadx = setup.coef[slot].dadx.v;
+         register vector float dady = setup.coef[slot].dady.v;
+         register vector float topLeft
+            = spu_add(setup.coef[slot].a0.v,
+                      spu_add(spu_mul(spu_splats(x), dadx),
+                              spu_mul(spu_splats(y), dady)));
+
+         result[QUAD_TOP_LEFT] = topLeft;
+         result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
+         result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
+         result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
+      }
+   }
+}
+
+
+static INLINE vector float
+eval_z(float x, float y)
+{
+   const uint slot = 0;
+   const float dzdx = setup.coef[slot].dadx.f[2];
+   const float dzdy = setup.coef[slot].dady.f[2];
+   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+   const vector float topLeftv = spu_splats(topLeft);
+   const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
+   return spu_add(topLeftv, derivs);
+}
+
+
+static INLINE mask_t
+do_depth_test(int x, int y, mask_t quadmask)
+{
+   float4 zvals;
+   mask_t mask;
+
+   zvals.v = eval_z((float) x, (float) y);
+
+   if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
+      int ix = (x - setup.cliprect_minx) / 4;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z16_test_less(zvals.v, &spu.ztile.us8[iy][ix], x>>1, quadmask);
+   }
+   else {
+      int ix = (x - setup.cliprect_minx) / 2;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
+   }
+
+   if (spu_extract(spu_orx(mask), 0))
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+
+   return mask;
+}
+
+
+/**
+ * Emit a quad (pass to next stage).  No clipping is done.
+ * Note: about 1/5 to 1/7 of the time, mask is zero and this function
+ * should be skipped.  But adding the test for that slows things down
+ * overall.
+ */
+static INLINE void
+emit_quad( int x, int y, mask_t mask )
+{
+#if 0
+   struct softpipe_context *sp = setup.softpipe;
+   setup.quad.x0 = x;
+   setup.quad.y0 = y;
+   setup.quad.mask = mask;
+   sp->quad.first->run(sp->quad.first, &setup.quad);
+#else
+
+   if (spu.depth_stencil.depth.enabled) {
+      mask = do_depth_test(x, y, mask);
+   }
+
+   /* If any bits in mask are set... */
+   if (spu_extract(spu_orx(mask), 0)) {
+      const int ix = x - setup.cliprect_minx;
+      const int iy = y - setup.cliprect_miny;
+      const vector unsigned char shuffle = spu.color_shuffle;
+      vector float colors[4];
+
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+
+      if (spu.texture.start) {
+         /* texture mapping */
+         vector float texcoords[4];
+         eval_coeff(2, (float) x, (float) y, texcoords);
+
+         if (spu_extract(mask, 0))
+            colors[0] = spu.sample_texture(texcoords[0]);
+         if (spu_extract(mask, 1))
+            colors[1] = spu.sample_texture(texcoords[1]);
+         if (spu_extract(mask, 2))
+            colors[2] = spu.sample_texture(texcoords[2]);
+         if (spu_extract(mask, 3))
+            colors[3] = spu.sample_texture(texcoords[3]);
+      }
+      else {
+         /* simple shading */
+         eval_coeff(1, (float) x, (float) y, colors);
+      }
+
+#if 1
+      if (spu.blend.blend_enable)
+         blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
+#endif
+
+      if (spu_extract(mask, 0))
+         spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
+      if (spu_extract(mask, 1))
+         spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
+      if (spu_extract(mask, 2))
+         spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
+      if (spu_extract(mask, 3))
+         spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
+
+#if 0
+      /* SIMD_Z with swizzled color buffer (someday) */
+      vector unsigned int uicolors = *((vector unsigned int *) &colors);
+      spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
+#endif
+   }
+
+#endif
+}
+
+
+/**
+ * Given an X or Y coordinate, return the block/quad coordinate that it
+ * belongs to.
+ */
+static INLINE int block( int x )
+{
+   return x & ~1;
+}
+
+
+/**
+ * Compute mask which indicates which pixels in the 2x2 quad are actually inside
+ * the triangle's bounds.
+ * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
+ */
+static INLINE mask_t calculate_mask( int x )
+{
+   /* This is a little tricky.
+    * Use & instead of && to avoid branches.
+    * Use negation to convert true/false to ~0/0 values.
+    */
+   mask_t mask;
+   mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
+   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
+   mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
+   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
+   return mask;
+}
+
+
+/**
+ * Render a horizontal span of quads
+ */
+static void flush_spans( void )
+{
+   int minleft, maxright;
+   int x;
+
+   switch (setup.span.y_flags) {
+   case 0x3:
+      /* both odd and even lines written (both quad rows) */
+      minleft = MIN2(setup.span.left[0], setup.span.left[1]);
+      maxright = MAX2(setup.span.right[0], setup.span.right[1]);
+      break;
+
+   case 0x1:
+      /* only even line written (quad top row) */
+      minleft = setup.span.left[0];
+      maxright = setup.span.right[0];
+      break;
+
+   case 0x2:
+      /* only odd line written (quad bottom row) */
+      minleft = setup.span.left[1];
+      maxright = setup.span.right[1];
+      break;
+
+   default:
+      return;
+   }
+
+
+   /* OK, we're very likely to need the tile data now.
+    * clear or finish waiting if needed.
+    */
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+      clear_c_tile(&spu.ctile);
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
+
+   if (spu.depth_stencil.depth.enabled) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
+      }
+      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+         clear_z_tile(&spu.ztile);
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
+   }
+
+   /* XXX this loop could be moved into the above switch cases and
+    * calculate_mask() could be simplified a bit...
+    */
+   for (x = block(minleft); x <= block(maxright); x += 2) {
+#if 1
+      emit_quad( x, setup.span.y, calculate_mask( x ) );
+#endif
+   }
+
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
+}
+
+#if DEBUG_VERTS
+static void print_vertex(const struct vertex_header *v)
+{
+   int i;
+   fprintf(stderr, "Vertex: (%p)\n", v);
+   for (i = 0; i < setup.quad.nr_attrs; i++) {
+      fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
+              v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
+   }
+}
+#endif
+
+
+static boolean setup_sort_vertices(const struct vertex_header *v0,
+                                   const struct vertex_header *v1,
+                                   const struct vertex_header *v2)
+{
+
+#if DEBUG_VERTS
+   fprintf(stderr, "Triangle:\n");
+   print_vertex(v0);
+   print_vertex(v1);
+   print_vertex(v2);
+#endif
+
+   setup.vprovoke = v2;
+
+   /* determine bottom to top order of vertices */
+   {
+      float y0 = spu_extract(v0->data[0], 1);
+      float y1 = spu_extract(v1->data[0], 1);
+      float y2 = spu_extract(v2->data[0], 1);
+      if (y0 <= y1) {
+	 if (y1 <= y2) {
+	    /* y0<=y1<=y2 */
+	    setup.vmin = v0;   
+	    setup.vmid = v1;   
+	    setup.vmax = v2;
+	 }
+	 else if (y2 <= y0) {
+	    /* y2<=y0<=y1 */
+	    setup.vmin = v2;   
+	    setup.vmid = v0;   
+	    setup.vmax = v1;   
+	 }
+	 else {
+	    /* y0<=y2<=y1 */
+	    setup.vmin = v0;   
+	    setup.vmid = v2;   
+	    setup.vmax = v1;  
+	 }
+      }
+      else {
+	 if (y0 <= y2) {
+	    /* y1<=y0<=y2 */
+	    setup.vmin = v1;   
+	    setup.vmid = v0;   
+	    setup.vmax = v2;  
+	 }
+	 else if (y2 <= y1) {
+	    /* y2<=y1<=y0 */
+	    setup.vmin = v2;   
+	    setup.vmid = v1;   
+	    setup.vmax = v0;  
+	 }
+	 else {
+	    /* y1<=y2<=y0 */
+	    setup.vmin = v1;   
+	    setup.vmid = v2;   
+	    setup.vmax = v0;
+	 }
+      }
+   }
+
+   /* Check if triangle is completely outside the tile bounds */
+   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
+      return FALSE;
+   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
+      return FALSE;
+   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
+      return FALSE;
+   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
+      return FALSE;
+
+   setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
+   setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
+
+   /*
+    * Compute triangle's area.  Use 1/area to compute partial
+    * derivatives of attributes later.
+    *
+    * The area will be the same as prim->det, but the sign may be
+    * different depending on how the vertices get sorted above.
+    *
+    * To determine whether the primitive is front or back facing we
+    * use the prim->det value because its sign is correct.
+    */
+   {
+      const float area = (setup.emaj.dx * setup.ebot.dy - 
+			    setup.ebot.dx * setup.emaj.dy);
+
+      setup.oneoverarea = 1.0f / area;
+      /*
+      _mesa_printf("%s one-over-area %f  area %f  det %f\n",
+                   __FUNCTION__, setup.oneoverarea, area, prim->det );
+      */
+   }
+
+#if 0
+   /* We need to know if this is a front or back-facing triangle for:
+    *  - the GLSL gl_FrontFacing fragment attribute (bool)
+    *  - two-sided stencil test
+    */
+   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+#endif
+
+   return TRUE;
+}
+
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ * The value value comes from vertex->data[slot].
+ * The result will be put into setup.coef[slot].a0.
+ * \param slot  which attribute slot 
+ */
+static INLINE void
+const_coeff(uint slot)
+{
+   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
+}
+
+
+/**
+ * Compute a0, dadx and dady for a linearly interpolated coefficient,
+ * for a triangle.
+ */
+static INLINE void
+tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
+{
+   uint i;
+   const float *vmin_d = (float *) &setup.vmin->data[slot];
+   const float *vmid_d = (float *) &setup.vmid->data[slot];
+   const float *vmax_d = (float *) &setup.vmax->data[slot];
+   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
+   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+
+   for (i = firstComp; i < lastComp; i++) {
+      float botda = vmid_d[i] - vmin_d[i];
+      float majda = vmax_d[i] - vmin_d[i];
+      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
+   
+      ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
+
+      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
+
+      /* calculate a0 as the value which would be sampled for the
+       * fragment at (0,0), taking into account that we want to sample at
+       * pixel centers, in other words (0.5, 0.5).
+       *
+       * this is neat but unfortunately not a good way to do things for
+       * triangles with very large values of dadx or dady as it will
+       * result in the subtraction and re-addition from a0 of a very
+       * large number, which means we'll end up loosing a lot of the
+       * fractional bits and precision from a0.  the way to fix this is
+       * to define a0 as the sample at a pixel center somewhere near vmin
+       * instead - i'll switch to this later.
+       */
+      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
+                                 (setup.coef[slot].dadx.f[i] * x + 
+                                  setup.coef[slot].dady.f[i] * y));
+   }
+
+   /*
+   _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
+		slot, "xyzw"[i], 
+		setup.coef[slot].a0[i],
+		setup.coef[slot].dadx.f[i],
+		setup.coef[slot].dady.f[i]);
+   */
+}
+
+
+/**
+ * As above, but interp setup all four vector components.
+ */
+static INLINE void
+tri_linear_coeff4(uint slot)
+{
+   const vector float vmin_d = setup.vmin->data[slot];
+   const vector float vmid_d = setup.vmid->data[slot];
+   const vector float vmax_d = setup.vmax->data[slot];
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+                         
+   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+}
+
+
+
+#if 0
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void tri_persp_coeff( unsigned slot,
+                             unsigned i )
+{
+   /* premultiply by 1/w:
+    */
+   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
+   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
+   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
+
+   float botda = mida - mina;
+   float majda = maxa - mina;
+   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
+      
+   /*
+   printf("tri persp %d,%d: %f %f %f\n", slot, i,
+          setup.vmin->data[slot][i],
+          setup.vmid->data[slot][i],
+          setup.vmax->data[slot][i]
+          );
+   */
+
+   assert(slot < PIPE_MAX_SHADER_INPUTS);
+   assert(i <= 3);
+
+   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
+   setup.coef[slot].a0.f[i] = (mina - 
+			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
+			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+}
+#endif
+
+
+/**
+ * Compute the setup.coef[] array dadx, dady, a0 values.
+ * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
+ */
+static void setup_tri_coefficients(void)
+{
+#if 1
+   uint i;
+
+   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+      switch (spu.vertex_info.interp_mode[i]) {
+      case INTERP_NONE:
+         break;
+      case INTERP_POS:
+         /*tri_linear_coeff(i, 2, 3);*/
+         /* XXX interp W if PERSPECTIVE... */
+         tri_linear_coeff4(i);
+         break;
+      case INTERP_CONSTANT:
+         const_coeff(i);
+         break;
+      case INTERP_LINEAR:
+         tri_linear_coeff4(i);
+         break;
+      case INTERP_PERSPECTIVE:
+         tri_linear_coeff4(i);  /* temporary */
+         break;
+      default:
+         ASSERT(0);
+      }
+   }
+#else
+   ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
+   ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
+          spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
+   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
+   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
+#endif
+}
+
+
+static void setup_tri_edges(void)
+{
+   float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
+   float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
+
+   float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+   float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
+   float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
+
+   setup.emaj.sy = CEILF(vmin_y);
+   setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
+   setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
+   setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
+
+   setup.etop.sy = CEILF(vmid_y);
+   setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
+   setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
+   setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
+
+   setup.ebot.sy = CEILF(vmin_y);
+   setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
+   setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
+   setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
+}
+
+
+/**
+ * Render the upper or lower half of a triangle.
+ * Scissoring/cliprect is applied here too.
+ */
+static void subtriangle( struct edge *eleft,
+			 struct edge *eright,
+			 unsigned lines )
+{
+   const int minx = setup.cliprect_minx;
+   const int maxx = setup.cliprect_maxx;
+   const int miny = setup.cliprect_miny;
+   const int maxy = setup.cliprect_maxy;
+   int y, start_y, finish_y;
+   int sy = (int)eleft->sy;
+
+   ASSERT((int)eleft->sy == (int) eright->sy);
+
+   /* clip top/bottom */
+   start_y = sy;
+   finish_y = sy + lines;
+
+   if (start_y < miny)
+      start_y = miny;
+
+   if (finish_y > maxy)
+      finish_y = maxy;
+
+   start_y -= sy;
+   finish_y -= sy;
+
+   /*
+   _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
+   */
+
+   for (y = start_y; y < finish_y; y++) {
+
+      /* avoid accumulating adds as floats don't have the precision to
+       * accurately iterate large triangle edges that way.  luckily we
+       * can just multiply these days.
+       *
+       * this is all drowned out by the attribute interpolation anyway.
+       */
+      int left = (int)(eleft->sx + y * eleft->dxdy);
+      int right = (int)(eright->sx + y * eright->dxdy);
+
+      /* clip left/right */
+      if (left < minx)
+         left = minx;
+      if (right > maxx)
+         right = maxx;
+
+      if (left < right) {
+         int _y = sy + y;
+         if (block(_y) != setup.span.y) {
+            flush_spans();
+            setup.span.y = block(_y);
+         }
+
+         setup.span.left[_y&1] = left;
+         setup.span.right[_y&1] = right;
+         setup.span.y_flags |= 1<<(_y&1);
+      }
+   }
+
+
+   /* save the values so that emaj can be restarted:
+    */
+   eleft->sx += lines * eleft->dxdy;
+   eright->sx += lines * eright->dxdy;
+   eleft->sy += lines;
+   eright->sy += lines;
+}
+
+
+/**
+ * Draw triangle into tile at (tx, ty) (tile coords)
+ * The tile data should have already been fetched.
+ */
+boolean
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+{
+   setup.tx = tx;
+   setup.ty = ty;
+
+   /* set clipping bounds to tile bounds */
+   setup.cliprect_minx = tx * TILE_SIZE;
+   setup.cliprect_miny = ty * TILE_SIZE;
+   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
+   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
+
+   if (!setup_sort_vertices((struct vertex_header *) v0,
+                            (struct vertex_header *) v1,
+                            (struct vertex_header *) v2)) {
+      return FALSE; /* totally clipped */
+   }
+
+   setup_tri_coefficients();
+   setup_tri_edges();
+
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
+   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
+
+   /*   init_constant_attribs( setup ); */
+      
+   if (setup.oneoverarea < 0.0) {
+      /* emaj on left:
+       */
+      subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
+      subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
+   }
+   else {
+      /* emaj on right:
+       */
+      subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
+      subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
+   }
+
+   flush_spans();
+
+   return TRUE;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
new file mode 100644
index 0000000000..aa694dd7c9
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_TRI_H
+#define SPU_TRI_H
+
+
+extern boolean
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+
+
+#endif /* SPU_TRI_H */
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
new file mode 100644
index 0000000000..ac373240c1
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -0,0 +1,165 @@
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+//#include "tgsi_build.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->SwizzleX;
+   case 1:
+      return reg->SwizzleY;
+   case 2:
+      return reg->SwizzleZ;
+   case 3:
+      return reg->SwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->ExtSwizzleX;
+   case 1:
+      return reg->ExtSwizzleY;
+   case 2:
+      return reg->ExtSwizzleZ;
+   case 3:
+      return reg->ExtSwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register  *reg,
+   unsigned component )
+{
+   unsigned swizzle;
+
+   /*
+    * First, calculate  the   extended swizzle for a given channel. This will give
+    * us either a channel index into the simple swizzle or  a constant 1 or   0.
+    */
+   swizzle = tgsi_util_get_src_register_extswizzle(
+      &reg->SrcRegisterExtSwz,
+      component );
+
+   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+
+   /*
+    * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
+    * Leave the constants intact, they are   not   affected by the   simple swizzle.
+    */
+   if( swizzle <= TGSI_SWIZZLE_W ) {
+      swizzle = tgsi_util_get_src_register_swizzle(
+         &reg->SrcRegister,
+         component );
+   }
+
+   return swizzle;
+}
+
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const  struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->NegateX;
+   case 1:
+      return reg->NegateY;
+   case 2:
+      return reg->NegateZ;
+   case 3:
+      return reg->NegateW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->NegateX = negate;
+      break;
+   case 1:
+      reg->NegateY = negate;
+      break;
+   case 2:
+      reg->NegateZ = negate;
+      break;
+   case 3:
+      reg->NegateW = negate;
+      break;
+   default:
+      assert( 0 );
+   }
+}
+
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct  tgsi_full_src_register *reg,
+   unsigned component )
+{
+   unsigned sign_mode;
+
+   if( reg->SrcRegisterExtMod.Absolute ) {
+      /* Consider only the post-abs negation. */
+
+      if( reg->SrcRegisterExtMod.Negate ) {
+         sign_mode = TGSI_UTIL_SIGN_SET;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_CLEAR;
+      }
+   }
+   else {
+      /* Accumulate the three negations. */
+
+      unsigned negate;
+
+      negate = reg->SrcRegister.Negate;
+      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
+         negate = !negate;
+      }
+      if( reg->SrcRegisterExtMod.Negate ) {
+         negate = !negate;
+      }
+
+      if( negate ) {
+         sign_mode = TGSI_UTIL_SIGN_TOGGLE;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_KEEP;
+      }
+   }
+
+   return sign_mode;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
new file mode 100644
index 0000000000..45e3c26c00
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -0,0 +1,673 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_exec.h"
+#include "spu_vertex_shader.h"
+#include "spu_main.h"
+
+#define CACHE_NAME            attribute
+#define CACHED_TYPE           qword
+#define CACHE_TYPE            CACHE_TYPE_RO
+#define CACHE_SET_TAGID(set)  TAG_VERTEX_BUFFER
+#define CACHE_LOG2NNWAY       2
+#define CACHE_LOG2NSETS       6
+#include <cache-api.h>
+
+/* Yes folks, this is ugly.
+ */
+#undef CACHE_NWAY
+#undef CACHE_NSETS
+#define CACHE_NAME            attribute
+#define CACHE_NWAY            4
+#define CACHE_NSETS           (1U << 6)
+
+
+#define DRAW_DBG 0
+
+static const qword fetch_shuffle_data[] = {
+   /* Shuffle used by CVT_64_FLOAT
+    */
+   {
+      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+   },
+
+   /* Shuffle used by CVT_8_USCALED and CVT_8_SSCALED
+    */
+   {
+      0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80,
+      0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80,
+   },
+   
+   /* Shuffle used by CVT_16_USCALED and CVT_16_SSCALED
+    */
+   {
+      0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80,
+      0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80,
+   },
+   
+   /* High value shuffle used by trans4x4.
+    */
+   {
+      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17
+   },
+
+   /* Low value shuffle used by trans4x4.
+    */
+   {
+      0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+      0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F
+   }
+};
+
+
+static INLINE void
+trans4x4(qword row0, qword row1, qword row2, qword row3, qword *out,
+         const qword *shuffle)
+{
+   qword t1 = si_shufb(row0, row2, shuffle[3]);
+   qword t2 = si_shufb(row0, row2, shuffle[4]);
+   qword t3 = si_shufb(row1, row3, shuffle[3]);
+   qword t4 = si_shufb(row1, row3, shuffle[4]);
+
+   out[0] = si_shufb(t1, t3, shuffle[3]);
+   out[1] = si_shufb(t1, t3, shuffle[4]);
+   out[2] = si_shufb(t2, t4, shuffle[3]);
+   out[3] = si_shufb(t2, t4, shuffle[4]);
+}
+
+
+/**
+ * Fetch between 1 and 32 bytes from an unaligned address
+ */
+static INLINE void
+fetch_unaligned(qword *dst, unsigned ea, unsigned size)
+{
+   qword tmp[4];
+   const int shift = ea & 0x0f;
+   const unsigned aligned_start_ea = ea & ~0x0f;
+   const unsigned aligned_end_ea = (ea + size) & ~0x0f;
+   const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1;
+   unsigned i;
+
+
+   if (shift == 0) {
+      /* Data is already aligned.  Fetch directly into the destination buffer.
+       */
+      for (i = 0; i < num_entries; i++) {
+	 dst[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
+      }
+   } else {
+      /* Fetch data from the cache to the local buffer.
+       */
+      for (i = 0; i < num_entries; i++) {
+	 tmp[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
+      }
+
+
+      /* Fix the alignment of the data and write to the destination buffer.
+       */
+      for (i = 0; i < ((size + 15) / 16); i++) {
+	 dst[i] = si_or((qword) spu_slqwbyte(tmp[i], shift),
+			(qword) spu_rlmaskqwbyte(tmp[i + 1], shift - 16));
+      }
+   }
+}
+
+
+#define CVT_32_FLOAT(q, s)    (*(q))
+
+static INLINE qword
+CVT_64_FLOAT(const qword *qw, const qword *shuffle)
+{
+   qword a = si_frds(qw[0]);
+   qword b = si_frds(si_rotqbyi(qw[0], 8));
+   qword c = si_frds(qw[1]);
+   qword d = si_frds(si_rotqbyi(qw[1], 8));
+
+   qword ab = si_shufb(a, b, shuffle[0]);
+   qword cd = si_shufb(c, d, si_rotqbyi(shuffle[0], 8));
+   
+   return si_or(ab, cd);
+}
+
+
+static INLINE qword
+CVT_8_USCALED(const qword *qw, const qword *shuffle)
+{
+   return si_cuflt(si_shufb(*qw, *qw, shuffle[1]), 0);
+}
+
+
+static INLINE qword
+CVT_16_USCALED(const qword *qw, const qword *shuffle)
+{
+   return si_cuflt(si_shufb(*qw, *qw, shuffle[2]), 0);
+}
+
+
+static INLINE qword
+CVT_32_USCALED(const qword *qw, const qword *shuffle)
+{
+   (void) shuffle;
+   return si_cuflt(*qw, 0);
+}
+
+static INLINE qword
+CVT_8_SSCALED(const qword *qw, const qword *shuffle)
+{
+   return si_csflt(si_shufb(*qw, *qw, shuffle[1]), 0);
+}
+
+
+static INLINE qword
+CVT_16_SSCALED(const qword *qw, const qword *shuffle)
+{
+   return si_csflt(si_shufb(*qw, *qw, shuffle[2]), 0);
+}
+
+
+static INLINE qword
+CVT_32_SSCALED(const qword *qw, const qword *shuffle)
+{
+   (void) shuffle;
+   return si_csflt(*qw, 0);
+}
+
+
+static INLINE qword
+CVT_8_UNORM(const qword *qw, const qword *shuffle)
+{
+   const qword scale = (qword) spu_splats(1.0f / 255.0f);
+   return si_fm(CVT_8_USCALED(qw, shuffle), scale);
+}
+
+
+static INLINE qword
+CVT_16_UNORM(const qword *qw, const qword *shuffle)
+{
+   const qword scale = (qword) spu_splats(1.0f / 65535.0f);
+   return si_fm(CVT_16_USCALED(qw, shuffle), scale);
+}
+
+
+static INLINE qword
+CVT_32_UNORM(const qword *qw, const qword *shuffle)
+{
+   const qword scale = (qword) spu_splats(1.0f / 4294967295.0f);
+   return si_fm(CVT_32_USCALED(qw, shuffle), scale);
+}
+
+
+static INLINE qword
+CVT_8_SNORM(const qword *qw, const qword *shuffle)
+{
+   const qword scale = (qword) spu_splats(1.0f / 127.0f);
+   return si_fm(CVT_8_SSCALED(qw, shuffle), scale);
+}
+
+
+static INLINE qword
+CVT_16_SNORM(const qword *qw, const qword *shuffle)
+{
+   const qword scale = (qword) spu_splats(1.0f / 32767.0f);
+   return si_fm(CVT_16_SSCALED(qw, shuffle), scale);
+}
+
+
+static INLINE qword
+CVT_32_SNORM(const qword *qw, const qword *shuffle)
+{
+   const qword scale = (qword) spu_splats(1.0f / 2147483647.0f);
+   return si_fm(CVT_32_SSCALED(qw, shuffle), scale);
+}
+
+#define SZ_4 si_il(0U)
+#define SZ_3 si_fsmbi(0x000f)
+#define SZ_2 si_fsmbi(0x00ff)
+#define SZ_1 si_fsmbi(0x0fff)
+
+/**
+ * Fetch a float[4] vertex attribute from memory, doing format/type
+ * conversion as needed.
+ *
+ * This is probably needed/dupliocated elsewhere, eg format
+ * conversion, texture sampling etc.
+ */
+#define FETCH_ATTRIB( NAME, SZ, CVT, N )			\
+static void							\
+fetch_##NAME(qword *out, const qword *in, qword defaults, \
+                const qword *shuffle)	\
+{								\
+   qword tmp[4];						\
+								\
+   tmp[0] = si_selb(CVT(in + (0 * N), shuffle), defaults, SZ);		\
+   tmp[1] = si_selb(CVT(in + (1 * N), shuffle), defaults, SZ);		\
+   tmp[2] = si_selb(CVT(in + (2 * N), shuffle), defaults, SZ);		\
+   tmp[3] = si_selb(CVT(in + (3 * N), shuffle), defaults, SZ);		\
+   trans4x4(tmp[0], tmp[1], tmp[2], tmp[3], out, shuffle);		\
+}
+
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   SZ_4, CVT_64_FLOAT, 2 )
+FETCH_ATTRIB( R64G64B64_FLOAT,      SZ_3, CVT_64_FLOAT, 2 )
+FETCH_ATTRIB( R64G64_FLOAT,         SZ_2, CVT_64_FLOAT, 2 )
+FETCH_ATTRIB( R64_FLOAT,            SZ_1, CVT_64_FLOAT, 2 )
+
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   SZ_4, CVT_32_FLOAT, 1 )
+FETCH_ATTRIB( R32G32B32_FLOAT,      SZ_3, CVT_32_FLOAT, 1 )
+FETCH_ATTRIB( R32G32_FLOAT,         SZ_2, CVT_32_FLOAT, 1 )
+FETCH_ATTRIB( R32_FLOAT,            SZ_1, CVT_32_FLOAT, 1 )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED, 1 )
+FETCH_ATTRIB( R32G32B32_USCALED,    SZ_3, CVT_32_USCALED, 1 )
+FETCH_ATTRIB( R32G32_USCALED,       SZ_2, CVT_32_USCALED, 1 )
+FETCH_ATTRIB( R32_USCALED,          SZ_1, CVT_32_USCALED, 1 )
+
+FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED, 1 )
+FETCH_ATTRIB( R32G32B32_SSCALED,    SZ_3, CVT_32_SSCALED, 1 )
+FETCH_ATTRIB( R32G32_SSCALED,       SZ_2, CVT_32_SSCALED, 1 )
+FETCH_ATTRIB( R32_SSCALED,          SZ_1, CVT_32_SSCALED, 1 )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM, 1 )
+FETCH_ATTRIB( R32G32B32_UNORM,    SZ_3, CVT_32_UNORM, 1 )
+FETCH_ATTRIB( R32G32_UNORM,       SZ_2, CVT_32_UNORM, 1 )
+FETCH_ATTRIB( R32_UNORM,          SZ_1, CVT_32_UNORM, 1 )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM, 1 )
+FETCH_ATTRIB( R32G32B32_SNORM,    SZ_3, CVT_32_SNORM, 1 )
+FETCH_ATTRIB( R32G32_SNORM,       SZ_2, CVT_32_SNORM, 1 )
+FETCH_ATTRIB( R32_SNORM,          SZ_1, CVT_32_SNORM, 1 )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED, 1 )
+FETCH_ATTRIB( R16G16B16_USCALED,    SZ_3, CVT_16_USCALED, 1 )
+FETCH_ATTRIB( R16G16_USCALED,       SZ_2, CVT_16_USCALED, 1 )
+FETCH_ATTRIB( R16_USCALED,          SZ_1, CVT_16_USCALED, 1 )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED, 1 )
+FETCH_ATTRIB( R16G16B16_SSCALED,    SZ_3, CVT_16_SSCALED, 1 )
+FETCH_ATTRIB( R16G16_SSCALED,       SZ_2, CVT_16_SSCALED, 1 )
+FETCH_ATTRIB( R16_SSCALED,          SZ_1, CVT_16_SSCALED, 1 )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM, 1 )
+FETCH_ATTRIB( R16G16B16_UNORM,    SZ_3, CVT_16_UNORM, 1 )
+FETCH_ATTRIB( R16G16_UNORM,       SZ_2, CVT_16_UNORM, 1 )
+FETCH_ATTRIB( R16_UNORM,          SZ_1, CVT_16_UNORM, 1 )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM, 1 )
+FETCH_ATTRIB( R16G16B16_SNORM,    SZ_3, CVT_16_SNORM, 1 )
+FETCH_ATTRIB( R16G16_SNORM,       SZ_2, CVT_16_SNORM, 1 )
+FETCH_ATTRIB( R16_SNORM,          SZ_1, CVT_16_SNORM, 1 )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   SZ_4, CVT_8_USCALED, 1 )
+FETCH_ATTRIB( R8G8B8_USCALED,     SZ_3, CVT_8_USCALED, 1 )
+FETCH_ATTRIB( R8G8_USCALED,       SZ_2, CVT_8_USCALED, 1 )
+FETCH_ATTRIB( R8_USCALED,         SZ_1, CVT_8_USCALED, 1 )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  SZ_4, CVT_8_SSCALED, 1 )
+FETCH_ATTRIB( R8G8B8_SSCALED,    SZ_3, CVT_8_SSCALED, 1 )
+FETCH_ATTRIB( R8G8_SSCALED,      SZ_2, CVT_8_SSCALED, 1 )
+FETCH_ATTRIB( R8_SSCALED,        SZ_1, CVT_8_SSCALED, 1 )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  SZ_4, CVT_8_UNORM, 1 )
+FETCH_ATTRIB( R8G8B8_UNORM,    SZ_3, CVT_8_UNORM, 1 )
+FETCH_ATTRIB( R8G8_UNORM,      SZ_2, CVT_8_UNORM, 1 )
+FETCH_ATTRIB( R8_UNORM,        SZ_1, CVT_8_UNORM, 1 )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  SZ_4, CVT_8_SNORM, 1 )
+FETCH_ATTRIB( R8G8B8_SNORM,    SZ_3, CVT_8_SNORM, 1 )
+FETCH_ATTRIB( R8G8_SNORM,      SZ_2, CVT_8_SNORM, 1 )
+FETCH_ATTRIB( R8_SNORM,        SZ_1, CVT_8_SNORM, 1 )
+
+FETCH_ATTRIB( A8R8G8B8_UNORM,       SZ_4, CVT_8_UNORM, 1 )
+
+
+
+static spu_fetch_func get_fetch_func( enum pipe_format format )
+{
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return fetch_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return fetch_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
+   case 0:
+      return NULL;		/* not sure why this is needed */
+
+   default:
+      assert(0);
+      return NULL;
+   }
+}
+
+
+static unsigned get_vertex_size( enum pipe_format format )
+{
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return 8;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return 2 * 8;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return 3 * 8;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return 4 * 8;
+
+   case PIPE_FORMAT_R32_SSCALED:
+   case PIPE_FORMAT_R32_SNORM:
+   case PIPE_FORMAT_R32_USCALED:
+   case PIPE_FORMAT_R32_UNORM:
+   case PIPE_FORMAT_R32_FLOAT:
+      return 4;
+   case PIPE_FORMAT_R32G32_SSCALED:
+   case PIPE_FORMAT_R32G32_SNORM:
+   case PIPE_FORMAT_R32G32_USCALED:
+   case PIPE_FORMAT_R32G32_UNORM:
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return 2 * 4;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+   case PIPE_FORMAT_R32G32B32_SNORM:
+   case PIPE_FORMAT_R32G32B32_USCALED:
+   case PIPE_FORMAT_R32G32B32_UNORM:
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return 3 * 4;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return 4 * 4;
+
+   case PIPE_FORMAT_R16_SSCALED:
+   case PIPE_FORMAT_R16_SNORM:
+   case PIPE_FORMAT_R16_UNORM:
+   case PIPE_FORMAT_R16_USCALED:
+      return 2;
+   case PIPE_FORMAT_R16G16_SSCALED:
+   case PIPE_FORMAT_R16G16_SNORM:
+   case PIPE_FORMAT_R16G16_USCALED:
+   case PIPE_FORMAT_R16G16_UNORM:
+      return 2 * 2;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+   case PIPE_FORMAT_R16G16B16_SNORM:
+   case PIPE_FORMAT_R16G16B16_USCALED:
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return 3 * 2;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return 4 * 2;
+
+   case PIPE_FORMAT_R8_SSCALED:
+   case PIPE_FORMAT_R8_SNORM:
+   case PIPE_FORMAT_R8_USCALED:
+   case PIPE_FORMAT_R8_UNORM:
+      return 1;
+   case PIPE_FORMAT_R8G8_SSCALED:
+   case PIPE_FORMAT_R8G8_SNORM:
+   case PIPE_FORMAT_R8G8_USCALED:
+   case PIPE_FORMAT_R8G8_UNORM:
+      return 2 * 1;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+   case PIPE_FORMAT_R8G8B8_SNORM:
+   case PIPE_FORMAT_R8G8B8_USCALED:
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return 3 * 1;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return 4 * 1;
+
+   case 0:
+      return 0;		/* not sure why this is needed */
+
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+
+/**
+ * Fetch vertex attributes for 'count' vertices.
+ */
+static void generic_vertex_fetch(struct spu_vs_context *draw,
+                                 struct spu_exec_machine *machine,
+                                 const unsigned *elts,
+                                 unsigned count)
+{
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   assert(count <= 4);
+
+#if DRAW_DBG
+   printf("SPU: %s count = %u, nr_attrs = %u\n", 
+          __FUNCTION__, count, nr_attrs);
+#endif
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+      const qword default_values = (qword)(vec_float4){ 0.0, 0.0, 0.0, 1.0 };
+      const unsigned pitch = draw->vertex_fetch.pitch[attr];
+      const uint64_t src = draw->vertex_fetch.src_ptr[attr];
+      const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      unsigned i;
+      unsigned idx;
+      const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
+      const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
+      qword in[2 * 4];
+
+
+      /* Fetch four attributes for four vertices.  
+       */
+      idx = 0;
+      for (i = 0; i < count; i++) {
+         const uint64_t addr = src + (elts[i] * pitch);
+
+#if DRAW_DBG
+         printf("SPU: fetching = 0x%llx\n", addr);
+#endif
+
+         fetch_unaligned(& in[idx], addr, bytes_per_entry);
+         idx += quads_per_entry;
+      }
+
+      /* Be nice and zero out any missing vertices.
+       */
+      (void) memset(& in[idx], 0, (8 - idx) * sizeof(qword));
+
+
+      /* Convert all 4 vertices to vectors of float.
+       */
+      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, default_values,
+               fetch_shuffle_data);
+   }
+}
+
+
+void spu_update_vertex_fetch( struct spu_vs_context *draw )
+{
+   unsigned i;
+
+   
+   /* Invalidate the vertex cache.
+    */
+   for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) {
+      CACHELINE_CLEARVALID(i);
+   }
+
+
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      draw->vertex_fetch.fetch[i] =
+          get_fetch_func(draw->vertex_fetch.format[i]);
+      draw->vertex_fetch.size[i] =
+          get_vertex_size(draw->vertex_fetch.format[i]);
+   }
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
new file mode 100644
index 0000000000..c1cbbb6d1e
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -0,0 +1,231 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  *   Ian Romanick <idr@us.ibm.com>
+  */
+
+#include <spu_mfcio.h>
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_vertex_shader.h"
+#include "spu_exec.h"
+#include "pipe/draw/draw_private.h"
+#include "pipe/draw/draw_context.h"
+#include "pipe/cell/common.h"
+#include "spu_main.h"
+
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   unsigned mask = 0;
+   unsigned i;
+
+   /* Do the hardwired planes first:
+    */
+   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
+   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
+   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
+   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
+   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
+   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
+
+   /* Followed by any remaining ones:
+    */
+   for (i = 6; i < nr; i++) {
+      if (dot4(clip, plane[i]) < 0) 
+         mask |= (1<<i);
+   }
+
+   return mask;
+}
+
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param vbuffer  the input vertex data
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+run_vertex_program(struct spu_vs_context *draw,
+                   unsigned elts[4], unsigned count,
+                   const uint64_t *vOut)
+{
+   struct spu_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   assert(count <= 4);
+
+   machine->Processor = TGSI_PROCESSOR_VERTEX;
+
+   ASSERT_ALIGN16(draw->constants);
+   machine->Consts = (float (*)[4]) draw->constants;
+
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+   spu_vertex_fetch( draw, machine, elts, count );
+
+   /* run shader */
+   spu_exec_machine_run( machine );
+
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+      unsigned char buffer[sizeof(struct vertex_header)
+          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+      struct vertex_header *const tmpOut =
+          (struct vertex_header *) buffer;
+      const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
+                                           + (sizeof(float) * 4 
+                                              * draw->num_vs_outputs));
+
+      mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+
+      /* Handle attr[0] (position) specially:
+       *
+       * XXX: Computing the clipmask should be done in the vertex
+       * program as a set of DP4 instructions appended to the
+       * user-provided code.
+       */
+      x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane,
+					   draw->nr_planes);
+      tmpOut->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      tmpOut->data[0][0] = x * scale[0] + trans[0];
+      tmpOut->data[0][1] = y * scale[1] + trans[1];
+      tmpOut->data[0][2] = z * scale[2] + trans[2];
+      tmpOut->data[0][3] = w;
+
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+      }
+
+      mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+   } /* loop over vertices */
+}
+
+
+static void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       void *uniforms,
+		       void *planes,
+		       unsigned nr_planes,
+		       unsigned num_outputs
+		       )
+{
+   draw->constants = (float (*)[4]) uniforms;
+
+   (void) memcpy(draw->plane, planes, sizeof(float) * 4 * nr_planes);
+   draw->nr_planes = nr_planes;
+   draw->num_vs_outputs = num_outputs;
+
+   /* specify the shader to interpret/execute */
+   spu_exec_machine_init(&draw->machine,
+			 PIPE_MAX_SAMPLERS,
+			 NULL /*samplers*/,
+			 PIPE_SHADER_VERTEX);
+}
+
+
+unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
+    ALIGN16_ATTRIB;
+
+void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+                          const struct cell_command_vs *vs)
+{
+   unsigned i;
+
+   const uint64_t immediate_addr = vs->shader.immediates;
+   const unsigned immediate_size = 
+       ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates)
+                 + (immediate_addr & 0x0f));
+
+   mfc_get(immediates, immediate_addr & ~0x0f, immediate_size,
+           TAG_VERTEX_BUFFER, 0, 0);
+
+   draw->machine.Instructions = (struct tgsi_full_instruction *)
+       vs->shader.instructions;
+   draw->machine.NumInstructions = vs->shader.num_instructions;
+
+   draw->machine.Declarations = (struct tgsi_full_declaration *)
+       vs->shader.declarations;
+   draw->machine.NumDeclarations = vs->shader.num_declarations;
+
+   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+   (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f],
+                 sizeof(float) * 4 * vs->shader.num_immediates);
+
+   spu_bind_vertex_shader(draw, vs->shader.uniforms,
+                          vs->plane, vs->nr_planes,
+                          vs->shader.num_outputs);
+
+   for (i = 0; i < vs->num_elts; i += 4) {
+      const unsigned batch_size = MIN2(vs->num_elts - i, 4);
+
+      run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]);
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
new file mode 100644
index 0000000000..b5bf31e67d
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
@@ -0,0 +1,63 @@
+#ifndef SPU_VERTEX_SHADER_H
+#define SPU_VERTEX_SHADER_H
+
+#include "pipe/p_format.h"
+#include "spu_exec.h"
+
+struct spu_vs_context;
+
+typedef void (*spu_fetch_func)(qword *out, const qword *in, qword defaults,
+			       const qword *shuffle_data);
+typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
+				     struct spu_exec_machine *machine,
+				     const unsigned *elts,
+				     unsigned count );
+
+struct spu_vs_context {
+   struct pipe_viewport_state viewport;
+
+   struct {
+      uint64_t src_ptr[PIPE_ATTRIB_MAX];
+      unsigned pitch[PIPE_ATTRIB_MAX];
+      unsigned size[PIPE_ATTRIB_MAX];
+      enum pipe_format format[PIPE_ATTRIB_MAX];
+      unsigned nr_attrs;
+      boolean dirty;
+
+      spu_fetch_func fetch[PIPE_ATTRIB_MAX];
+      spu_full_fetch_func fetch_func;
+   } vertex_fetch;
+   
+   /* Clip derived state:
+    */
+   float plane[12][4];
+   unsigned nr_planes;
+
+   struct spu_exec_machine machine;
+   const float (*constants)[4];
+
+   unsigned num_vs_outputs;
+};
+
+extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
+
+static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
+				    struct spu_exec_machine *machine,
+				    const unsigned *elts,
+				    unsigned count)
+{
+   if (draw->vertex_fetch.dirty) {
+      spu_update_vertex_fetch(draw);
+      draw->vertex_fetch.dirty = 0;
+   }
+   
+   (*draw->vertex_fetch.fetch_func)(draw, machine, elts, count);
+}
+
+struct cell_command_vs;
+
+extern void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+			  const struct cell_command_vs *vs);
+
+#endif /* SPU_VERTEX_SHADER_H */
diff --git a/src/gallium/drivers/cell/spu/spu_ztest.h b/src/gallium/drivers/cell/spu/spu_ztest.h
new file mode 100644
index 0000000000..ce8ad00339
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_ztest.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Zbuffer/depth test code.
+ */
+
+
+#ifndef SPU_ZTEST_H
+#define SPU_ZTEST_H
+
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+
+
+/**
+ * Perform Z testing for a 16-bit/value Z buffer.
+ *
+ * \param zvals  vector of four fragment zvalues as floats
+ * \param zbuf   ptr to vector of ushort[8] zbuffer values.  Note that this
+ *               contains the Z values for 2 quads, 8 pixels.
+ * \param x      x coordinate of quad (only lsbit is significant)
+ * \param inMask indicates which fragments in the quad are alive
+ * \return new mask indicating which fragments are alive after ztest
+ */
+static INLINE vector unsigned int
+spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
+                  uint x, vector unsigned int inMask)
+{
+#define ZERO 0x80
+   vector unsigned int zvals_ui4, zbuf_ui4, mask;
+
+   /* convert floats to uints in [0, 65535] */
+   zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
+   zvals_ui4 = spu_rlmask(zvals_ui4, -16);  /* right shift 16 */
+
+   /* XXX this conditional could be removed with a bit of work */
+   if (x & 1) {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather lower four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             ((vector unsigned char) {
+                                ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
+                                ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15}));
+      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     ((vector unsigned char) {
+                        16, 17, 18, 19, 20, 21, 22, 23,
+                        2, 3, 6, 7, 10, 11, 14, 15}));
+   }
+   else {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather upper four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             ((vector unsigned char) {
+                                ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7}));
+      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     ((vector unsigned char) {
+                        2, 3, 6, 7, 10, 11, 14, 15,
+                        24, 25, 26, 27, 28, 29, 30, 31}));
+   }
+   return mask;
+#undef ZERO
+}
+
+
+/**
+ * As above, but Zbuffer values as 32-bit uints
+ */
+static INLINE vector unsigned int
+spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
+                  vector unsigned int inMask)
+{
+   vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
+
+   /* convert floats to uints in [0, 0xffffffff] */
+   zvals_ui4 = spu_convtu(zvals, 32);
+   /* mask = (zbuf < zvals_ui4) ? ~0 : 0 */
+   mask = spu_cmpgt(zbuf, zvals_ui4);
+   /* mask &= inMask */
+   mask = spu_and(mask, inMask);
+   /* zbuf = mask ? zval : zbuf */
+   *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
+
+   return mask;
+}
+
+
+#endif /* SPU_ZTEST_H */
-- 
cgit v1.2.3


From 6acd63a4980951727939c0dd545a0324965b3834 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Fri, 15 Feb 2008 17:50:12 +0900
Subject: Code reorganization: update build.

Update the Makefiles and includes for the new paths.

Note that there hasn't been no separation of the Makefiles yet, and make is
jumping all over the place. That will be taken care shortly. But for now, make
should work. It was tested with linux and linux-dri. Linux-cell and linux-llvm
might require some minor tweaks.
---
 configs/beos                                       |   2 +-
 configs/darwin                                     |   2 +-
 configs/darwin-x86ppc                              |   2 +-
 configs/default                                    |   2 +-
 configs/freebsd-dri                                |   2 +-
 configs/linux-cell                                 |   2 +-
 configs/linux-directfb                             |   2 +-
 configs/linux-dri                                  |   6 +-
 configs/linux-dri-xcb                              |   4 +-
 configs/linux-fbdev                                |   2 +-
 configs/linux-osmesa                               |   2 +-
 configs/linux-osmesa16                             |   2 +-
 configs/linux-osmesa16-static                      |   2 +-
 configs/linux-osmesa32                             |   2 +-
 configs/linux-solo                                 |   2 +-
 src/gallium/Makefile                               |  12 +--
 src/gallium/Makefile.template                      |   7 +-
 src/gallium/aux/Makefile                           |  24 +++++
 src/gallium/aux/draw/draw_private.h                |   2 +-
 src/gallium/aux/draw/draw_vertex.c                 |   4 +-
 src/gallium/aux/draw/draw_vertex_shader.c          |   4 +-
 src/gallium/aux/llvm/Makefile                      |   4 +-
 src/gallium/aux/llvm/gallivm.cpp                   |   4 +-
 src/gallium/aux/llvm/gallivm_cpu.cpp               |   4 +-
 src/gallium/aux/llvm/tgsitollvm.cpp                |  10 +-
 src/gallium/aux/pipebuffer/Makefile                |   2 +-
 src/gallium/aux/tgsi/exec/tgsi_exec.c              |   4 +-
 src/gallium/aux/tgsi/exec/tgsi_sse2.c              |   4 +-
 src/gallium/aux/tgsi/util/tgsi_transform.h         |   4 +-
 src/gallium/drivers/Makefile                       |  24 +++++
 src/gallium/drivers/cell/ppu/Makefile              |   7 +-
 src/gallium/drivers/cell/ppu/cell_clear.c          |   2 +-
 src/gallium/drivers/cell/ppu/cell_context.c        |   6 +-
 src/gallium/drivers/cell/ppu/cell_context.h        |   6 +-
 src/gallium/drivers/cell/ppu/cell_draw_arrays.c    |   2 +-
 src/gallium/drivers/cell/ppu/cell_flush.c          |   2 +-
 src/gallium/drivers/cell/ppu/cell_render.c         |   2 +-
 src/gallium/drivers/cell/ppu/cell_spu.c            |   2 +-
 src/gallium/drivers/cell/ppu/cell_spu.h            |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_blend.c    |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_clip.c     |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_derived.c  |   4 +-
 src/gallium/drivers/cell/ppu/cell_state_fs.c       |   8 +-
 .../drivers/cell/ppu/cell_state_rasterizer.c       |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_sampler.c  |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_vertex.c   |   2 +-
 src/gallium/drivers/cell/ppu/cell_surface.c        |   2 +-
 src/gallium/drivers/cell/ppu/cell_vbuf.c           |   2 +-
 src/gallium/drivers/cell/ppu/cell_vertex_shader.c  |   6 +-
 src/gallium/drivers/cell/spu/Makefile              |   6 +-
 src/gallium/drivers/cell/spu/spu_exec.c            |   4 +-
 src/gallium/drivers/cell/spu/spu_exec.h            |   2 +-
 src/gallium/drivers/cell/spu/spu_main.c            |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h            |   4 +-
 src/gallium/drivers/cell/spu/spu_render.c          |   2 +-
 src/gallium/drivers/cell/spu/spu_render.h          |   2 +-
 src/gallium/drivers/cell/spu/spu_tile.h            |   2 +-
 src/gallium/drivers/cell/spu/spu_util.c            |   4 +-
 src/gallium/drivers/cell/spu/spu_vertex_shader.c   |   6 +-
 src/gallium/drivers/failover/Makefile              |   2 +-
 src/gallium/drivers/i915simple/Makefile            |   2 +-
 src/gallium/drivers/i915simple/i915_context.c      |   2 +-
 src/gallium/drivers/i915simple/i915_context.h      |   2 +-
 .../drivers/i915simple/i915_fpc_translate.c        |   4 +-
 src/gallium/drivers/i915simple/i915_prim_emit.c    |   2 +-
 src/gallium/drivers/i915simple/i915_prim_vbuf.c    |   2 +-
 src/gallium/drivers/i915simple/i915_state.c        |   2 +-
 .../drivers/i915simple/i915_state_derived.c        |   4 +-
 src/gallium/drivers/i915simple/i915_strings.c      |   2 +-
 src/gallium/drivers/i915simple/i915_surface.c      |   2 +-
 src/gallium/drivers/i965simple/Makefile            |   2 +-
 src/gallium/drivers/i965simple/brw_shader_info.c   |   2 +-
 src/gallium/drivers/i965simple/brw_state.c         |   2 +-
 src/gallium/drivers/i965simple/brw_strings.c       |   2 +-
 src/gallium/drivers/i965simple/brw_surface.c       |   2 +-
 src/gallium/drivers/i965simple/brw_vs_emit.c       |   2 +-
 src/gallium/drivers/i965simple/brw_wm_decl.c       |   2 +-
 src/gallium/drivers/i965simple/brw_wm_glsl.c       |   2 +-
 src/gallium/drivers/softpipe/Makefile              |   2 +-
 src/gallium/drivers/softpipe/sp_context.c          |   2 +-
 src/gallium/drivers/softpipe/sp_context.h          |   2 +-
 src/gallium/drivers/softpipe/sp_draw_arrays.c      |   2 +-
 src/gallium/drivers/softpipe/sp_flush.c            |   2 +-
 src/gallium/drivers/softpipe/sp_headers.h          |   2 +-
 src/gallium/drivers/softpipe/sp_prim_setup.c       |   4 +-
 src/gallium/drivers/softpipe/sp_prim_vbuf.c        |   6 +-
 src/gallium/drivers/softpipe/sp_quad_fs.c          |   2 +-
 src/gallium/drivers/softpipe/sp_query.c            |   2 +-
 src/gallium/drivers/softpipe/sp_state_clip.c       |   2 +-
 src/gallium/drivers/softpipe/sp_state_derived.c    |   6 +-
 src/gallium/drivers/softpipe/sp_state_fs.c         |   8 +-
 src/gallium/drivers/softpipe/sp_state_rasterizer.c |   2 +-
 src/gallium/drivers/softpipe/sp_state_sampler.c    |   4 +-
 src/gallium/drivers/softpipe/sp_state_vertex.c     |   2 +-
 src/gallium/drivers/softpipe/sp_surface.c          |   2 +-
 src/gallium/drivers/softpipe/sp_tex_sample.c       |   2 +-
 src/gallium/drivers/softpipe/sp_tile_cache.c       |   2 +-
 src/gallium/winsys/dri/Makefile                    |  38 +++++++
 src/gallium/winsys/dri/Makefile.template           | 113 +++++++++++++++++++++
 src/gallium/winsys/dri/intel/Makefile              |   8 +-
 src/gallium/winsys/dri/intel/intel_winsys_i915.c   |   2 +-
 .../winsys/dri/intel/intel_winsys_softpipe.c       |   2 +-
 src/gallium/winsys/xlib/xm_winsys.c                |   6 +-
 src/gallium/winsys/xlib/xm_winsys_aub.c            |   2 +-
 src/mesa/Makefile                                  |  16 ++-
 src/mesa/drivers/x11/xm_api.c                      |   2 +-
 src/mesa/drivers/x11/xm_dd.c                       |   2 +-
 src/mesa/drivers/x11/xm_surface.c                  |   8 +-
 src/mesa/drivers/x11/xm_winsys.c                   |   2 +-
 src/mesa/drivers/x11/xmesaP.h                      |   4 +-
 src/mesa/sources                                   |  83 +++++++--------
 src/mesa/state_tracker/st_atom_shader.c            |   2 +-
 src/mesa/state_tracker/st_cache.c                  |   4 +-
 src/mesa/state_tracker/st_cache.h                  |   2 +-
 src/mesa/state_tracker/st_cb_accum.c               |   2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c          |   2 +-
 src/mesa/state_tracker/st_cb_feedback.c            |   6 +-
 src/mesa/state_tracker/st_cb_program.c             |   4 +-
 src/mesa/state_tracker/st_cb_rasterpos.c           |   4 +-
 src/mesa/state_tracker/st_cb_readpixels.c          |   2 +-
 src/mesa/state_tracker/st_cb_texture.c             |   2 +-
 src/mesa/state_tracker/st_context.c                |   4 +-
 src/mesa/state_tracker/st_debug.c                  |   4 +-
 src/mesa/state_tracker/st_draw.c                   |   4 +-
 src/mesa/state_tracker/st_gen_mipmap.c             |   2 +-
 src/mesa/state_tracker/st_mesa_to_tgsi.c           |   6 +-
 src/mesa/state_tracker/st_program.c                |   4 +-
 127 files changed, 445 insertions(+), 241 deletions(-)
 create mode 100644 src/gallium/aux/Makefile
 create mode 100644 src/gallium/drivers/Makefile
 create mode 100644 src/gallium/winsys/dri/Makefile
 create mode 100644 src/gallium/winsys/dri/Makefile.template

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/configs/beos b/configs/beos
index f07973d0c7..2b74af739d 100644
--- a/configs/beos
+++ b/configs/beos
@@ -86,7 +86,7 @@ else
 endif
 
 # Directories
-SRC_DIRS = mesa glu glut/beos
+SRC_DIRS = gallium mesa glu glut/beos
 GLU_DIRS = sgi
 DRIVER_DIRS = beos
 PROGRAM_DIRS = beos samples redbook demos tests
diff --git a/configs/darwin b/configs/darwin
index 7826ecc605..bba7802696 100644
--- a/configs/darwin
+++ b/configs/darwin
@@ -25,5 +25,5 @@ GLW_LIB_DEPS = -L/usr/X11R6/lib -lX11 -lXt $(TOP)/lib/GL.dylib
 APP_LIB_DEPS = -L$(TOP)/lib -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -L/usr/X11R6/lib -lX11 -lXmu -lXt -lXi -lm
 
 # omit glw lib for now:
-SRC_DIRS = mesa glu glut/glx
+SRC_DIRS = gallium mesa glu glut/glx
 
diff --git a/configs/darwin-x86ppc b/configs/darwin-x86ppc
index 13172327a7..ebeb25051f 100644
--- a/configs/darwin-x86ppc
+++ b/configs/darwin-x86ppc
@@ -29,5 +29,5 @@ GLW_LIB_DEPS = -L/usr/X11R6/lib -lX11 -lXt $(TOP)/lib/GL.dylib
 APP_LIB_DEPS = -L$(TOP)/lib -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -L/usr/X11R6/lib -lX11 -lXmu -lXt -lXi -lm
 
 # omit glw lib for now:
-SRC_DIRS = mesa glu glut/glx
+SRC_DIRS = gallium mesa glu glut/glx
 
diff --git a/configs/default b/configs/default
index 166205a1d3..25a87e66a1 100644
--- a/configs/default
+++ b/configs/default
@@ -60,7 +60,7 @@ GLW_SOURCES = GLwDrawA.c
 
 # Directories to build
 LIB_DIR = lib
-SRC_DIRS = mesa glu glut/glx glw
+SRC_DIRS = gallium mesa glu glut/glx glw
 GLU_DIRS = sgi
 DRIVER_DIRS = x11 osmesa
 # Which subdirs under $(TOP)/progs/ to enter:
diff --git a/configs/freebsd-dri b/configs/freebsd-dri
index 402883d1de..67d253b869 100644
--- a/configs/freebsd-dri
+++ b/configs/freebsd-dri
@@ -36,7 +36,7 @@ GLW_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -L/usr/X11R6/lib -lGL -lXt -lX11
 
 
 # Directories
-SRC_DIRS = glx/x11 mesa glu glut/glx glw
+SRC_DIRS = glx/x11 gallium mesa glu glut/glx glw
 DRIVER_DIRS = dri
 PROGRAM_DIRS = 
 WINDOW_SYSTEM=dri
diff --git a/configs/linux-cell b/configs/linux-cell
index 3d874491e4..fdf20deeeb 100644
--- a/configs/linux-cell
+++ b/configs/linux-cell
@@ -21,7 +21,7 @@ CFLAGS = $(OPT_FLAGS) -Wall -Winline -fPIC -m32 -mabi=altivec -maltivec -I. -I$(
 CXXFLAGS = $(CFLAGS)
 
 # Omitting glw here:
-SRC_DIRS = mesa glu glut/glx
+SRC_DIRS = gallium mesa glu glut/glx
 
 
 MKDEP_OPTIONS = -fdepend -Y
diff --git a/configs/linux-directfb b/configs/linux-directfb
index 09332f4808..dff27f7850 100644
--- a/configs/linux-directfb
+++ b/configs/linux-directfb
@@ -22,7 +22,7 @@ ifeq ($(HAVE_X86), yes)
 endif
 
 # Directories
-SRC_DIRS     = mesa glu glut/directfb
+SRC_DIRS     = gallium mesa glu glut/directfb
 GLU_DIRS     = sgi
 DRIVER_DIRS  = directfb
 PROGRAM_DIRS = demos directfb
diff --git a/configs/linux-dri b/configs/linux-dri
index 936fce9982..e6135856fc 100644
--- a/configs/linux-dri
+++ b/configs/linux-dri
@@ -54,10 +54,10 @@ USING_EGL=0
 
 # Directories
 ifeq ($(USING_EGL), 1)
-SRC_DIRS = egl glx/x11 mesa glu glut/glx glw
+SRC_DIRS = egl glx/x11 gallium mesa glu glut/glx glw
 PROGRAM_DIRS = egl
 else
-SRC_DIRS = glx/x11 mesa glu glut/glx glw
+SRC_DIRS = glx/x11 gallium mesa glu glut/glx glw
 PROGRAM_DIRS =
 endif
 
@@ -66,4 +66,4 @@ WINDOW_SYSTEM=dri
 
 # gamma are missing because they have not been converted to use the new
 # interface.
-DRI_DIRS = intel_winsys 
+DRI_DIRS = intel 
diff --git a/configs/linux-dri-xcb b/configs/linux-dri-xcb
index aa292a13ec..ea4bdf1864 100644
--- a/configs/linux-dri-xcb
+++ b/configs/linux-dri-xcb
@@ -53,10 +53,10 @@ USING_EGL=0
 
 # Directories
 ifeq ($(USING_EGL), 1)
-SRC_DIRS = egl glx/x11 mesa glu glut/glx glw
+SRC_DIRS = egl glx/x11 gallium mesa glu glut/glx glw
 PROGRAM_DIRS = egl
 else
-SRC_DIRS = glx/x11 mesa glu glut/glx glw
+SRC_DIRS = glx/x11 gallium mesa glu glut/glx glw
 PROGRAM_DIRS =
 endif
 
diff --git a/configs/linux-fbdev b/configs/linux-fbdev
index e36d20a702..1ddccb3f52 100644
--- a/configs/linux-fbdev
+++ b/configs/linux-fbdev
@@ -6,7 +6,7 @@ CONFIG_NAME = linux-fbdev
 
 CFLAGS = -O3 -ffast-math -ansi -pedantic -fPIC -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DPTHREADS -DUSE_GLFBDEV_DRIVER
 
-SRC_DIRS = mesa glu glut/fbdev
+SRC_DIRS = gallium mesa glu glut/fbdev
 DRIVER_DIRS = fbdev osmesa
 PROGRAM_DIRS = fbdev demos redbook samples
 
diff --git a/configs/linux-osmesa b/configs/linux-osmesa
index cc1fbbd109..0382a19553 100644
--- a/configs/linux-osmesa
+++ b/configs/linux-osmesa
@@ -14,7 +14,7 @@ CXXFLAGS = -O3 -ansi -pedantic -fPIC -ffast-math -D_POSIX_SOURCE -D_POSIX_C_SOUR
 
 
 # Directories
-SRC_DIRS = mesa glu
+SRC_DIRS = gallium mesa glu
 DRIVER_DIRS = osmesa
 PROGRAM_DIRS = osdemos
 
diff --git a/configs/linux-osmesa16 b/configs/linux-osmesa16
index 1fb0186d31..9a527592f1 100644
--- a/configs/linux-osmesa16
+++ b/configs/linux-osmesa16
@@ -17,7 +17,7 @@ OSMESA_LIB_NAME = libOSMesa16.so
 
 
 # Directories
-SRC_DIRS = mesa glu
+SRC_DIRS = gallium mesa glu
 DRIVER_DIRS = osmesa
 PROGRAM_DIRS = 
 
diff --git a/configs/linux-osmesa16-static b/configs/linux-osmesa16-static
index 6645504478..1e6380b02e 100644
--- a/configs/linux-osmesa16-static
+++ b/configs/linux-osmesa16-static
@@ -18,7 +18,7 @@ OSMESA_LIB_NAME = libOSMesa16.a
 
 
 # Directories
-SRC_DIRS = mesa glu
+SRC_DIRS = gallium mesa glu
 DRIVER_DIRS = osmesa
 PROGRAM_DIRS = 
 
diff --git a/configs/linux-osmesa32 b/configs/linux-osmesa32
index a1e5a358d6..f0ef1831b0 100644
--- a/configs/linux-osmesa32
+++ b/configs/linux-osmesa32
@@ -17,7 +17,7 @@ OSMESA_LIB_NAME = libOSMesa32.so
 
 
 # Directories
-SRC_DIRS = mesa glu
+SRC_DIRS = gallium mesa glu
 DRIVER_DIRS = osmesa
 PROGRAM_DIRS = 
 
diff --git a/configs/linux-solo b/configs/linux-solo
index 220fe58b9a..d49b972228 100644
--- a/configs/linux-solo
+++ b/configs/linux-solo
@@ -43,7 +43,7 @@ GLUT_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GLU_LIB) -l$(GL_LIB) -lm
 APP_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -lm -lpthread
 
 # Directories
-SRC_DIRS = glx/mini mesa glu glut/mini
+SRC_DIRS = glx/mini gallium mesa glu glut/mini
 DRIVER_DIRS = dri
 PROGRAM_DIRS = miniglx
 
diff --git a/src/gallium/Makefile b/src/gallium/Makefile
index d880d090c1..a13b9a52d3 100644
--- a/src/gallium/Makefile
+++ b/src/gallium/Makefile
@@ -1,16 +1,8 @@
-TOP = ../../..
+TOP = ../..
 include $(TOP)/configs/current
 
 
-ifeq ($(CONFIG_NAME), linux-cell)
-CELL_DIR = cell
-endif
-
-ifeq ($(CONFIG_NAME), linux-llvm)
-LLVM_DIR = llvm
-endif
-
-SUBDIRS = softpipe i915simple i965simple failover pipebuffer $(CELL_DIR) $(LLVM_DIR)
+SUBDIRS = aux drivers
 
 
 default: subdirs
diff --git a/src/gallium/Makefile.template b/src/gallium/Makefile.template
index 8e84f8eb2d..0717ed8dd2 100644
--- a/src/gallium/Makefile.template
+++ b/src/gallium/Makefile.template
@@ -15,7 +15,10 @@ OBJECTS = $(C_SOURCES:.c=.o) \
 ### Include directories
 INCLUDES = \
 	-I. \
-	-I$(TOP)/src/mesa/pipe \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/include/pipe \
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/drivers \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/include \
         $(DRIVER_INCLUDES)
@@ -38,7 +41,7 @@ INCLUDES = \
 default: depend symlinks $(LIBNAME)
 
 
-$(LIBNAME): $(OBJECTS) Makefile $(TOP)/src/mesa/pipe/Makefile.template
+$(LIBNAME): $(OBJECTS) Makefile $(TOP)/src/gallium/Makefile.template
 	$(TOP)/bin/mklib -o $@ -static $(OBJECTS) $(DRIVER_LIBS)
 
 
diff --git a/src/gallium/aux/Makefile b/src/gallium/aux/Makefile
new file mode 100644
index 0000000000..da68498aa1
--- /dev/null
+++ b/src/gallium/aux/Makefile
@@ -0,0 +1,24 @@
+TOP = ../../..
+include $(TOP)/configs/current
+
+
+ifeq ($(CONFIG_NAME), linux-llvm)
+LLVM_DIR = llvm
+endif
+
+SUBDIRS = pipebuffer $(LLVM_DIR)
+
+
+default: subdirs
+
+
+subdirs:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	rm -f `find . -name \*.[oa]`
diff --git a/src/gallium/aux/draw/draw_private.h b/src/gallium/aux/draw/draw_private.h
index b17eaaed65..3d09aef87c 100644
--- a/src/gallium/aux/draw/draw_private.h
+++ b/src/gallium/aux/draw/draw_private.h
@@ -45,7 +45,7 @@
 #include "pipe/p_defines.h"
 
 #include "x86/rtasm/x86sse.h"
-#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "tgsi/exec/tgsi_exec.h"
 
 
 struct gallivm_prog;
diff --git a/src/gallium/aux/draw/draw_vertex.c b/src/gallium/aux/draw/draw_vertex.c
index 2d6592150f..daf1ef4b80 100644
--- a/src/gallium/aux/draw/draw_vertex.c
+++ b/src/gallium/aux/draw/draw_vertex.c
@@ -34,8 +34,8 @@
  */
 
 
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vertex.h"
 
 
 /**
diff --git a/src/gallium/aux/draw/draw_vertex_shader.c b/src/gallium/aux/draw/draw_vertex_shader.c
index c824c1407e..377ecbb931 100644
--- a/src/gallium/aux/draw/draw_vertex_shader.c
+++ b/src/gallium/aux/draw/draw_vertex_shader.c
@@ -34,13 +34,13 @@
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #if defined(__i386__) || defined(__386__)
-#include "pipe/tgsi/exec/tgsi_sse2.h"
+#include "tgsi/exec/tgsi_sse2.h"
 #endif
 #include "draw_private.h"
 #include "draw_context.h"
 
 #include "x86/rtasm/x86sse.h"
-#include "pipe/llvm/gallivm.h"
+#include "llvm/gallivm.h"
 
 
 #define DBG_VS 0
diff --git a/src/gallium/aux/llvm/Makefile b/src/gallium/aux/llvm/Makefile
index 9c6e16d86b..e6ac399d08 100644
--- a/src/gallium/aux/llvm/Makefile
+++ b/src/gallium/aux/llvm/Makefile
@@ -30,7 +30,9 @@ OBJECTS = $(C_SOURCES:.c=.o) \
 ### Include directories
 INCLUDES = \
 	-I. \
-	-I$(TOP)/src/mesa/pipe \
+	-I$(TOP)/src/gallium/drivers
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/include \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/include
 
diff --git a/src/gallium/aux/llvm/gallivm.cpp b/src/gallium/aux/llvm/gallivm.cpp
index da0105c2c9..d14bb3b99a 100644
--- a/src/gallium/aux/llvm/gallivm.cpp
+++ b/src/gallium/aux/llvm/gallivm.cpp
@@ -42,8 +42,8 @@
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "pipe/tgsi/exec/tgsi_exec.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_dump.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
diff --git a/src/gallium/aux/llvm/gallivm_cpu.cpp b/src/gallium/aux/llvm/gallivm_cpu.cpp
index dc4d92a72a..8f9830d0b1 100644
--- a/src/gallium/aux/llvm/gallivm_cpu.cpp
+++ b/src/gallium/aux/llvm/gallivm_cpu.cpp
@@ -42,8 +42,8 @@
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "pipe/tgsi/exec/tgsi_exec.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_dump.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
diff --git a/src/gallium/aux/llvm/tgsitollvm.cpp b/src/gallium/aux/llvm/tgsitollvm.cpp
index 0de595e678..2cb4acce32 100644
--- a/src/gallium/aux/llvm/tgsitollvm.cpp
+++ b/src/gallium/aux/llvm/tgsitollvm.cpp
@@ -10,11 +10,11 @@
 
 #include "pipe/p_shader_tokens.h"
 
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/exec/tgsi_exec.h"
-#include "pipe/tgsi/util/tgsi_util.h"
-#include "pipe/tgsi/util/tgsi_build.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_build.h"
+#include "tgsi/util/tgsi_dump.h"
 
 
 #include <llvm/Module.h>
diff --git a/src/gallium/aux/pipebuffer/Makefile b/src/gallium/aux/pipebuffer/Makefile
index 75764a9a18..588629e870 100644
--- a/src/gallium/aux/pipebuffer/Makefile
+++ b/src/gallium/aux/pipebuffer/Makefile
@@ -17,7 +17,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
 
diff --git a/src/gallium/aux/tgsi/exec/tgsi_exec.c b/src/gallium/aux/tgsi/exec/tgsi_exec.c
index 37e6007068..a8f64c2287 100644
--- a/src/gallium/aux/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/aux/tgsi/exec/tgsi_exec.c
@@ -54,8 +54,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
 #include "tgsi_exec.h"
 
 #define TILE_TOP_LEFT     0
diff --git a/src/gallium/aux/tgsi/exec/tgsi_sse2.c b/src/gallium/aux/tgsi/exec/tgsi_sse2.c
index 1e56e4afb6..593464db3e 100755
--- a/src/gallium/aux/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/aux/tgsi/exec/tgsi_sse2.c
@@ -27,8 +27,8 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
 #include "tgsi_exec.h"
 #include "tgsi_sse2.h"
 
diff --git a/src/gallium/aux/tgsi/util/tgsi_transform.h b/src/gallium/aux/tgsi/util/tgsi_transform.h
index 365d8c298c..fcf85d603b 100644
--- a/src/gallium/aux/tgsi/util/tgsi_transform.h
+++ b/src/gallium/aux/tgsi/util/tgsi_transform.h
@@ -31,8 +31,8 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_build.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_build.h"
 
 
diff --git a/src/gallium/drivers/Makefile b/src/gallium/drivers/Makefile
new file mode 100644
index 0000000000..c0345a9cb5
--- /dev/null
+++ b/src/gallium/drivers/Makefile
@@ -0,0 +1,24 @@
+TOP = ../../..
+include $(TOP)/configs/current
+
+
+ifeq ($(CONFIG_NAME), linux-cell)
+CELL_DIR = cell
+endif
+
+SUBDIRS = softpipe i915simple i965simple failover pipebuffer $(CELL_DIR)
+
+
+default: subdirs
+
+
+subdirs:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	rm -f `find . -name \*.[oa]`
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 50060f5cd3..011863c11e 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -40,8 +40,11 @@ SOURCES = \
 
 OBJECTS = $(SOURCES:.c=.o) \
 
-INCLUDE_DIRS = -I$(TOP)/src/mesa
-
+INCLUDE_DIRS = \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/drivers
 
 .c.o:
 	$(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index 07b908eec5..e588a30d5b 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -35,7 +35,7 @@
 #include <stdint.h>
 #include "pipe/p_inlines.h"
 #include "pipe/p_util.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_batch.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index bbe1fd7a11..e1eb22f468 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -37,9 +37,9 @@
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
-#include "pipe/cell/common.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "cell/common.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 3b63419b5e..6196c0c72f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -32,10 +32,10 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
-#include "pipe/draw/draw_vertex.h"
-#include "pipe/draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_vbuf.h"
 #include "cell_winsys.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 
 struct cell_vbuf_render;
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 717cd8370f..f12613649b 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -39,7 +39,7 @@
 #include "cell_draw_arrays.h"
 #include "cell_state.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index f62bc4650c..20f27531fc 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -31,7 +31,7 @@
 #include "cell_flush.h"
 #include "cell_spu.h"
 #include "cell_render.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index 4ab277a4b2..b663b37622 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -34,7 +34,7 @@
 #include "cell_render.h"
 #include "cell_spu.h"
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_private.h"
 
 
 struct render_stage {
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 7c83a47e57..419e74dc40 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -31,7 +31,7 @@
 #include "cell_spu.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 
 /*
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 19eff94f96..137f26612e 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -31,7 +31,7 @@
 
 #include <libspe2.h>
 #include <libmisc.h>
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 #include "cell_context.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_blend.c b/src/gallium/drivers/cell/ppu/cell_state_blend.c
index 4fc60548c8..b6d6d71f0c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_blend.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_blend.c
@@ -29,7 +29,7 @@
  */
 
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_clip.c b/src/gallium/drivers/cell/ppu/cell_state_clip.c
index 4f43665941..0482f87e88 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_clip.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_clip.c
@@ -30,7 +30,7 @@
 
 #include "cell_context.h"
 #include "cell_state.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void cell_set_clip_state( struct pipe_context *pipe,
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
index 56daf5dfde..0c46829258 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_derived.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -27,8 +27,8 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
 #include "cell_context.h"
 #include "cell_batch.h"
 #include "cell_state.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_fs.c b/src/gallium/drivers/cell/ppu/cell_state_fs.c
index 3f46a87d18..b2ed699a5b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_fs.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_fs.c
@@ -29,12 +29,12 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #if 0
 #include "pipe/p_shader_tokens.h"
-#include "pipe/llvm/gallivm.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
-#include "pipe/tgsi/exec/tgsi_sse2.h"
+#include "llvm/gallivm.h"
+#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/exec/tgsi_sse2.h"
 #endif
 
 #include "cell_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_rasterizer.c b/src/gallium/drivers/cell/ppu/cell_state_rasterizer.c
index d8128ece54..7eca5b5765 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_rasterizer.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_rasterizer.c
@@ -27,7 +27,7 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_sampler.c b/src/gallium/drivers/cell/ppu/cell_state_sampler.c
index ade6cc8338..a33421a4ad 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_sampler.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_sampler.c
@@ -30,7 +30,7 @@
  */
 
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 #include "cell_texture.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
index 0f01e920f9..563831b62d 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_vertex.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
@@ -32,7 +32,7 @@
 #include "cell_context.h"
 #include "cell_state.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
index fca93e4742..a35db0ef99 100644
--- a/src/gallium/drivers/cell/ppu/cell_surface.c
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -29,7 +29,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "cell_context.h"
 #include "cell_surface.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index e9fafe492e..cc727ff4ed 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -36,7 +36,7 @@
 #include "cell_flush.h"
 #include "cell_spu.h"
 #include "cell_vbuf.h"
-#include "pipe/draw/draw_vbuf.h"
+#include "draw/draw_vbuf.h"
 
 
 /** Allow vertex data to be inlined after RENDER command */
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
index 80dd500b34..0ba4506505 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -38,9 +38,9 @@
 #include "cell_spu.h"
 #include "cell_batch.h"
 
-#include "pipe/cell/common.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "cell/common.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
 
 /**
  * Run the vertex shader on all vertices in the vertex queue.
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index f202971d73..7aa947299e 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -31,7 +31,11 @@ SPU_OBJECTS = $(SOURCES:.c=.o) \
 
 SPU_ASM_OUT = $(SOURCES:.c=.s) \
 
-INCLUDE_DIRS = -I$(TOP)/src/mesa
+INCLUDE_DIRS = \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/drivers
 
 
 .c.o:
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index e51008b9b3..109540b1f7 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -67,8 +67,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
 #include "spu_exec.h"
 #include "spu_main.h"
 #include "spu_vertex_shader.h"
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
index b4c7661ef6..3e17c490d2 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.h
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -29,7 +29,7 @@
 #define SPU_EXEC_H
 
 #include "pipe/p_compiler.h"
-#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "tgsi/exec/tgsi_exec.h"
 
 #if defined __cplusplus
 extern "C" {
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index e375197fe6..1e7243b863 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -38,7 +38,7 @@
 #include "spu_tile.h"
 //#include "spu_test.h"
 #include "spu_vertex_shader.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 #include "pipe/p_defines.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 1710a17512..5c95d112ac 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -31,8 +31,8 @@
 
 #include <spu_mfcio.h>
 
-#include "pipe/cell/common.h"
-#include "pipe/draw/draw_vertex.h"
+#include "cell/common.h"
+#include "draw/draw_vertex.h"
 #include "pipe/p_state.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 932fb500b3..20e77aa2e6 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -34,7 +34,7 @@
 #include "spu_render.h"
 #include "spu_tri.h"
 #include "spu_tile.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.h b/src/gallium/drivers/cell/spu/spu_render.h
index fbcdc5ec31..493434f087 100644
--- a/src/gallium/drivers/cell/spu/spu_render.h
+++ b/src/gallium/drivers/cell/spu/spu_render.h
@@ -29,7 +29,7 @@
 #ifndef SPU_RENDER_H
 #define SPU_RENDER_H
 
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 extern void
 cmd_render(const struct cell_command_render *render, uint *pos_incr);
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
index e53340a55a..3105b848fd 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.h
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -32,7 +32,7 @@
 #include <libmisc.h>
 #include <spu_mfcio.h>
 #include "spu_main.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index ac373240c1..ea4274a0a7 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -1,8 +1,8 @@
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 //#include "tgsi_build.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_util.h"
 
 unsigned
 tgsi_util_get_src_register_swizzle(
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index c1cbbb6d1e..3f5bf41aa2 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -39,9 +39,9 @@
 #include "pipe/p_shader_tokens.h"
 #include "spu_vertex_shader.h"
 #include "spu_exec.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/cell/common.h"
+#include "draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "cell/common.h"
 #include "spu_main.h"
 
 static INLINE unsigned
diff --git a/src/gallium/drivers/failover/Makefile b/src/gallium/drivers/failover/Makefile
index 72d0895c74..14389bd055 100644
--- a/src/gallium/drivers/failover/Makefile
+++ b/src/gallium/drivers/failover/Makefile
@@ -15,7 +15,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
 
diff --git a/src/gallium/drivers/i915simple/Makefile b/src/gallium/drivers/i915simple/Makefile
index 2f91de3afc..ee22ba86f9 100644
--- a/src/gallium/drivers/i915simple/Makefile
+++ b/src/gallium/drivers/i915simple/Makefile
@@ -32,7 +32,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
 
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index 497623a700..7f71f8fd4f 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -32,7 +32,7 @@
 #include "i915_texture.h"
 #include "i915_reg.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_util.h"
diff --git a/src/gallium/drivers/i915simple/i915_context.h b/src/gallium/drivers/i915simple/i915_context.h
index b4ea63c3e7..2d876925b2 100644
--- a/src/gallium/drivers/i915simple/i915_context.h
+++ b/src/gallium/drivers/i915simple/i915_context.h
@@ -33,7 +33,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_vertex.h"
 
 
 #define I915_TEX_UNITS 8
diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c
index 868f0c7e04..6c1524c768 100644
--- a/src/gallium/drivers/i915simple/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915simple/i915_fpc_translate.c
@@ -33,9 +33,9 @@
 #include "i915_fpc.h"
 
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_vertex.h"
 
 
 /**
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c
index c4a706c37d..44c4325936 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915simple/i915_prim_emit.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_private.h"
 #include "pipe/p_util.h"
 
 #include "i915_context.h"
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
index e069773fd4..c5bf6174f6 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -38,7 +38,7 @@
  */
 
 
-#include "pipe/draw/draw_vbuf.h"
+#include "draw/draw_vbuf.h"
 #include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c
index abd5571b88..294e6fad03 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915simple/i915_state.c
@@ -29,7 +29,7 @@
  */
 
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_util.h"
 
diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c
index 653983e4a9..4767584fc6 100644
--- a/src/gallium/drivers/i915simple/i915_state_derived.c
+++ b/src/gallium/drivers/i915simple/i915_state_derived.c
@@ -27,8 +27,8 @@
 
 
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
 #include "i915_context.h"
 #include "i915_state.h"
 #include "i915_reg.h"
diff --git a/src/gallium/drivers/i915simple/i915_strings.c b/src/gallium/drivers/i915simple/i915_strings.c
index c713bf7208..301fedea19 100644
--- a/src/gallium/drivers/i915simple/i915_strings.c
+++ b/src/gallium/drivers/i915simple/i915_strings.c
@@ -70,7 +70,7 @@ static const char *i915_get_name( struct pipe_context *pipe )
       break;
    }
 
-   sprintf(buffer, "pipe/i915 (chipset: %s)", chipset);
+   sprintf(buffer, "i915 (chipset: %s)", chipset);
    return buffer;
 }
 
diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915simple/i915_surface.c
index de0cc5fe06..17fd27895a 100644
--- a/src/gallium/drivers/i915simple/i915_surface.c
+++ b/src/gallium/drivers/i915simple/i915_surface.c
@@ -33,7 +33,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 
 
 /*
diff --git a/src/gallium/drivers/i965simple/Makefile b/src/gallium/drivers/i965simple/Makefile
index 48c00ab50b..1dec1f9749 100644
--- a/src/gallium/drivers/i965simple/Makefile
+++ b/src/gallium/drivers/i965simple/Makefile
@@ -61,6 +61,6 @@ ASM_SOURCES =
 
 DRIVER_DEFINES = -I.
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c
index 431b45466a..a762a870fe 100644
--- a/src/gallium/drivers/i965simple/brw_shader_info.c
+++ b/src/gallium/drivers/i965simple/brw_shader_info.c
@@ -3,7 +3,7 @@
 #include "brw_state.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c
index 95dfce88e4..f746d1cc57 100644
--- a/src/gallium/drivers/i965simple/brw_state.c
+++ b/src/gallium/drivers/i965simple/brw_state.c
@@ -33,7 +33,7 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/util/tgsi_dump.h"
 
 #include "brw_context.h"
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965simple/brw_strings.c b/src/gallium/drivers/i965simple/brw_strings.c
index 29a41ed1e9..3d9c50961f 100644
--- a/src/gallium/drivers/i965simple/brw_strings.c
+++ b/src/gallium/drivers/i965simple/brw_strings.c
@@ -59,7 +59,7 @@ static const char *brw_get_name( struct pipe_context *pipe )
       break;
    }
 
-   sprintf(buffer, "pipe/i965 (chipset: %s)", chipset);
+   sprintf(buffer, "i965 (chipset: %s)", chipset);
    return buffer;
 }
 
diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c
index 518845e4b2..376a42b1a6 100644
--- a/src/gallium/drivers/i965simple/brw_surface.c
+++ b/src/gallium/drivers/i965simple/brw_surface.c
@@ -32,7 +32,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 
 
 /*
diff --git a/src/gallium/drivers/i965simple/brw_vs_emit.c b/src/gallium/drivers/i965simple/brw_vs_emit.c
index 98915ba101..05df4860ed 100644
--- a/src/gallium/drivers/i965simple/brw_vs_emit.c
+++ b/src/gallium/drivers/i965simple/brw_vs_emit.c
@@ -33,7 +33,7 @@
 #include "brw_vs.h"
 
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
 struct brw_prog_info {
    unsigned num_temps;
diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c
index b45a333a2e..97418a52e7 100644
--- a/src/gallium/drivers/i965simple/brw_wm_decl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_decl.c
@@ -4,7 +4,7 @@
 #include "brw_wm.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c
index d95645d108..44f946ea74 100644
--- a/src/gallium/drivers/i965simple/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c
@@ -4,7 +4,7 @@
 #include "brw_wm.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
 
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index 31438a882e..2304ea4246 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -44,7 +44,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
 
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index cea6b90104..5e98f190bb 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -29,7 +29,7 @@
  *    Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_util.h"
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index aff8c2cc5d..8c79cb3ce4 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -34,7 +34,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_vertex.h"
 
 #include "sp_quad.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 71a303a8b5..2049afda34 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -38,7 +38,7 @@
 #include "sp_context.h"
 #include "sp_state.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index ced0d5d098..2cbd0d7cab 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -31,7 +31,7 @@
 
 
 #include "pipe/p_defines.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "sp_flush.h"
 #include "sp_context.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_headers.h b/src/gallium/drivers/softpipe/sp_headers.h
index 0ae31d8796..9cf8222133 100644
--- a/src/gallium/drivers/softpipe/sp_headers.h
+++ b/src/gallium/drivers/softpipe/sp_headers.h
@@ -31,7 +31,7 @@
 #ifndef SP_HEADERS_H
 #define SP_HEADERS_H
 
-#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "tgsi/exec/tgsi_exec.h"
 
 #define PRIM_POINT 1
 #define PRIM_LINE  2
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c
index 2772048661..d73521ccbe 100644
--- a/src/gallium/drivers/softpipe/sp_prim_setup.c
+++ b/src/gallium/drivers/softpipe/sp_prim_setup.c
@@ -38,8 +38,8 @@
 #include "sp_quad.h"
 #include "sp_state.h"
 #include "sp_prim_setup.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vertex.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 7f71fdb6a9..69bea8a8f5 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -39,9 +39,9 @@
 #include "sp_context.h"
 #include "sp_state.h"
 #include "sp_prim_vbuf.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vbuf.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
 
 
 #define SP_MAX_VBUF_INDEXES 1024
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 3316858413..b4c01a7ea8 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -42,7 +42,7 @@
 #include "x86/rtasm/x86sse.h"
 
 #ifdef MESA_LLVM
-#include "pipe/llvm/gallivm.h"
+#include "llvm/gallivm.h"
 #endif
 
 #include "sp_context.h"
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index 6a8a43aeda..adf9ccf64c 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -29,7 +29,7 @@
  *    Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_util.h"
diff --git a/src/gallium/drivers/softpipe/sp_state_clip.c b/src/gallium/drivers/softpipe/sp_state_clip.c
index 08c5f06d05..c797c0dd3b 100644
--- a/src/gallium/drivers/softpipe/sp_state_clip.c
+++ b/src/gallium/drivers/softpipe/sp_state_clip.c
@@ -29,7 +29,7 @@
  */
 #include "sp_context.h"
 #include "sp_state.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void softpipe_set_clip_state( struct pipe_context *pipe,
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 372597869f..9d8fd8b750 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -27,9 +27,9 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_vertex.h"
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_private.h"
 #include "sp_context.h"
 #include "sp_state.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 0b814fc284..1e3cadd43d 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -32,11 +32,11 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/llvm/gallivm.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
-#include "pipe/tgsi/exec/tgsi_sse2.h"
+#include "llvm/gallivm.h"
+#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/exec/tgsi_sse2.h"
 
 
 void *
diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
index 53755099dd..98e04352db 100644
--- a/src/gallium/drivers/softpipe/sp_state_rasterizer.c
+++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
@@ -29,7 +29,7 @@
 #include "pipe/p_util.h"
 #include "sp_context.h"
 #include "sp_state.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index ea348c7e95..460adccec4 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -31,14 +31,14 @@
 
 #include "pipe/p_util.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 #include "sp_context.h"
 #include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
 #include "sp_tile_cache.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_vertex.c b/src/gallium/drivers/softpipe/sp_state_vertex.c
index 09ff540ccf..f01a10de3b 100644
--- a/src/gallium/drivers/softpipe/sp_state_vertex.c
+++ b/src/gallium/drivers/softpipe/sp_state_vertex.c
@@ -33,7 +33,7 @@
 #include "sp_state.h"
 #include "sp_surface.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void
diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c
index 8802ced187..653449c4f1 100644
--- a/src/gallium/drivers/softpipe/sp_surface.c
+++ b/src/gallium/drivers/softpipe/sp_surface.c
@@ -29,7 +29,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "sp_context.h"
 #include "sp_surface.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 325bdb86da..2f82fd6abe 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -40,7 +40,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
-#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "tgsi/exec/tgsi_exec.h"
 
 
 /*
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 1597361b82..dde3fabc81 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -34,7 +34,7 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "sp_context.h"
 #include "sp_surface.h"
 #include "sp_tile_cache.h"
diff --git a/src/gallium/winsys/dri/Makefile b/src/gallium/winsys/dri/Makefile
new file mode 100644
index 0000000000..f466ce6c3c
--- /dev/null
+++ b/src/gallium/winsys/dri/Makefile
@@ -0,0 +1,38 @@
+# src/mesa/drivers/dri/Makefile
+
+TOP = ../../../..
+
+include $(TOP)/configs/current
+
+
+
+default: $(TOP)/$(LIB_DIR) subdirs
+
+
+$(TOP)/$(LIB_DIR):
+	-mkdir $(TOP)/$(LIB_DIR)
+
+
+subdirs:
+	@for dir in $(DRI_DIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+install:
+	@for dir in $(DRI_DIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE) install) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	@for dir in $(DRI_DIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE) clean) ; \
+		fi \
+	done
+	-rm -f common/*.o
diff --git a/src/gallium/winsys/dri/Makefile.template b/src/gallium/winsys/dri/Makefile.template
new file mode 100644
index 0000000000..b96305c094
--- /dev/null
+++ b/src/gallium/winsys/dri/Makefile.template
@@ -0,0 +1,113 @@
+# -*-makefile-*-
+
+MESA_MODULES = $(TOP)/src/mesa/libmesa.a
+
+COMMON_GALLIUM_SOURCES = \
+        $(TOP)/src/mesa/drivers/dri/common/utils.c \
+        $(TOP)/src/mesa/drivers/dri/common/vblank.c \
+        $(TOP)/src/mesa/drivers/dri/common/dri_util.c \
+        $(TOP)/src/mesa/drivers/dri/common/xmlconfig.c
+
+COMMON_SOURCES = $(COMMON_GALLIUM_SOURCES) \
+        $(TOP)/src/mesa/drivers/common/driverfuncs.c \
+        $(TOP)/src/mesa/drivers/dri/common/texmem.c \
+        $(TOP)/src/mesa/drivers/dri/common/drirenderbuffer.c
+
+COMMON_BM_SOURCES = \
+	$(TOP)/src/mesa/drivers/dri/common/dri_bufmgr.c \
+	$(TOP)/src/mesa/drivers/dri/common/dri_drmpool.c
+
+
+ifeq ($(WINDOW_SYSTEM),dri)
+WINOBJ=
+WINLIB=
+INCLUDES = $(SHARED_INCLUDES) $(EXPAT_INCLUDES)
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+	  $(ASM_SOURCES:.S=.o) 
+
+else
+# miniglx
+WINOBJ=
+WINLIB=-L$(MESA)/src/glx/mini
+MINIGLX_INCLUDES = -I$(TOP)/src/glx/mini
+INCLUDES = $(MINIGLX_INCLUDES) \
+	   $(SHARED_INCLUDES) \
+	   $(PCIACCESS_CFLAGS)
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+	  $(MINIGLX_SOURCES:.c=.o) \
+	  $(ASM_SOURCES:.S=.o) 
+endif
+
+
+### Include directories
+SHARED_INCLUDES = \
+	-I. \
+	-I$(TOP)/src/mesa/drivers/dri/common \
+	-Iserver \
+	-I$(TOP)/include \
+	-I$(TOP)/include/GL/internal \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/drivers \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/mesa/main \
+	-I$(TOP)/src/mesa/glapi \
+	-I$(TOP)/src/mesa/math \
+	-I$(TOP)/src/mesa/transform \
+	-I$(TOP)/src/mesa/shader \
+	-I$(TOP)/src/mesa/swrast \
+	-I$(TOP)/src/mesa/swrast_setup \
+	-I$(TOP)/src/egl/main \
+	-I$(TOP)/src/egl/drivers/dri \
+	$(LIBDRM_CFLAGS)
+
+
+##### RULES #####
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES)  $< -o $@
+
+
+##### TARGETS #####
+
+default: depend symlinks $(LIBNAME) $(TOP)/$(LIB_DIR)/$(LIBNAME)
+
+
+$(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(PIPE_DRIVERS) $(WINOBJ) Makefile $(TOP)/src/mesa/drivers/dri/Makefile.template
+	$(TOP)/bin/mklib -noprefix -o $@ \
+		$(OBJECTS) $(PIPE_DRIVERS) $(MESA_MODULES)  $(WINOBJ) $(DRI_LIB_DEPS)
+
+
+$(TOP)/$(LIB_DIR)/$(LIBNAME): $(LIBNAME)
+	$(INSTALL) $(LIBNAME) $(TOP)/$(LIB_DIR) 
+
+
+depend: $(C_SOURCES) $(ASM_SOURCES) $(SYMLINKS)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) \
+		$(ASM_SOURCES) 2> /dev/null
+
+
+# Emacs tags
+tags:
+	etags `find . -name \*.[ch]` `find ../include`
+
+
+# Remove .o and backup files
+clean:
+	-rm -f *.o */*.o *~ *.so *~ server/*.o $(SYMLINKS)
+	-rm -f depend depend.bak
+
+
+install: $(LIBNAME)
+	$(INSTALL) -d $(DRI_DRIVER_INSTALL_DIR)
+	$(INSTALL) -m 755 $(LIBNAME) $(DRI_DRIVER_INSTALL_DIR)
+
+
+include depend
diff --git a/src/gallium/winsys/dri/intel/Makefile b/src/gallium/winsys/dri/intel/Makefile
index 9ae0f01325..40654bb2ac 100644
--- a/src/gallium/winsys/dri/intel/Makefile
+++ b/src/gallium/winsys/dri/intel/Makefile
@@ -7,8 +7,8 @@ LIBNAME = i915tex_dri.so
 MINIGLX_SOURCES = server/intel_dri.c
 
 PIPE_DRIVERS = \
-	$(TOP)/src/mesa/pipe/softpipe/libsoftpipe.a \
-	$(TOP)/src/mesa/pipe/i915simple/libi915simple.a
+	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+	$(TOP)/src/gallium/drivers/i915simple/libi915simple.a
 
 DRIVER_SOURCES = \
 	intel_winsys_pipe.c \
@@ -28,11 +28,11 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-DRIVER_DEFINES = -I../intel $(shell pkg-config libdrm --atleast-version=2.3.1 \
+DRIVER_DEFINES = -I$(TOP)/src/mesa/drivers/dri/intel $(shell pkg-config libdrm --atleast-version=2.3.1 \
 				&& echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP")
 
 include ../Makefile.template
 
-intel_tex_layout.o: ../intel/intel_tex_layout.c
+intel_tex_layout.o: $(TOP)/src/mesa/drivers/dri/intel/intel_tex_layout.c
 
 symlinks:
diff --git a/src/gallium/winsys/dri/intel/intel_winsys_i915.c b/src/gallium/winsys/dri/intel/intel_winsys_i915.c
index 1ba6a9e1b2..0ed3890e93 100644
--- a/src/gallium/winsys/dri/intel/intel_winsys_i915.c
+++ b/src/gallium/winsys/dri/intel/intel_winsys_i915.c
@@ -39,7 +39,7 @@
 #include "intel_winsys.h"
 
 #include "pipe/p_util.h"
-#include "pipe/i915simple/i915_winsys.h"
+#include "i915simple/i915_winsys.h"
 
 
 struct intel_i915_winsys {
diff --git a/src/gallium/winsys/dri/intel/intel_winsys_softpipe.c b/src/gallium/winsys/dri/intel/intel_winsys_softpipe.c
index cec3437831..9e483bdc9f 100644
--- a/src/gallium/winsys/dri/intel/intel_winsys_softpipe.c
+++ b/src/gallium/winsys/dri/intel/intel_winsys_softpipe.c
@@ -34,7 +34,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
 #include "pipe/p_format.h"
-#include "pipe/softpipe/sp_winsys.h"
+#include "softpipe/sp_winsys.h"
 
 
 struct intel_softpipe_winsys {
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index c3cd22eea3..8da596d419 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -41,11 +41,11 @@
 #include "pipe/p_context.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "pipe/softpipe/sp_winsys.h"
+#include "softpipe/sp_winsys.h"
 
 #ifdef GALLIUM_CELL
-#include "pipe/cell/ppu/cell_context.h"
-#include "pipe/cell/ppu/cell_winsys.h"
+#include "cell/ppu/cell_context.h"
+#include "cell/ppu/cell_winsys.h"
 #else
 #define TILE_SIZE 32  /* avoid compilation errors */
 #endif
diff --git a/src/gallium/winsys/xlib/xm_winsys_aub.c b/src/gallium/winsys/xlib/xm_winsys_aub.c
index bf41570257..dbfd37bda2 100644
--- a/src/gallium/winsys/xlib/xm_winsys_aub.c
+++ b/src/gallium/winsys/xlib/xm_winsys_aub.c
@@ -39,7 +39,7 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "pipe/i965simple/brw_winsys.h"
+#include "i965simple/brw_winsys.h"
 #include "brw_aub.h"
 #include "xm_winsys_aub.h"
 
diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index 720f1b2e02..561608fedd 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -12,16 +12,16 @@ GL_TINY = 0$(MESA_MAJOR)0$(MESA_MINOR)0$(MESA_TINY)
 
 
 PIPE_LIB = \
-	$(TOP)/src/mesa/pipe/softpipe/libsoftpipe.a \
-	$(TOP)/src/mesa/pipe/i965simple/libi965simple.a
+	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+	$(TOP)/src/gallium/drivers/i965simple/libi965simple.a
 
 ifeq ($(CONFIG_NAME), linux-cell)
-CELL_LIB = $(TOP)/src/mesa/pipe/cell/ppu/libcell.a
-CELL_LIB_SPU = $(TOP)/src/mesa/pipe/cell/spu/g3d_spu.a
+CELL_LIB = $(TOP)/src/gallium/drivers/cell/ppu/libcell.a
+CELL_LIB_SPU = $(TOP)/src/gallium/drivers/cell/spu/g3d_spu.a
 endif
 
 ifeq ($(CONFIG_NAME), linux-llvm)
-LLVM_LIB = $(TOP)/src/mesa/pipe/llvm/libgallivm.a
+LLVM_LIB = $(TOP)/src/gallium/aux/llvm/libgallivm.a
 endif
 
 .SUFFIXES : .cpp
@@ -71,7 +71,7 @@ libmesa.a: $(SOLO_OBJECTS)
 	fi
 
 linux-solo: depend subdirs libmesa.a
-	cd drivers/dri ; $(MAKE)
+	cd $(TOP)/src/gallium/winsys/dri ; $(MAKE)
 
 
 #####################################################################
@@ -165,7 +165,6 @@ depend: $(ALL_SOURCES)
 subdirs:
 	@ (cd x86 ; $(MAKE))
 	@ (cd x86-64 ; $(MAKE))
-	(cd pipe ; $(MAKE))
 
 install: default
 	$(INSTALL) -d $(INSTALL_DIR)/include/GL
@@ -178,7 +177,7 @@ install: default
 		$(INSTALL) $(TOP)/$(LIB_DIR)/libOSMesa* $(INSTALL_DIR)/$(LIB_DIR); \
 	fi
 	@if [ "${DRIVER_DIRS}" = "dri" ] ; then \
-		cd drivers/dri ; $(MAKE) install ; \
+		cd $(TOP)/gallium/winsys/dri ; $(MAKE) install ; \
 	fi
 
 ## NOT INSTALLED YET:
@@ -198,7 +197,6 @@ clean:
 	(cd drivers/dri && $(MAKE) clean)
 	(cd x86 && $(MAKE) clean)
 	(cd x86-64 && $(MAKE) clean)
-	(cd pipe ; $(MAKE) clean )
 
 
 include depend
diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c
index 08c98eab48..18b033666f 100644
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -85,7 +85,7 @@
 
 #include "state_tracker/st_public.h"
 #include "state_tracker/st_context.h"
-#include "pipe/softpipe/sp_context.h"
+#include "softpipe/sp_context.h"
 #include "pipe/p_defines.h"
 
 /**
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index 8ae243ae66..34287effe1 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -53,7 +53,7 @@
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
 
-#include "pipe/softpipe/sp_context.h"
+#include "softpipe/sp_context.h"
 #include "state_tracker/st_public.h"
 #include "state_tracker/st_context.h"
 #include "state_tracker/st_draw.h"
diff --git a/src/mesa/drivers/x11/xm_surface.c b/src/mesa/drivers/x11/xm_surface.c
index 5533158ece..81616b92d9 100644
--- a/src/mesa/drivers/x11/xm_surface.c
+++ b/src/mesa/drivers/x11/xm_surface.c
@@ -45,10 +45,10 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/softpipe/sp_context.h"
-#include "pipe/softpipe/sp_clear.h"
-#include "pipe/softpipe/sp_tile_cache.h"
-#include "pipe/softpipe/sp_surface.h"
+#include "softpipe/sp_context.h"
+#include "softpipe/sp_clear.h"
+#include "softpipe/sp_tile_cache.h"
+#include "softpipe/sp_surface.h"
 #include "state_tracker/st_context.h"
 
 
diff --git a/src/mesa/drivers/x11/xm_winsys.c b/src/mesa/drivers/x11/xm_winsys.c
index a690df2772..2edc697693 100644
--- a/src/mesa/drivers/x11/xm_winsys.c
+++ b/src/mesa/drivers/x11/xm_winsys.c
@@ -38,7 +38,7 @@
 #include "main/macros.h"
 
 #include "pipe/p_winsys.h"
-#include "pipe/softpipe/sp_winsys.h"
+#include "softpipe/sp_winsys.h"
 
 
 /**
diff --git a/src/mesa/drivers/x11/xmesaP.h b/src/mesa/drivers/x11/xmesaP.h
index 4709d63394..fd2dfcd79a 100644
--- a/src/mesa/drivers/x11/xmesaP.h
+++ b/src/mesa/drivers/x11/xmesaP.h
@@ -37,8 +37,8 @@
 #include "xm_image.h"
 #endif
 #include "state_tracker/st_cb_fbo.h"
-#include "pipe/softpipe/sp_context.h"
-#include "pipe/softpipe/sp_surface.h"
+#include "softpipe/sp_context.h"
+#include "softpipe/sp_surface.h"
 
 
 extern _glthread_Mutex _xmesa_lock;
diff --git a/src/mesa/sources b/src/mesa/sources
index 1165425183..2d07738210 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -158,45 +158,45 @@ VF_SOURCES = \
 
 
 DRAW_SOURCES = \
-	pipe/draw/draw_clip.c \
-	pipe/draw/draw_context.c\
-	pipe/draw/draw_cull.c \
-	pipe/draw/draw_debug.c \
-	pipe/draw/draw_flatshade.c \
-	pipe/draw/draw_offset.c \
-	pipe/draw/draw_prim.c \
-	pipe/draw/draw_stipple.c \
-	pipe/draw/draw_twoside.c \
-	pipe/draw/draw_unfilled.c \
-	pipe/draw/draw_validate.c \
-	pipe/draw/draw_vbuf.c \
-	pipe/draw/draw_vertex.c \
-	pipe/draw/draw_vertex_cache.c \
-	pipe/draw/draw_vertex_fetch.c \
-	pipe/draw/draw_vertex_shader.c \
-	pipe/draw/draw_vf.c \
-	pipe/draw/draw_vf_generic.c \
-	pipe/draw/draw_vf_sse.c \
-	pipe/draw/draw_wide_prims.c
+	$(TOP)/src/gallium/aux/draw/draw_clip.c \
+	$(TOP)/src/gallium/aux/draw/draw_context.c\
+	$(TOP)/src/gallium/aux/draw/draw_cull.c \
+	$(TOP)/src/gallium/aux/draw/draw_debug.c \
+	$(TOP)/src/gallium/aux/draw/draw_flatshade.c \
+	$(TOP)/src/gallium/aux/draw/draw_offset.c \
+	$(TOP)/src/gallium/aux/draw/draw_prim.c \
+	$(TOP)/src/gallium/aux/draw/draw_stipple.c \
+	$(TOP)/src/gallium/aux/draw/draw_twoside.c \
+	$(TOP)/src/gallium/aux/draw/draw_unfilled.c \
+	$(TOP)/src/gallium/aux/draw/draw_validate.c \
+	$(TOP)/src/gallium/aux/draw/draw_vbuf.c \
+	$(TOP)/src/gallium/aux/draw/draw_vertex.c \
+	$(TOP)/src/gallium/aux/draw/draw_vertex_cache.c \
+	$(TOP)/src/gallium/aux/draw/draw_vertex_fetch.c \
+	$(TOP)/src/gallium/aux/draw/draw_vertex_shader.c \
+	$(TOP)/src/gallium/aux/draw/draw_vf.c \
+	$(TOP)/src/gallium/aux/draw/draw_vf_generic.c \
+	$(TOP)/src/gallium/aux/draw/draw_vf_sse.c \
+	$(TOP)/src/gallium/aux/draw/draw_wide_prims.c
 
 TGSIEXEC_SOURCES = \
-	pipe/tgsi/exec/tgsi_exec.c \
-	pipe/tgsi/exec/tgsi_sse2.c
+	$(TOP)/src/gallium/aux/tgsi/exec/tgsi_exec.c \
+	$(TOP)/src/gallium/aux/tgsi/exec/tgsi_sse2.c
 
 TGSIUTIL_SOURCES = \
-	pipe/tgsi/util/tgsi_build.c \
-	pipe/tgsi/util/tgsi_dump.c \
-	pipe/tgsi/util/tgsi_parse.c \
-	pipe/tgsi/util/tgsi_util.c
+	$(TOP)/src/gallium/aux/tgsi/util/tgsi_build.c \
+	$(TOP)/src/gallium/aux/tgsi/util/tgsi_dump.c \
+	$(TOP)/src/gallium/aux/tgsi/util/tgsi_parse.c \
+	$(TOP)/src/gallium/aux/tgsi/util/tgsi_util.c
 
 STATECACHE_SOURCES = \
-	pipe/cso_cache/cso_hash.c \
-	pipe/cso_cache/cso_cache.c
+	$(TOP)/src/gallium/aux/cso_cache/cso_hash.c \
+	$(TOP)/src/gallium/aux/cso_cache/cso_cache.c
 
 PIPEUTIL_SOURCES = \
-	pipe/util/p_debug.c \
-	pipe/util/p_tile.c \
-	pipe/util/p_util.c
+	$(TOP)/src/gallium/aux/util/p_debug.c \
+	$(TOP)/src/gallium/aux/util/p_tile.c \
+	$(TOP)/src/gallium/aux/util/p_util.c
 
 STATETRACKER_SOURCES = \
 	state_tracker/st_atom.c \
@@ -331,13 +331,13 @@ __COMMON_DRIVER_SOURCES =			\
 	drivers/common/driverfuncs.c
 
 X11_DRIVER_SOURCES =		\
-	pipe/xlib/glxapi.c	\
-	pipe/xlib/fakeglx.c	\
-	pipe/xlib/xfonts.c	\
-	pipe/xlib/xm_api.c	\
-	pipe/xlib/xm_winsys.c	\
-	pipe/xlib/xm_winsys_aub.c	\
-	pipe/xlib/brw_aub.c
+	$(TOP)/src/gallium/winsys/xlib/glxapi.c	\
+	$(TOP)/src/gallium/winsys/xlib/fakeglx.c	\
+	$(TOP)/src/gallium/winsys/xlib/xfonts.c	\
+	$(TOP)/src/gallium/winsys/xlib/xm_api.c	\
+	$(TOP)/src/gallium/winsys/xlib/xm_winsys.c	\
+	$(TOP)/src/gallium/winsys/xlib/xm_winsys_aub.c	\
+	$(TOP)/src/gallium/winsys/xlib/brw_aub.c
 
 OSMESA_DRIVER_SOURCES = \
 	drivers/osmesa/osmesa.c
@@ -425,7 +425,10 @@ FBDEV_DRIVER_OBJECTS = $(FBDEV_DRIVER_SOURCES:.c=.o)
 INCLUDE_DIRS = \
 	-I$(TOP)/include \
 	-I$(TOP)/src/mesa \
-	-I$(TOP)/src/mesa/main
+	-I$(TOP)/src/mesa/main \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/drivers \
+	-I$(TOP)/src/gallium/aux
 
 OLD_INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa/tnl \
@@ -435,4 +438,4 @@ OLD_INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa/shader \
 	-I$(TOP)/src/mesa/shader/grammar \
 	-I$(TOP)/src/mesa/shader/slang \
-	-I$(TOP)/src/mesa/pipe/tgsi
+	-I$(TOP)/s$(TOP)/src/gallium/aux/tgsi
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 2c6ec8421b..b67b620eaa 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -43,7 +43,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
 #include "st_context.h"
 #include "st_cache.h"
diff --git a/src/mesa/state_tracker/st_cache.c b/src/mesa/state_tracker/st_cache.c
index e0965b217a..2979e7fae5 100644
--- a/src/mesa/state_tracker/st_cache.c
+++ b/src/mesa/state_tracker/st_cache.c
@@ -36,8 +36,8 @@
 
 #include "pipe/p_state.h"
 
-#include "pipe/cso_cache/cso_cache.h"
-#include "pipe/cso_cache/cso_hash.h"
+#include "cso_cache/cso_cache.h"
+#include "cso_cache/cso_hash.h"
 
 
 /* Those function will either find the state of the given template
diff --git a/src/mesa/state_tracker/st_cache.h b/src/mesa/state_tracker/st_cache.h
index e0c176b0ff..b81de316ec 100644
--- a/src/mesa/state_tracker/st_cache.h
+++ b/src/mesa/state_tracker/st_cache.h
@@ -33,7 +33,7 @@
 #ifndef ST_CACHE_H
 #define ST_CACHE_H
 
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
 struct pipe_blend_state;
 struct pipe_sampler_state;
diff --git a/src/mesa/state_tracker/st_cb_accum.c b/src/mesa/state_tracker/st_cb_accum.c
index 3a3bf9016d..663c4f205d 100644
--- a/src/mesa/state_tracker/st_cb_accum.c
+++ b/src/mesa/state_tracker/st_cb_accum.c
@@ -43,7 +43,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 
 
 #define UNCLAMPED_FLOAT_TO_SHORT(us, f)  \
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index f13199a3c0..e2d4e06da1 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -56,7 +56,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "shader/prog_instruction.h"
 
 
diff --git a/src/mesa/state_tracker/st_cb_feedback.c b/src/mesa/state_tracker/st_cb_feedback.c
index 31744151f1..5315294c07 100644
--- a/src/mesa/state_tracker/st_cb_feedback.c
+++ b/src/mesa/state_tracker/st_cb_feedback.c
@@ -53,10 +53,10 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
 
 
 /**
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index af3ee65504..61d4f4c41c 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -39,8 +39,8 @@
 #include "shader/programopt.h"
 #include "shader/shader_api.h"
 
-#include "pipe/cso_cache/cso_cache.h"
-#include "pipe/draw/draw_context.h"
+#include "cso_cache/cso_cache.h"
+#include "draw/draw_context.h"
 
 #include "st_context.h"
 #include "st_program.h"
diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c
index 7e347c4893..5b0eb6022b 100644
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -44,8 +44,8 @@
 #include "st_draw.h"
 #include "st_cb_rasterpos.h"
 #include "st_draw.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
 #include "shader/prog_instruction.h"
 #include "vbo/vbo.h"
 
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 868c5f3c5f..c89c74229e 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -40,7 +40,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "st_context.h"
 #include "st_cb_readpixels.h"
 #include "st_cb_fbo.h"
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 91a40288cc..03dbb30b0f 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -47,7 +47,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 
 
 #define DBG if (0) printf
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index bf4618bed8..09e389f9dc 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -54,8 +54,8 @@
 #include "pipe/p_context.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_inlines.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/cso_cache/cso_cache.h"
+#include "draw/draw_context.h"
+#include "cso_cache/cso_cache.h"
 
 
 /**
diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c
index 57450e52bf..5888bcb98a 100644
--- a/src/mesa/state_tracker/st_debug.c
+++ b/src/mesa/state_tracker/st_debug.c
@@ -31,9 +31,9 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/util/tgsi_dump.h"
 
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
 #include "st_context.h"
 #include "st_debug.h"
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index ae9f5c8b11..1c0fa8c6aa 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -47,8 +47,8 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_inlines.h"
 
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_context.h"
 
 
 static GLuint double_types[4] = {
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index 459941cca8..6c09b86033 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -37,7 +37,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
 #include "st_context.h"
 #include "st_draw.h"
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 325aa20173..97206752af 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -32,9 +32,9 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_build.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_build.h"
+#include "tgsi/util/tgsi_util.h"
 #include "st_mesa_to_tgsi.h"
 #include "shader/prog_instruction.h"
 #include "shader/prog_parameter.h"
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index c8297baded..dc992ee9c2 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -38,8 +38,8 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "draw/draw_context.h"
+#include "tgsi/util/tgsi_dump.h"
 
 #include "st_context.h"
 #include "st_cache.h"
-- 
cgit v1.2.3


From 66f22aa3bf7fa546e946b45156aa578e202982c9 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Fri, 15 Feb 2008 20:11:40 +0900
Subject: Code reorganization: s/aux/auxiliary/ -- update build.

---
 src/gallium/Makefile                     |  2 +-
 src/gallium/Makefile.template            |  2 +-
 src/gallium/auxiliary/llvm/Makefile      |  2 +-
 src/gallium/drivers/cell/ppu/Makefile    |  2 +-
 src/gallium/drivers/cell/spu/Makefile    |  2 +-
 src/gallium/winsys/dri/Makefile.template |  2 +-
 src/mesa/Makefile                        |  2 +-
 src/mesa/sources                         | 66 ++++++++++++++++----------------
 8 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/Makefile b/src/gallium/Makefile
index a13b9a52d3..89e068a449 100644
--- a/src/gallium/Makefile
+++ b/src/gallium/Makefile
@@ -2,7 +2,7 @@ TOP = ../..
 include $(TOP)/configs/current
 
 
-SUBDIRS = aux drivers
+SUBDIRS = auxiliary drivers
 
 
 default: subdirs
diff --git a/src/gallium/Makefile.template b/src/gallium/Makefile.template
index 0717ed8dd2..83b25f9b47 100644
--- a/src/gallium/Makefile.template
+++ b/src/gallium/Makefile.template
@@ -17,7 +17,7 @@ INCLUDES = \
 	-I. \
 	-I$(TOP)/src/gallium/include \
 	-I$(TOP)/src/gallium/include/pipe \
-	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/auxiliary \
 	-I$(TOP)/src/gallium/drivers \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/include \
diff --git a/src/gallium/auxiliary/llvm/Makefile b/src/gallium/auxiliary/llvm/Makefile
index e6ac399d08..e0abf860c1 100644
--- a/src/gallium/auxiliary/llvm/Makefile
+++ b/src/gallium/auxiliary/llvm/Makefile
@@ -31,7 +31,7 @@ OBJECTS = $(C_SOURCES:.c=.o) \
 INCLUDES = \
 	-I. \
 	-I$(TOP)/src/gallium/drivers
-	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/auxiliary \
 	-I$(TOP)/src/gallium/include \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/include
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 011863c11e..a4c3f29e8a 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -43,7 +43,7 @@ OBJECTS = $(SOURCES:.c=.o) \
 INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/src/gallium/include \
-	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/auxiliary \
 	-I$(TOP)/src/gallium/drivers
 
 .c.o:
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 7aa947299e..30ef2450ec 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -34,7 +34,7 @@ SPU_ASM_OUT = $(SOURCES:.c=.s) \
 INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/src/gallium/include \
-	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/auxiliary \
 	-I$(TOP)/src/gallium/drivers
 
 
diff --git a/src/gallium/winsys/dri/Makefile.template b/src/gallium/winsys/dri/Makefile.template
index b96305c094..2a261ed669 100644
--- a/src/gallium/winsys/dri/Makefile.template
+++ b/src/gallium/winsys/dri/Makefile.template
@@ -49,7 +49,7 @@ SHARED_INCLUDES = \
 	-I$(TOP)/include \
 	-I$(TOP)/include/GL/internal \
 	-I$(TOP)/src/gallium/include \
-	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/auxiliary \
 	-I$(TOP)/src/gallium/drivers \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/src/mesa/main \
diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index 561608fedd..c8cb2b592f 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -21,7 +21,7 @@ CELL_LIB_SPU = $(TOP)/src/gallium/drivers/cell/spu/g3d_spu.a
 endif
 
 ifeq ($(CONFIG_NAME), linux-llvm)
-LLVM_LIB = $(TOP)/src/gallium/aux/llvm/libgallivm.a
+LLVM_LIB = $(TOP)/src/gallium/auxiliary/llvm/libgallivm.a
 endif
 
 .SUFFIXES : .cpp
diff --git a/src/mesa/sources b/src/mesa/sources
index 2d07738210..cecd8a830f 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -158,45 +158,45 @@ VF_SOURCES = \
 
 
 DRAW_SOURCES = \
-	$(TOP)/src/gallium/aux/draw/draw_clip.c \
-	$(TOP)/src/gallium/aux/draw/draw_context.c\
-	$(TOP)/src/gallium/aux/draw/draw_cull.c \
-	$(TOP)/src/gallium/aux/draw/draw_debug.c \
-	$(TOP)/src/gallium/aux/draw/draw_flatshade.c \
-	$(TOP)/src/gallium/aux/draw/draw_offset.c \
-	$(TOP)/src/gallium/aux/draw/draw_prim.c \
-	$(TOP)/src/gallium/aux/draw/draw_stipple.c \
-	$(TOP)/src/gallium/aux/draw/draw_twoside.c \
-	$(TOP)/src/gallium/aux/draw/draw_unfilled.c \
-	$(TOP)/src/gallium/aux/draw/draw_validate.c \
-	$(TOP)/src/gallium/aux/draw/draw_vbuf.c \
-	$(TOP)/src/gallium/aux/draw/draw_vertex.c \
-	$(TOP)/src/gallium/aux/draw/draw_vertex_cache.c \
-	$(TOP)/src/gallium/aux/draw/draw_vertex_fetch.c \
-	$(TOP)/src/gallium/aux/draw/draw_vertex_shader.c \
-	$(TOP)/src/gallium/aux/draw/draw_vf.c \
-	$(TOP)/src/gallium/aux/draw/draw_vf_generic.c \
-	$(TOP)/src/gallium/aux/draw/draw_vf_sse.c \
-	$(TOP)/src/gallium/aux/draw/draw_wide_prims.c
+	$(TOP)/src/gallium/auxiliary/draw/draw_clip.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_context.c\
+	$(TOP)/src/gallium/auxiliary/draw/draw_cull.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_debug.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_flatshade.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_offset.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_prim.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_stipple.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_twoside.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_unfilled.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_validate.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_vbuf.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_vertex.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_vertex_cache.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_vertex_fetch.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_vertex_shader.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_vf.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_vf_generic.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_vf_sse.c \
+	$(TOP)/src/gallium/auxiliary/draw/draw_wide_prims.c
 
 TGSIEXEC_SOURCES = \
-	$(TOP)/src/gallium/aux/tgsi/exec/tgsi_exec.c \
-	$(TOP)/src/gallium/aux/tgsi/exec/tgsi_sse2.c
+	$(TOP)/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c \
+	$(TOP)/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
 
 TGSIUTIL_SOURCES = \
-	$(TOP)/src/gallium/aux/tgsi/util/tgsi_build.c \
-	$(TOP)/src/gallium/aux/tgsi/util/tgsi_dump.c \
-	$(TOP)/src/gallium/aux/tgsi/util/tgsi_parse.c \
-	$(TOP)/src/gallium/aux/tgsi/util/tgsi_util.c
+	$(TOP)/src/gallium/auxiliary/tgsi/util/tgsi_build.c \
+	$(TOP)/src/gallium/auxiliary/tgsi/util/tgsi_dump.c \
+	$(TOP)/src/gallium/auxiliary/tgsi/util/tgsi_parse.c \
+	$(TOP)/src/gallium/auxiliary/tgsi/util/tgsi_util.c
 
 STATECACHE_SOURCES = \
-	$(TOP)/src/gallium/aux/cso_cache/cso_hash.c \
-	$(TOP)/src/gallium/aux/cso_cache/cso_cache.c
+	$(TOP)/src/gallium/auxiliary/cso_cache/cso_hash.c \
+	$(TOP)/src/gallium/auxiliary/cso_cache/cso_cache.c
 
 PIPEUTIL_SOURCES = \
-	$(TOP)/src/gallium/aux/util/p_debug.c \
-	$(TOP)/src/gallium/aux/util/p_tile.c \
-	$(TOP)/src/gallium/aux/util/p_util.c
+	$(TOP)/src/gallium/auxiliary/util/p_debug.c \
+	$(TOP)/src/gallium/auxiliary/util/p_tile.c \
+	$(TOP)/src/gallium/auxiliary/util/p_util.c
 
 STATETRACKER_SOURCES = \
 	state_tracker/st_atom.c \
@@ -428,7 +428,7 @@ INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa/main \
 	-I$(TOP)/src/gallium/include \
 	-I$(TOP)/src/gallium/drivers \
-	-I$(TOP)/src/gallium/aux
+	-I$(TOP)/src/gallium/auxiliary
 
 OLD_INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa/tnl \
@@ -438,4 +438,4 @@ OLD_INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa/shader \
 	-I$(TOP)/src/mesa/shader/grammar \
 	-I$(TOP)/src/mesa/shader/slang \
-	-I$(TOP)/s$(TOP)/src/gallium/aux/tgsi
+	-I$(TOP)/s$(TOP)/src/gallium/auxiliary/tgsi
-- 
cgit v1.2.3


From 3320b1874e810583f95b93a89697b2955987b84f Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 15 Feb 2008 11:03:54 -0800
Subject: Cell: Enable code gen for SPE attribute fetch

Doubles are still unsupported.
---
 src/gallium/drivers/cell/common.h                 |  15 +-
 src/gallium/drivers/cell/ppu/Makefile             |   1 +
 src/gallium/drivers/cell/ppu/cell_context.h       |   4 +
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c  |  15 +-
 src/gallium/drivers/cell/ppu/cell_vertex_shader.c |  21 +-
 src/gallium/drivers/cell/spu/spu_main.c           |  22 +-
 src/gallium/drivers/cell/spu/spu_vertex_fetch.c   | 477 +---------------------
 src/gallium/drivers/cell/spu/spu_vertex_shader.h  |   6 +-
 8 files changed, 71 insertions(+), 490 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 4de514c358..74b131fbef 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -90,6 +90,7 @@
 #define CELL_CMD_STATE_VS_ARRAY_INFO 16
 #define CELL_CMD_STATE_BLEND         17
 #define CELL_CMD_VS_EXECUTE          18
+#define CELL_CMD_STATE_ATTRIB_FETCH  19
 
 
 #define CELL_NUM_BUFFERS 4
@@ -128,13 +129,19 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint64_t base;      /**< Base address of the 0th element. */
-    uint attr;          /**< Attribute that this state is for. */
-    uint pitch;         /**< Byte pitch from one entry to the next. */
-    uint format;        /**< Pipe format of each entry. */
+   uint64_t base;      /**< Base address of the 0th element. */
+   uint attr;          /**< Attribute that this state is for. */
+   uint pitch;         /**< Byte pitch from one entry to the next. */
+   uint size;
+   uint function_offset;
 } ALIGN16_ATTRIB;
 
 
+struct cell_attribute_fetch_code {
+   uint64_t base;
+   uint size;
+};
+
 struct cell_shader_info
 {
    unsigned num_outputs;
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index a4c3f29e8a..196ab777f5 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -34,6 +34,7 @@ SOURCES = \
 	cell_surface.c \
 	cell_texture.c \
 	cell_vbuf.c \
+	cell_vertex_fetch.c \
 	cell_vertex_shader.c \
 	cell_winsys.c
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 6196c0c72f..91f8e542a2 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -36,6 +36,7 @@
 #include "draw/draw_vbuf.h"
 #include "cell_winsys.h"
 #include "cell/common.h"
+#include "ppc/rtasm/spe_asm.h"
 
 
 struct cell_vbuf_render;
@@ -111,6 +112,9 @@ struct cell_context
    /** [4] to ensure 16-byte alignment for each status word */
    uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
+
+   struct spe_function attrib_fetch;
+   unsigned attrib_fetch_offsets[PIPE_ATTRIB_MAX];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index f2432f4ff5..f10689a959 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -27,10 +27,10 @@
 #include "pipe/p_context.h"
 #include "pipe/p_format.h"
 
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "../auxiliary/draw/draw_context.h"
+#include "../auxiliary/draw/draw_private.h"
 
-#include "pipe/cell/ppu/cell_context.h"
+#include "cell_context.h"
 #include "ppc/rtasm/spe_asm.h"
 
 typedef uint64_t register_mask;
@@ -380,13 +380,4 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 	     cell->attrib_fetch_offsets[function_index[i]];
       }
    }
-   
-   static first_time = 1;
-   if (first_time) {
-      first_time = 0;
-      const unsigned instructions = p->csr - p->store;
-      for (i = 0; i < instructions; i++) {
-	 printf("\t.long\t0x%08x\n", p->store[i]);
-      }
-   }
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
index 0ba4506505..6a1d3bc20a 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -55,14 +55,32 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    uint64_t *batch;
    struct cell_array_info *array_info;
    unsigned i, j;
+   struct cell_attribute_fetch_code *cf;
 
    assert(draw->vs.queue_nr != 0);
 
    /* XXX: do this on statechange: 
     */
    draw_update_vertex_fetch(draw);
+   cell_update_vertex_fetch(draw);
+
+
+   batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*cf));
+   batch[0] = CELL_CMD_STATE_ATTRIB_FETCH;
+   cf = (struct cell_attribute_fetch_code *) (&batch[1]);
+   cf->base = cell->attrib_fetch.store;
+   cf->size = ROUNDUP16((unsigned)((void *) cell->attrib_fetch.csr 
+				   - (void *) cell->attrib_fetch.store));
+
 
    for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      const enum pipe_format format = draw->vertex_element[i].src_format;
+      const unsigned count = ((pf_size_x(format) != 0)
+			      + (pf_size_y(format) != 0)
+			      + (pf_size_z(format) != 0)
+			      + (pf_size_w(format) != 0));
+      const unsigned size = pf_size_x(format) * count;
+
       batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info));
 
       batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO;
@@ -72,7 +90,8 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
       array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
       array_info->attr = i;
       array_info->pitch = draw->vertex_fetch.pitch[i];
-      array_info->format = draw->vertex_element[i].src_format;
+      array_info->size = size;
+      array_info->function_offset = cell->attrib_fetch_offsets[i];
    }
 
    batch = cell_batch_alloc(cell, sizeof(batch[0])
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 1e7243b863..fcbf0f841e 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -54,6 +54,9 @@ struct spu_global spu;
 
 struct spu_vs_context draw;
 
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX]
+    ALIGN16_ATTRIB;
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -306,7 +309,8 @@ cmd_state_vs_array_info(const struct cell_array_info *vs_info)
    ASSERT(attr < PIPE_ATTRIB_MAX);
    draw.vertex_fetch.src_ptr[attr] = vs_info->base;
    draw.vertex_fetch.pitch[attr] = vs_info->pitch;
-   draw.vertex_fetch.format[attr] = vs_info->format;
+   draw.vertex_fetch.size[attr] = vs_info->size;
+   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
    draw.vertex_fetch.dirty = 1;
 }
 
@@ -433,6 +437,22 @@ cmd_batch(uint opcode)
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
+      case CELL_CMD_STATE_ATTRIB_FETCH: {
+         struct cell_attribute_fetch_code *code =
+             (struct cell_attribute_fetch_code *) &buffer[pos+1];
+
+              mfc_get(attribute_fetch_code_buffer,
+                      (unsigned int) code->base,  /* src */
+                      code->size,
+                      TAG_BATCH_BUFFER,
+                      0, /* tid */
+                      0  /* rid */);
+         wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+         draw.vertex_fetch.code = attribute_fetch_code_buffer;
+         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         break;
+      }
       default:
          printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 45e3c26c00..55c6c28717 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -1,6 +1,7 @@
 /**************************************************************************
  * 
  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * (C) Copyright IBM Corporation 2008
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -28,10 +29,10 @@
  /*
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Ian Romanick <idr@us.ibm.com>
   */
 
 #include <spu_mfcio.h>
-#include <transpose_matrix4x4.h>
 
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
@@ -59,6 +60,10 @@
 
 #define DRAW_DBG 0
 
+typedef void (*spu_fetch_func)(qword *out, const qword *in,
+			       const qword *shuffle_data);
+
+
 static const qword fetch_shuffle_data[] = {
    /* Shuffle used by CVT_64_FLOAT
     */
@@ -97,22 +102,6 @@ static const qword fetch_shuffle_data[] = {
 };
 
 
-static INLINE void
-trans4x4(qword row0, qword row1, qword row2, qword row3, qword *out,
-         const qword *shuffle)
-{
-   qword t1 = si_shufb(row0, row2, shuffle[3]);
-   qword t2 = si_shufb(row0, row2, shuffle[4]);
-   qword t3 = si_shufb(row1, row3, shuffle[3]);
-   qword t4 = si_shufb(row1, row3, shuffle[4]);
-
-   out[0] = si_shufb(t1, t3, shuffle[3]);
-   out[1] = si_shufb(t1, t3, shuffle[4]);
-   out[2] = si_shufb(t2, t4, shuffle[3]);
-   out[3] = si_shufb(t2, t4, shuffle[4]);
-}
-
-
 /**
  * Fetch between 1 and 32 bytes from an unaligned address
  */
@@ -151,446 +140,6 @@ fetch_unaligned(qword *dst, unsigned ea, unsigned size)
 }
 
 
-#define CVT_32_FLOAT(q, s)    (*(q))
-
-static INLINE qword
-CVT_64_FLOAT(const qword *qw, const qword *shuffle)
-{
-   qword a = si_frds(qw[0]);
-   qword b = si_frds(si_rotqbyi(qw[0], 8));
-   qword c = si_frds(qw[1]);
-   qword d = si_frds(si_rotqbyi(qw[1], 8));
-
-   qword ab = si_shufb(a, b, shuffle[0]);
-   qword cd = si_shufb(c, d, si_rotqbyi(shuffle[0], 8));
-   
-   return si_or(ab, cd);
-}
-
-
-static INLINE qword
-CVT_8_USCALED(const qword *qw, const qword *shuffle)
-{
-   return si_cuflt(si_shufb(*qw, *qw, shuffle[1]), 0);
-}
-
-
-static INLINE qword
-CVT_16_USCALED(const qword *qw, const qword *shuffle)
-{
-   return si_cuflt(si_shufb(*qw, *qw, shuffle[2]), 0);
-}
-
-
-static INLINE qword
-CVT_32_USCALED(const qword *qw, const qword *shuffle)
-{
-   (void) shuffle;
-   return si_cuflt(*qw, 0);
-}
-
-static INLINE qword
-CVT_8_SSCALED(const qword *qw, const qword *shuffle)
-{
-   return si_csflt(si_shufb(*qw, *qw, shuffle[1]), 0);
-}
-
-
-static INLINE qword
-CVT_16_SSCALED(const qword *qw, const qword *shuffle)
-{
-   return si_csflt(si_shufb(*qw, *qw, shuffle[2]), 0);
-}
-
-
-static INLINE qword
-CVT_32_SSCALED(const qword *qw, const qword *shuffle)
-{
-   (void) shuffle;
-   return si_csflt(*qw, 0);
-}
-
-
-static INLINE qword
-CVT_8_UNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 255.0f);
-   return si_fm(CVT_8_USCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_16_UNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 65535.0f);
-   return si_fm(CVT_16_USCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_32_UNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 4294967295.0f);
-   return si_fm(CVT_32_USCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_8_SNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 127.0f);
-   return si_fm(CVT_8_SSCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_16_SNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 32767.0f);
-   return si_fm(CVT_16_SSCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_32_SNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 2147483647.0f);
-   return si_fm(CVT_32_SSCALED(qw, shuffle), scale);
-}
-
-#define SZ_4 si_il(0U)
-#define SZ_3 si_fsmbi(0x000f)
-#define SZ_2 si_fsmbi(0x00ff)
-#define SZ_1 si_fsmbi(0x0fff)
-
-/**
- * Fetch a float[4] vertex attribute from memory, doing format/type
- * conversion as needed.
- *
- * This is probably needed/dupliocated elsewhere, eg format
- * conversion, texture sampling etc.
- */
-#define FETCH_ATTRIB( NAME, SZ, CVT, N )			\
-static void							\
-fetch_##NAME(qword *out, const qword *in, qword defaults, \
-                const qword *shuffle)	\
-{								\
-   qword tmp[4];						\
-								\
-   tmp[0] = si_selb(CVT(in + (0 * N), shuffle), defaults, SZ);		\
-   tmp[1] = si_selb(CVT(in + (1 * N), shuffle), defaults, SZ);		\
-   tmp[2] = si_selb(CVT(in + (2 * N), shuffle), defaults, SZ);		\
-   tmp[3] = si_selb(CVT(in + (3 * N), shuffle), defaults, SZ);		\
-   trans4x4(tmp[0], tmp[1], tmp[2], tmp[3], out, shuffle);		\
-}
-
-
-FETCH_ATTRIB( R64G64B64A64_FLOAT,   SZ_4, CVT_64_FLOAT, 2 )
-FETCH_ATTRIB( R64G64B64_FLOAT,      SZ_3, CVT_64_FLOAT, 2 )
-FETCH_ATTRIB( R64G64_FLOAT,         SZ_2, CVT_64_FLOAT, 2 )
-FETCH_ATTRIB( R64_FLOAT,            SZ_1, CVT_64_FLOAT, 2 )
-
-FETCH_ATTRIB( R32G32B32A32_FLOAT,   SZ_4, CVT_32_FLOAT, 1 )
-FETCH_ATTRIB( R32G32B32_FLOAT,      SZ_3, CVT_32_FLOAT, 1 )
-FETCH_ATTRIB( R32G32_FLOAT,         SZ_2, CVT_32_FLOAT, 1 )
-FETCH_ATTRIB( R32_FLOAT,            SZ_1, CVT_32_FLOAT, 1 )
-
-FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED, 1 )
-FETCH_ATTRIB( R32G32B32_USCALED,    SZ_3, CVT_32_USCALED, 1 )
-FETCH_ATTRIB( R32G32_USCALED,       SZ_2, CVT_32_USCALED, 1 )
-FETCH_ATTRIB( R32_USCALED,          SZ_1, CVT_32_USCALED, 1 )
-
-FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED, 1 )
-FETCH_ATTRIB( R32G32B32_SSCALED,    SZ_3, CVT_32_SSCALED, 1 )
-FETCH_ATTRIB( R32G32_SSCALED,       SZ_2, CVT_32_SSCALED, 1 )
-FETCH_ATTRIB( R32_SSCALED,          SZ_1, CVT_32_SSCALED, 1 )
-
-FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM, 1 )
-FETCH_ATTRIB( R32G32B32_UNORM,    SZ_3, CVT_32_UNORM, 1 )
-FETCH_ATTRIB( R32G32_UNORM,       SZ_2, CVT_32_UNORM, 1 )
-FETCH_ATTRIB( R32_UNORM,          SZ_1, CVT_32_UNORM, 1 )
-
-FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM, 1 )
-FETCH_ATTRIB( R32G32B32_SNORM,    SZ_3, CVT_32_SNORM, 1 )
-FETCH_ATTRIB( R32G32_SNORM,       SZ_2, CVT_32_SNORM, 1 )
-FETCH_ATTRIB( R32_SNORM,          SZ_1, CVT_32_SNORM, 1 )
-
-FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED, 1 )
-FETCH_ATTRIB( R16G16B16_USCALED,    SZ_3, CVT_16_USCALED, 1 )
-FETCH_ATTRIB( R16G16_USCALED,       SZ_2, CVT_16_USCALED, 1 )
-FETCH_ATTRIB( R16_USCALED,          SZ_1, CVT_16_USCALED, 1 )
-
-FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED, 1 )
-FETCH_ATTRIB( R16G16B16_SSCALED,    SZ_3, CVT_16_SSCALED, 1 )
-FETCH_ATTRIB( R16G16_SSCALED,       SZ_2, CVT_16_SSCALED, 1 )
-FETCH_ATTRIB( R16_SSCALED,          SZ_1, CVT_16_SSCALED, 1 )
-
-FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM, 1 )
-FETCH_ATTRIB( R16G16B16_UNORM,    SZ_3, CVT_16_UNORM, 1 )
-FETCH_ATTRIB( R16G16_UNORM,       SZ_2, CVT_16_UNORM, 1 )
-FETCH_ATTRIB( R16_UNORM,          SZ_1, CVT_16_UNORM, 1 )
-
-FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM, 1 )
-FETCH_ATTRIB( R16G16B16_SNORM,    SZ_3, CVT_16_SNORM, 1 )
-FETCH_ATTRIB( R16G16_SNORM,       SZ_2, CVT_16_SNORM, 1 )
-FETCH_ATTRIB( R16_SNORM,          SZ_1, CVT_16_SNORM, 1 )
-
-FETCH_ATTRIB( R8G8B8A8_USCALED,   SZ_4, CVT_8_USCALED, 1 )
-FETCH_ATTRIB( R8G8B8_USCALED,     SZ_3, CVT_8_USCALED, 1 )
-FETCH_ATTRIB( R8G8_USCALED,       SZ_2, CVT_8_USCALED, 1 )
-FETCH_ATTRIB( R8_USCALED,         SZ_1, CVT_8_USCALED, 1 )
-
-FETCH_ATTRIB( R8G8B8A8_SSCALED,  SZ_4, CVT_8_SSCALED, 1 )
-FETCH_ATTRIB( R8G8B8_SSCALED,    SZ_3, CVT_8_SSCALED, 1 )
-FETCH_ATTRIB( R8G8_SSCALED,      SZ_2, CVT_8_SSCALED, 1 )
-FETCH_ATTRIB( R8_SSCALED,        SZ_1, CVT_8_SSCALED, 1 )
-
-FETCH_ATTRIB( R8G8B8A8_UNORM,  SZ_4, CVT_8_UNORM, 1 )
-FETCH_ATTRIB( R8G8B8_UNORM,    SZ_3, CVT_8_UNORM, 1 )
-FETCH_ATTRIB( R8G8_UNORM,      SZ_2, CVT_8_UNORM, 1 )
-FETCH_ATTRIB( R8_UNORM,        SZ_1, CVT_8_UNORM, 1 )
-
-FETCH_ATTRIB( R8G8B8A8_SNORM,  SZ_4, CVT_8_SNORM, 1 )
-FETCH_ATTRIB( R8G8B8_SNORM,    SZ_3, CVT_8_SNORM, 1 )
-FETCH_ATTRIB( R8G8_SNORM,      SZ_2, CVT_8_SNORM, 1 )
-FETCH_ATTRIB( R8_SNORM,        SZ_1, CVT_8_SNORM, 1 )
-
-FETCH_ATTRIB( A8R8G8B8_UNORM,       SZ_4, CVT_8_UNORM, 1 )
-
-
-
-static spu_fetch_func get_fetch_func( enum pipe_format format )
-{
-   switch (format) {
-   case PIPE_FORMAT_R64_FLOAT:
-      return fetch_R64_FLOAT;
-   case PIPE_FORMAT_R64G64_FLOAT:
-      return fetch_R64G64_FLOAT;
-   case PIPE_FORMAT_R64G64B64_FLOAT:
-      return fetch_R64G64B64_FLOAT;
-   case PIPE_FORMAT_R64G64B64A64_FLOAT:
-      return fetch_R64G64B64A64_FLOAT;
-
-   case PIPE_FORMAT_R32_FLOAT:
-      return fetch_R32_FLOAT;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return fetch_R32G32_FLOAT;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return fetch_R32G32B32_FLOAT;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return fetch_R32G32B32A32_FLOAT;
-
-   case PIPE_FORMAT_R32_UNORM:
-      return fetch_R32_UNORM;
-   case PIPE_FORMAT_R32G32_UNORM:
-      return fetch_R32G32_UNORM;
-   case PIPE_FORMAT_R32G32B32_UNORM:
-      return fetch_R32G32B32_UNORM;
-   case PIPE_FORMAT_R32G32B32A32_UNORM:
-      return fetch_R32G32B32A32_UNORM;
-
-   case PIPE_FORMAT_R32_USCALED:
-      return fetch_R32_USCALED;
-   case PIPE_FORMAT_R32G32_USCALED:
-      return fetch_R32G32_USCALED;
-   case PIPE_FORMAT_R32G32B32_USCALED:
-      return fetch_R32G32B32_USCALED;
-   case PIPE_FORMAT_R32G32B32A32_USCALED:
-      return fetch_R32G32B32A32_USCALED;
-
-   case PIPE_FORMAT_R32_SNORM:
-      return fetch_R32_SNORM;
-   case PIPE_FORMAT_R32G32_SNORM:
-      return fetch_R32G32_SNORM;
-   case PIPE_FORMAT_R32G32B32_SNORM:
-      return fetch_R32G32B32_SNORM;
-   case PIPE_FORMAT_R32G32B32A32_SNORM:
-      return fetch_R32G32B32A32_SNORM;
-
-   case PIPE_FORMAT_R32_SSCALED:
-      return fetch_R32_SSCALED;
-   case PIPE_FORMAT_R32G32_SSCALED:
-      return fetch_R32G32_SSCALED;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-      return fetch_R32G32B32_SSCALED;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return fetch_R32G32B32A32_SSCALED;
-
-   case PIPE_FORMAT_R16_UNORM:
-      return fetch_R16_UNORM;
-   case PIPE_FORMAT_R16G16_UNORM:
-      return fetch_R16G16_UNORM;
-   case PIPE_FORMAT_R16G16B16_UNORM:
-      return fetch_R16G16B16_UNORM;
-   case PIPE_FORMAT_R16G16B16A16_UNORM:
-      return fetch_R16G16B16A16_UNORM;
-
-   case PIPE_FORMAT_R16_USCALED:
-      return fetch_R16_USCALED;
-   case PIPE_FORMAT_R16G16_USCALED:
-      return fetch_R16G16_USCALED;
-   case PIPE_FORMAT_R16G16B16_USCALED:
-      return fetch_R16G16B16_USCALED;
-   case PIPE_FORMAT_R16G16B16A16_USCALED:
-      return fetch_R16G16B16A16_USCALED;
-
-   case PIPE_FORMAT_R16_SNORM:
-      return fetch_R16_SNORM;
-   case PIPE_FORMAT_R16G16_SNORM:
-      return fetch_R16G16_SNORM;
-   case PIPE_FORMAT_R16G16B16_SNORM:
-      return fetch_R16G16B16_SNORM;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      return fetch_R16G16B16A16_SNORM;
-
-   case PIPE_FORMAT_R16_SSCALED:
-      return fetch_R16_SSCALED;
-   case PIPE_FORMAT_R16G16_SSCALED:
-      return fetch_R16G16_SSCALED;
-   case PIPE_FORMAT_R16G16B16_SSCALED:
-      return fetch_R16G16B16_SSCALED;
-   case PIPE_FORMAT_R16G16B16A16_SSCALED:
-      return fetch_R16G16B16A16_SSCALED;
-
-   case PIPE_FORMAT_R8_UNORM:
-      return fetch_R8_UNORM;
-   case PIPE_FORMAT_R8G8_UNORM:
-      return fetch_R8G8_UNORM;
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      return fetch_R8G8B8_UNORM;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return fetch_R8G8B8A8_UNORM;
-
-   case PIPE_FORMAT_R8_USCALED:
-      return fetch_R8_USCALED;
-   case PIPE_FORMAT_R8G8_USCALED:
-      return fetch_R8G8_USCALED;
-   case PIPE_FORMAT_R8G8B8_USCALED:
-      return fetch_R8G8B8_USCALED;
-   case PIPE_FORMAT_R8G8B8A8_USCALED:
-      return fetch_R8G8B8A8_USCALED;
-
-   case PIPE_FORMAT_R8_SNORM:
-      return fetch_R8_SNORM;
-   case PIPE_FORMAT_R8G8_SNORM:
-      return fetch_R8G8_SNORM;
-   case PIPE_FORMAT_R8G8B8_SNORM:
-      return fetch_R8G8B8_SNORM;
-   case PIPE_FORMAT_R8G8B8A8_SNORM:
-      return fetch_R8G8B8A8_SNORM;
-
-   case PIPE_FORMAT_R8_SSCALED:
-      return fetch_R8_SSCALED;
-   case PIPE_FORMAT_R8G8_SSCALED:
-      return fetch_R8G8_SSCALED;
-   case PIPE_FORMAT_R8G8B8_SSCALED:
-      return fetch_R8G8B8_SSCALED;
-   case PIPE_FORMAT_R8G8B8A8_SSCALED:
-      return fetch_R8G8B8A8_SSCALED;
-
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return fetch_A8R8G8B8_UNORM;
-
-   case 0:
-      return NULL;		/* not sure why this is needed */
-
-   default:
-      assert(0);
-      return NULL;
-   }
-}
-
-
-static unsigned get_vertex_size( enum pipe_format format )
-{
-   switch (format) {
-   case PIPE_FORMAT_R64_FLOAT:
-      return 8;
-   case PIPE_FORMAT_R64G64_FLOAT:
-      return 2 * 8;
-   case PIPE_FORMAT_R64G64B64_FLOAT:
-      return 3 * 8;
-   case PIPE_FORMAT_R64G64B64A64_FLOAT:
-      return 4 * 8;
-
-   case PIPE_FORMAT_R32_SSCALED:
-   case PIPE_FORMAT_R32_SNORM:
-   case PIPE_FORMAT_R32_USCALED:
-   case PIPE_FORMAT_R32_UNORM:
-   case PIPE_FORMAT_R32_FLOAT:
-      return 4;
-   case PIPE_FORMAT_R32G32_SSCALED:
-   case PIPE_FORMAT_R32G32_SNORM:
-   case PIPE_FORMAT_R32G32_USCALED:
-   case PIPE_FORMAT_R32G32_UNORM:
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return 2 * 4;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-   case PIPE_FORMAT_R32G32B32_SNORM:
-   case PIPE_FORMAT_R32G32B32_USCALED:
-   case PIPE_FORMAT_R32G32B32_UNORM:
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return 3 * 4;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-   case PIPE_FORMAT_R32G32B32A32_SNORM:
-   case PIPE_FORMAT_R32G32B32A32_USCALED:
-   case PIPE_FORMAT_R32G32B32A32_UNORM:
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return 4 * 4;
-
-   case PIPE_FORMAT_R16_SSCALED:
-   case PIPE_FORMAT_R16_SNORM:
-   case PIPE_FORMAT_R16_UNORM:
-   case PIPE_FORMAT_R16_USCALED:
-      return 2;
-   case PIPE_FORMAT_R16G16_SSCALED:
-   case PIPE_FORMAT_R16G16_SNORM:
-   case PIPE_FORMAT_R16G16_USCALED:
-   case PIPE_FORMAT_R16G16_UNORM:
-      return 2 * 2;
-   case PIPE_FORMAT_R16G16B16_SSCALED:
-   case PIPE_FORMAT_R16G16B16_SNORM:
-   case PIPE_FORMAT_R16G16B16_USCALED:
-   case PIPE_FORMAT_R16G16B16_UNORM:
-      return 3 * 2;
-   case PIPE_FORMAT_R16G16B16A16_SSCALED:
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-   case PIPE_FORMAT_R16G16B16A16_USCALED:
-   case PIPE_FORMAT_R16G16B16A16_UNORM:
-      return 4 * 2;
-
-   case PIPE_FORMAT_R8_SSCALED:
-   case PIPE_FORMAT_R8_SNORM:
-   case PIPE_FORMAT_R8_USCALED:
-   case PIPE_FORMAT_R8_UNORM:
-      return 1;
-   case PIPE_FORMAT_R8G8_SSCALED:
-   case PIPE_FORMAT_R8G8_SNORM:
-   case PIPE_FORMAT_R8G8_USCALED:
-   case PIPE_FORMAT_R8G8_UNORM:
-      return 2 * 1;
-   case PIPE_FORMAT_R8G8B8_SSCALED:
-   case PIPE_FORMAT_R8G8B8_SNORM:
-   case PIPE_FORMAT_R8G8B8_USCALED:
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      return 3 * 1;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-   case PIPE_FORMAT_R8G8B8A8_SSCALED:
-   case PIPE_FORMAT_R8G8B8A8_SNORM:
-   case PIPE_FORMAT_R8G8B8A8_USCALED:
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return 4 * 1;
-
-   case 0:
-      return 0;		/* not sure why this is needed */
-
-   default:
-      assert(0);
-      return 0;
-   }
-}
-
-
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
@@ -612,10 +161,10 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    /* loop over vertex attributes (vertex shader inputs)
     */
    for (attr = 0; attr < nr_attrs; attr++) {
-      const qword default_values = (qword)(vec_float4){ 0.0, 0.0, 0.0, 1.0 };
       const unsigned pitch = draw->vertex_fetch.pitch[attr];
       const uint64_t src = draw->vertex_fetch.src_ptr[attr];
-      const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      const spu_fetch_func fetch = (spu_fetch_func)
+	  (draw->vertex_fetch.code + draw->vertex_fetch.code_offset[attr]);
       unsigned i;
       unsigned idx;
       const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
@@ -644,8 +193,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
       /* Convert all 4 vertices to vectors of float.
        */
-      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, default_values,
-               fetch_shuffle_data);
+      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, fetch_shuffle_data);
    }
 }
 
@@ -662,12 +210,5 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    }
 
 
-   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
-      draw->vertex_fetch.fetch[i] =
-          get_fetch_func(draw->vertex_fetch.format[i]);
-      draw->vertex_fetch.size[i] =
-          get_vertex_size(draw->vertex_fetch.format[i]);
-   }
-
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
index b5bf31e67d..0fb0bc28d0 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.h
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
@@ -6,8 +6,6 @@
 
 struct spu_vs_context;
 
-typedef void (*spu_fetch_func)(qword *out, const qword *in, qword defaults,
-			       const qword *shuffle_data);
 typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
 				     struct spu_exec_machine *machine,
 				     const unsigned *elts,
@@ -20,12 +18,12 @@ struct spu_vs_context {
       uint64_t src_ptr[PIPE_ATTRIB_MAX];
       unsigned pitch[PIPE_ATTRIB_MAX];
       unsigned size[PIPE_ATTRIB_MAX];
-      enum pipe_format format[PIPE_ATTRIB_MAX];
+      unsigned code_offset[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
       boolean dirty;
 
-      spu_fetch_func fetch[PIPE_ATTRIB_MAX];
       spu_full_fetch_func fetch_func;
+      void *code;
    } vertex_fetch;
    
    /* Clip derived state:
-- 
cgit v1.2.3


From 4362c6e59d575a039e654e1520bbff89b73fc8f2 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 18 Feb 2008 18:51:57 -0800
Subject: Cell: trivial clean-ups

---
 src/gallium/drivers/cell/spu/spu_exec.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 109540b1f7..0eb5ea1a3f 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -146,17 +146,13 @@ spu_exec_machine_init(struct spu_exec_machine *mach,
                       struct spu_sampler *samplers,
                       unsigned processor)
 {
-   qword zero;
-   qword not_zero;
-   uint i;
+   const qword zero = si_il(0);
+   const qword not_zero = si_il(~0);
 
    mach->Samplers = samplers;
    mach->Processor = processor;
    mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
 
-   zero = si_xor(zero, zero);
-   not_zero = si_xori(zero, 0xff);
-
    /* Setup constants. */
    mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
    mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
-- 
cgit v1.2.3


From 66be2810c3be07dd1ee45a60cfc632725837f2cd Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 18 Feb 2008 18:55:39 -0800
Subject: Cell: emit vertex shaders and uniforms more intelligently

---
 src/gallium/drivers/cell/common.h                 | 22 ++++----
 src/gallium/drivers/cell/ppu/cell_state_emit.c    | 18 ++++++
 src/gallium/drivers/cell/ppu/cell_vertex_shader.c | 17 +++---
 src/gallium/drivers/cell/spu/spu_main.c           |  9 +++
 src/gallium/drivers/cell/spu/spu_vertex_fetch.c   |  6 +-
 src/gallium/drivers/cell/spu/spu_vertex_shader.c  | 68 ++++++++++-------------
 src/gallium/drivers/cell/spu/spu_vertex_shader.h  |  5 ++
 7 files changed, 85 insertions(+), 60 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 74b131fbef..cf892206c6 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -87,10 +87,12 @@
 #define CELL_CMD_STATE_TEXTURE       13
 #define CELL_CMD_STATE_VERTEX_INFO   14
 #define CELL_CMD_STATE_VIEWPORT      15
-#define CELL_CMD_STATE_VS_ARRAY_INFO 16
-#define CELL_CMD_STATE_BLEND         17
-#define CELL_CMD_VS_EXECUTE          18
-#define CELL_CMD_STATE_ATTRIB_FETCH  19
+#define CELL_CMD_STATE_UNIFORMS      16
+#define CELL_CMD_STATE_VS_ARRAY_INFO 17
+#define CELL_CMD_STATE_BIND_VS       18
+#define CELL_CMD_STATE_BLEND         19
+#define CELL_CMD_STATE_ATTRIB_FETCH  20
+#define CELL_CMD_VS_EXECUTE          21
 
 
 #define CELL_NUM_BUFFERS 4
@@ -144,14 +146,13 @@ struct cell_attribute_fetch_code {
 
 struct cell_shader_info
 {
-   unsigned num_outputs;
-
    uint64_t declarations;
-   unsigned num_declarations;
    uint64_t instructions;
-   unsigned num_instructions;
-   uint64_t uniforms;
    uint64_t  immediates;
+
+   unsigned num_outputs;
+   unsigned num_declarations;
+   unsigned num_instructions;
    unsigned num_immediates;
 } ALIGN16_ATTRIB;
 
@@ -160,10 +161,9 @@ struct cell_shader_info
 struct cell_command_vs
 {
    uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
-   struct cell_shader_info   shader;
+   uint64_t vOut[SPU_VERTS_PER_BATCH];
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
-   uint64_t vOut[SPU_VERTS_PER_BATCH];
    float plane[12][4];
    unsigned nr_planes;
    unsigned nr_attrs;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 5d2a786449..49c0d130c5 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -31,6 +31,8 @@
 #include "cell_state_emit.h"
 #include "cell_batch.h"
 #include "cell_texture.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
 
 
 static void
@@ -100,4 +102,20 @@ cell_emit_state(struct cell_context *cell)
       emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
                      &cell->vertex_info, sizeof(struct vertex_info));
    }
+   
+   if (cell->dirty & CELL_NEW_VS) {
+      const struct draw_context *const draw = cell->draw;
+      struct cell_shader_info info;
+
+      info.num_outputs = draw->num_vs_outputs;
+      info.declarations = (uintptr_t) draw->machine.Declarations;
+      info.num_declarations = draw->machine.NumDeclarations;
+      info.instructions = (uintptr_t) draw->machine.Instructions;
+      info.num_instructions = draw->machine.NumInstructions;
+      info.immediates = (uintptr_t) draw->machine.Imms;
+      info.num_immediates = draw->machine.ImmLimit / 4;
+
+      emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS,
+		     & info, sizeof(info));
+   }
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
index 6a1d3bc20a..64c7821c19 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -54,6 +54,7 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    struct cell_command_vs *const vs = &cell_global.command[0].vs;
    uint64_t *batch;
    struct cell_array_info *array_info;
+   struct cell_shader_info *shader_info;
    unsigned i, j;
    struct cell_attribute_fetch_code *cf;
 
@@ -100,17 +101,17 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    (void) memcpy(&batch[1], &draw->viewport,
                  sizeof(struct pipe_viewport_state));
 
+   {
+      uint64_t uniforms = (uintptr_t) draw->user.constants;
+
+      batch = cell_batch_alloc(cell, 2 *sizeof(batch[0]));
+      batch[0] = CELL_CMD_STATE_UNIFORMS;
+      batch[1] = uniforms;
+   }
+
    cell_batch_flush(cell);
 
    vs->opcode = CELL_CMD_VS_EXECUTE;
-   vs->shader.num_outputs = draw->num_vs_outputs;
-   vs->shader.declarations = (uintptr_t) draw->machine.Declarations;
-   vs->shader.num_declarations = draw->machine.NumDeclarations;
-   vs->shader.instructions = (uintptr_t) draw->machine.Instructions;
-   vs->shader.num_instructions = draw->machine.NumInstructions;
-   vs->shader.uniforms = (uintptr_t) draw->user.constants;
-   vs->shader.immediates = (uintptr_t) draw->machine.Imms;
-   vs->shader.num_immediates = draw->machine.ImmLimit / 4;
    vs->nr_attrs = draw->vertex_fetch.nr_attrs;
 
    (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane));
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index fcbf0f841e..dbc3705c24 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -433,10 +433,19 @@ cmd_batch(uint opcode)
                        sizeof(struct pipe_viewport_state));
          pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
          break;
+      case CELL_CMD_STATE_UNIFORMS:
+         draw.constants = (float (*)[4]) (uintptr_t) buffer[pos + 1];
+         pos += 2;
+         break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
+      case CELL_CMD_STATE_BIND_VS:
+         spu_bind_vertex_shader(&draw,
+                                (struct cell_shader_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         break;
       case CELL_CMD_STATE_ATTRIB_FETCH: {
          struct cell_attribute_fetch_code *code =
              (struct cell_attribute_fetch_code *) &buffer[pos+1];
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 55c6c28717..e5d9910ff3 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -64,7 +64,7 @@ typedef void (*spu_fetch_func)(qword *out, const qword *in,
 			       const qword *shuffle_data);
 
 
-static const qword fetch_shuffle_data[] = {
+static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = {
    /* Shuffle used by CVT_64_FLOAT
     */
    {
@@ -108,7 +108,7 @@ static const qword fetch_shuffle_data[] = {
 static INLINE void
 fetch_unaligned(qword *dst, unsigned ea, unsigned size)
 {
-   qword tmp[4];
+   qword tmp[4] ALIGN16_ATTRIB;
    const int shift = ea & 0x0f;
    const unsigned aligned_start_ea = ea & ~0x0f;
    const unsigned aligned_end_ea = (ea + size) & ~0x0f;
@@ -169,7 +169,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       unsigned idx;
       const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
       const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
-      qword in[2 * 4];
+      qword in[2 * 4] ALIGN16_ATTRIB;
 
 
       /* Fetch four attributes for four vertices.  
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index 3f5bf41aa2..8363efeeb6 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -165,63 +165,55 @@ run_vertex_program(struct spu_vs_context *draw,
 }
 
 
-static void
-spu_bind_vertex_shader(struct spu_vs_context *draw,
-		       void *uniforms,
-		       void *planes,
-		       unsigned nr_planes,
-		       unsigned num_outputs
-		       )
-{
-   draw->constants = (float (*)[4]) uniforms;
-
-   (void) memcpy(draw->plane, planes, sizeof(float) * 4 * nr_planes);
-   draw->nr_planes = nr_planes;
-   draw->num_vs_outputs = num_outputs;
-
-   /* specify the shader to interpret/execute */
-   spu_exec_machine_init(&draw->machine,
-			 PIPE_MAX_SAMPLERS,
-			 NULL /*samplers*/,
-			 PIPE_SHADER_VERTEX);
-}
-
-
 unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
     ALIGN16_ATTRIB;
 
+
 void
-spu_execute_vertex_shader(struct spu_vs_context *draw,
-                          const struct cell_command_vs *vs)
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       struct cell_shader_info *vs)
 {
-   unsigned i;
-
-   const uint64_t immediate_addr = vs->shader.immediates;
+   const unsigned immediate_addr = vs->immediates;
    const unsigned immediate_size = 
-       ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates)
-                 + (immediate_addr & 0x0f));
+       ROUNDUP16((sizeof(float) * 4 * vs->num_immediates)
+		 + (immediate_addr & 0x0f));
+ 
 
    mfc_get(immediates, immediate_addr & ~0x0f, immediate_size,
            TAG_VERTEX_BUFFER, 0, 0);
 
    draw->machine.Instructions = (struct tgsi_full_instruction *)
-       vs->shader.instructions;
-   draw->machine.NumInstructions = vs->shader.num_instructions;
+       vs->instructions;
+   draw->machine.NumInstructions = vs->num_instructions;
 
    draw->machine.Declarations = (struct tgsi_full_declaration *)
-       vs->shader.declarations;
-   draw->machine.NumDeclarations = vs->shader.num_declarations;
+       vs->declarations;
+   draw->machine.NumDeclarations = vs->num_declarations;
 
-   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
+   draw->num_vs_outputs = vs->num_outputs;
+
+   /* specify the shader to interpret/execute */
+   spu_exec_machine_init(&draw->machine,
+			 PIPE_MAX_SAMPLERS,
+			 NULL /*samplers*/,
+			 PIPE_SHADER_VERTEX);
 
    wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
    (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f],
-                 sizeof(float) * 4 * vs->shader.num_immediates);
+                 sizeof(float) * 4 * vs->num_immediates);
+}
 
-   spu_bind_vertex_shader(draw, vs->shader.uniforms,
-                          vs->plane, vs->nr_planes,
-                          vs->shader.num_outputs);
+
+void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+                          const struct cell_command_vs *vs)
+{
+   unsigned i;
+
+   (void) memcpy(draw->plane, vs->plane, sizeof(float) * 4 * vs->nr_planes);
+   draw->nr_planes = vs->nr_planes;
+   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
 
    for (i = 0; i < vs->num_elts; i += 4) {
       const unsigned batch_size = MIN2(vs->num_elts - i, 4);
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
index 0fb0bc28d0..54a4b8d9b9 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.h
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
@@ -1,6 +1,7 @@
 #ifndef SPU_VERTEX_SHADER_H
 #define SPU_VERTEX_SHADER_H
 
+#include "cell/common.h"
 #include "pipe/p_format.h"
 #include "spu_exec.h"
 
@@ -54,6 +55,10 @@ static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 
 struct cell_command_vs;
 
+extern void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       struct cell_shader_info *vs);
+
 extern void
 spu_execute_vertex_shader(struct spu_vs_context *draw,
 			  const struct cell_command_vs *vs);
-- 
cgit v1.2.3


From 7c74037852a484a8a50e8bc540b954a624de4d33 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 20 Feb 2008 14:32:25 -0800
Subject: Cell: Initial pass at unified data cache

---
 src/gallium/drivers/cell/common.h               |   8 ++
 src/gallium/drivers/cell/ppu/cell_draw_arrays.c |   6 +-
 src/gallium/drivers/cell/ppu/cell_flush.c       |  14 ++++
 src/gallium/drivers/cell/spu/spu_dcache.c       | 100 ++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_dcache.h       |  34 ++++++++
 src/gallium/drivers/cell/spu/spu_exec.c         |  49 ++++++------
 src/gallium/drivers/cell/spu/spu_main.c         |   8 ++
 src/gallium/drivers/cell/spu/spu_vertex_fetch.c |  70 +----------------
 8 files changed, 194 insertions(+), 95 deletions(-)
 create mode 100644 src/gallium/drivers/cell/spu/spu_dcache.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_dcache.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index cf892206c6..f32ad5bfbe 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -93,6 +93,7 @@
 #define CELL_CMD_STATE_BLEND         19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
 #define CELL_CMD_VS_EXECUTE          21
+#define CELL_CMD_FLUSH_BUFFER_RANGE  22
 
 
 #define CELL_NUM_BUFFERS 4
@@ -144,6 +145,13 @@ struct cell_attribute_fetch_code {
    uint size;
 };
 
+
+struct cell_buffer_range {
+   uint64_t base;
+   unsigned size;
+};
+
+
 struct cell_shader_info
 {
    uint64_t declarations;
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index f12613649b..cbd387f014 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -49,9 +49,12 @@ cell_map_constant_buffers(struct cell_context *sp)
    struct pipe_winsys *ws = sp->pipe.winsys;
    uint i;
    for (i = 0; i < 2; i++) {
-      if (sp->constants[i].size)
+      if (sp->constants[i].size) {
          sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer,
                                                   PIPE_BUFFER_USAGE_CPU_READ);
+         cell_flush_buffer_range(sp, sp->mapped_constants[i], 
+                                 sp->constants[i].buffer->size);
+      }
    }
 
    draw_set_mapped_constant_buffer(sp->draw,
@@ -124,6 +127,7 @@ cell_draw_elements(struct pipe_context *pipe,
          void *buf = pipe->winsys->buffer_map(pipe->winsys,
                                               sp->vertex_buffer[i].buffer,
                                               PIPE_BUFFER_USAGE_CPU_READ);
+	 cell_flush_buffer_range(sp, buf, sp->vertex_buffer[i].buffer->size);
          draw_set_mapped_vertex_buffer(draw, i, buf);
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 20f27531fc..66a5627d84 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -82,3 +82,17 @@ cell_flush_int(struct pipe_context *pipe, unsigned flags)
 
    flushing = FALSE;
 }
+
+
+void
+cell_flush_buffer_range(struct cell_context *cell, void *ptr,
+			unsigned size)
+{
+   uint64_t batch[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)];
+   struct cell_buffer_range *br = (struct cell_buffer_range *) & batch[1];
+
+   batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE;
+   br->base = (uintptr_t) ptr;
+   br->size = size;
+   cell_batch_append(cell, batch, sizeof(batch));
+}
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
new file mode 100644
index 0000000000..9e30e17880
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -0,0 +1,100 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "spu_main.h"
+#include "spu_dcache.h"
+
+#define CACHE_NAME            data
+#define CACHED_TYPE           qword
+#define CACHE_TYPE            CACHE_TYPE_RO
+#define CACHE_SET_TAGID(set)  TAG_VERTEX_BUFFER
+#define CACHE_LOG2NNWAY       2
+#define CACHE_LOG2NSETS       6
+#include <cache-api.h>
+
+/* Yes folks, this is ugly.
+ */
+#undef CACHE_NWAY
+#undef CACHE_NSETS
+#define CACHE_NAME            data
+#define CACHE_NWAY            4
+#define CACHE_NSETS           (1U << 6)
+
+
+/**
+ * Fetch between arbitrary number of bytes from an unaligned address
+ */
+void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size)
+{
+   const int shift = ea & 0x0f;
+   const unsigned aligned_start_ea = ea & ~0x0f;
+   const unsigned aligned_end_ea = (ea + size) & ~0x0f;
+   const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1;
+   unsigned i;
+
+
+   if (shift == 0) {
+      /* Data is already aligned.  Fetch directly into the destination buffer.
+       */
+      for (i = 0; i < num_entries; i++) {
+         dst[i] = cache_rd(data, (ea & ~0x0f) + (i * 16));
+      }
+   } else {
+      qword tmp[2] ALIGN16_ATTRIB;
+
+
+      tmp[0] = cache_rd(data, (ea & ~0x0f));
+      for (i = 0; i < (num_entries & ~1); i++) {
+         const unsigned curr = i & 1;
+         const unsigned next = curr ^ 1;
+
+         tmp[next] = cache_rd(data, (ea & ~0x0f) + (next * 16));
+
+         dst[i] = si_or((qword) spu_slqwbyte(tmp[curr], shift),
+                        (qword) spu_rlmaskqwbyte(tmp[next], shift - 16));
+      }
+
+      if (i < num_entries) {
+         dst[i] = si_or((qword) spu_slqwbyte(tmp[(i & 1)], shift),
+                        si_il(0));
+      }
+   }
+}
+
+
+void
+spu_dcache_mark_dirty(unsigned ea, unsigned size)
+{
+   unsigned i;
+
+   (void) ea;
+   (void) size;
+
+   /* Invalidate the whole cache for now.
+    */
+   for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) {
+      CACHELINE_CLEARVALID(i);
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h
new file mode 100644
index 0000000000..7a06b8c25a
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.h
@@ -0,0 +1,34 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SPU_DCACHE_H
+#define SPU_DCACHE_H
+
+extern void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size);
+
+extern void
+spu_dcache_mark_dirty(unsigned ea, unsigned size);
+
+#endif /* SPU_DCACHE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 0eb5ea1a3f..94ac6a2885 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -72,6 +72,7 @@
 #include "spu_exec.h"
 #include "spu_main.h"
 #include "spu_vertex_shader.h"
+#include "spu_dcache.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -352,19 +353,17 @@ fetch_src_file_channel(
    case TGSI_EXTSWIZZLE_W:
       switch( file ) {
       case TGSI_FILE_CONSTANT: {
-         unsigned char buffer[32] ALIGN16_ATTRIB;
          unsigned i;
 
          for (i = 0; i < 4; i++) {
             const float *ptr = mach->Consts[index->i[i]];
-            const uint64_t addr = (uint64_t)(uintptr_t) ptr;
-            const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
+            float tmp[4];
 
-            mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
-            wait_on_mask(1 << TAG_VERTEX_BUFFER);
+            spu_dcache_fetch_unaligned((qword *) tmp,
+                                       (uintptr_t)(ptr + swizzle),
+                                       sizeof(float));
 
-            (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) 
-                + (sizeof(float) * swizzle)], sizeof(float));
+            chan->f[i] = tmp[0];
          }
          break;
       }
@@ -1899,32 +1898,30 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    /* execute declarations (interpolants) */
    if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
       for (i = 0; i < mach->NumDeclarations; i++) {
-	 uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
-	 struct tgsi_full_declaration decl;
-	 unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
-	 unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+         union {
+            struct tgsi_full_declaration decl;
+            qword buffer[2 * ((sizeof(struct tgsi_full_declaration) + 31) 
+                              / 32)];
+         } d ALIGN16_ATTRIB;
+         unsigned ea = (unsigned) (mach->Declarations + pc);
 
-	 mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
-	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+         spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
 
-	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
-	 exec_declaration( mach, &decl );
+         exec_declaration( mach, &d.decl );
       }
    }
 
    /* execute instructions, until pc is set to -1 */
    while (pc != -1) {
-      uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB;
-      struct tgsi_full_instruction inst;
-      unsigned long inst_addr = (unsigned long) (mach->Instructions + pc);
-      unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f);
-
-      assert(pc < mach->NumInstructions);
-      mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
-      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
-
-      memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
-      exec_instruction( mach, & inst, &pc );
+      union {
+         struct tgsi_full_instruction inst;
+         qword buffer[2 * ((sizeof(struct tgsi_full_instruction) + 31) 
+                           / 32)];
+      } i ALIGN16_ATTRIB;
+      unsigned ea = (unsigned) (mach->Instructions + pc);
+
+      spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst));
+      exec_instruction( mach, & i.inst, &pc );
    }
 
 #if 0
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index dbc3705c24..1136dba62d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -462,6 +462,14 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
       }
+      case CELL_CMD_FLUSH_BUFFER_RANGE: {
+	 struct cell_buffer_range *br = (struct cell_buffer_range *)
+	     &buffer[pos+1];
+
+	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+	 break;
+      }
       default:
          printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index e5d9910ff3..f7e4e653e3 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -40,25 +40,7 @@
 #include "spu_exec.h"
 #include "spu_vertex_shader.h"
 #include "spu_main.h"
-
-#define CACHE_NAME            attribute
-#define CACHED_TYPE           qword
-#define CACHE_TYPE            CACHE_TYPE_RO
-#define CACHE_SET_TAGID(set)  TAG_VERTEX_BUFFER
-#define CACHE_LOG2NNWAY       2
-#define CACHE_LOG2NSETS       6
-#include <cache-api.h>
-
-/* Yes folks, this is ugly.
- */
-#undef CACHE_NWAY
-#undef CACHE_NSETS
-#define CACHE_NAME            attribute
-#define CACHE_NWAY            4
-#define CACHE_NSETS           (1U << 6)
-
-
-#define DRAW_DBG 0
+#include "spu_dcache.h"
 
 typedef void (*spu_fetch_func)(qword *out, const qword *in,
 			       const qword *shuffle_data);
@@ -102,44 +84,6 @@ static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = {
 };
 
 
-/**
- * Fetch between 1 and 32 bytes from an unaligned address
- */
-static INLINE void
-fetch_unaligned(qword *dst, unsigned ea, unsigned size)
-{
-   qword tmp[4] ALIGN16_ATTRIB;
-   const int shift = ea & 0x0f;
-   const unsigned aligned_start_ea = ea & ~0x0f;
-   const unsigned aligned_end_ea = (ea + size) & ~0x0f;
-   const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1;
-   unsigned i;
-
-
-   if (shift == 0) {
-      /* Data is already aligned.  Fetch directly into the destination buffer.
-       */
-      for (i = 0; i < num_entries; i++) {
-	 dst[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
-      }
-   } else {
-      /* Fetch data from the cache to the local buffer.
-       */
-      for (i = 0; i < num_entries; i++) {
-	 tmp[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
-      }
-
-
-      /* Fix the alignment of the data and write to the destination buffer.
-       */
-      for (i = 0; i < ((size + 15) / 16); i++) {
-	 dst[i] = si_or((qword) spu_slqwbyte(tmp[i], shift),
-			(qword) spu_rlmaskqwbyte(tmp[i + 1], shift - 16));
-      }
-   }
-}
-
-
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
@@ -182,7 +126,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          printf("SPU: fetching = 0x%llx\n", addr);
 #endif
 
-         fetch_unaligned(& in[idx], addr, bytes_per_entry);
+         spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry);
          idx += quads_per_entry;
       }
 
@@ -200,15 +144,5 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
 void spu_update_vertex_fetch( struct spu_vs_context *draw )
 {
-   unsigned i;
-
-   
-   /* Invalidate the vertex cache.
-    */
-   for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) {
-      CACHELINE_CLEARVALID(i);
-   }
-
-
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
 }
-- 
cgit v1.2.3


From 2d1f086c12b6d64f5c3fb80474f26775aeb71370 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 20 Feb 2008 14:45:08 -0800
Subject: Cell: Fix off-by-one error in spu_dcache_fetch_unaligned

An off-by-one error caused an extra qword to be fetched under certain
alignment / size combinations.
---
 src/gallium/drivers/cell/spu/spu_dcache.c | 5 +++--
 src/gallium/drivers/cell/spu/spu_exec.c   | 7 +++----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 9e30e17880..68aa5c4ae8 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -22,6 +22,7 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include "cell/common.h"
 #include "spu_main.h"
 #include "spu_dcache.h"
 
@@ -50,8 +51,8 @@ spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size)
 {
    const int shift = ea & 0x0f;
    const unsigned aligned_start_ea = ea & ~0x0f;
-   const unsigned aligned_end_ea = (ea + size) & ~0x0f;
-   const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1;
+   const unsigned aligned_end_ea = ROUNDUP16(ea + size);
+   const unsigned num_entries = (aligned_end_ea - aligned_start_ea) / 16;
    unsigned i;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 94ac6a2885..cf81bee8fd 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -73,6 +73,7 @@
 #include "spu_main.h"
 #include "spu_vertex_shader.h"
 #include "spu_dcache.h"
+#include "cell/common.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -1900,8 +1901,7 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
       for (i = 0; i < mach->NumDeclarations; i++) {
          union {
             struct tgsi_full_declaration decl;
-            qword buffer[2 * ((sizeof(struct tgsi_full_declaration) + 31) 
-                              / 32)];
+            qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
          } d ALIGN16_ATTRIB;
          unsigned ea = (unsigned) (mach->Declarations + pc);
 
@@ -1915,8 +1915,7 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    while (pc != -1) {
       union {
          struct tgsi_full_instruction inst;
-         qword buffer[2 * ((sizeof(struct tgsi_full_instruction) + 31) 
-                           / 32)];
+         qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
       } i ALIGN16_ATTRIB;
       unsigned ea = (unsigned) (mach->Instructions + pc);
 
-- 
cgit v1.2.3


From e78fc9f2f4d89b0cae0d56d84dd16cb76a6757dc Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 21 Feb 2008 09:03:29 -0800
Subject: Cell: Initial scalar implementation of spu_dcache_mark_dirty

---
 src/gallium/drivers/cell/spu/spu_dcache.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 68aa5c4ae8..698a5790bb 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -26,6 +26,10 @@
 #include "spu_main.h"
 #include "spu_dcache.h"
 
+#define CACHELINE_LOG2SIZE    7
+#define LINE_SIZE             (1U << 7)
+#define ALIGN_MASK            (~(LINE_SIZE - 1))
+
 #define CACHE_NAME            data
 #define CACHED_TYPE           qword
 #define CACHE_TYPE            CACHE_TYPE_RO
@@ -60,7 +64,7 @@ spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size)
       /* Data is already aligned.  Fetch directly into the destination buffer.
        */
       for (i = 0; i < num_entries; i++) {
-         dst[i] = cache_rd(data, (ea & ~0x0f) + (i * 16));
+         dst[i] = cache_rd(data, ea + (i * 16));
       }
    } else {
       qword tmp[2] ALIGN16_ATTRIB;
@@ -85,17 +89,23 @@ spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size)
 }
 
 
+/**
+ * Notify the cache that a range of main memory may have been modified
+ */
 void
 spu_dcache_mark_dirty(unsigned ea, unsigned size)
 {
    unsigned i;
+   const unsigned aligned_start = (ea & ALIGN_MASK);
+   const unsigned aligned_end = (ea + size + (LINE_SIZE - 1)) 
+       & ALIGN_MASK;
 
-   (void) ea;
-   (void) size;
 
-   /* Invalidate the whole cache for now.
-    */
    for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) {
-      CACHELINE_CLEARVALID(i);
+      const unsigned entry = __cache_dir[i];
+      const unsigned addr = entry & ~0x0f;
+
+      __cache_dir[i] = ((addr >= aligned_start) && (addr < aligned_end))
+          ? (entry & ~CACHELINE_VALID) : entry;
    }
 }
-- 
cgit v1.2.3


From 6dd47c264a8642a4e3dbe0b4fc194174743c64fc Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 21 Feb 2008 10:24:29 -0800
Subject: Cell: Add spu_dcache.c to Makefile.

This was erroneously missing in previous commits.
---
 src/gallium/drivers/cell/spu/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 30ef2450ec..c071de1900 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -18,6 +18,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 SOURCES = \
 	spu_main.c \
 	spu_blend.c \
+	spu_dcache.c \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
-- 
cgit v1.2.3


From 7976a084e792daf0b23c688bfa8f577de141ecca Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 21 Feb 2008 11:01:35 -0800
Subject: Cell: Use multiple DMA tags for the dcache.

---
 src/gallium/drivers/cell/spu/spu_dcache.c | 2 +-
 src/gallium/drivers/cell/spu/spu_main.h   | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 698a5790bb..3baeaea998 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -33,7 +33,7 @@
 #define CACHE_NAME            data
 #define CACHED_TYPE           qword
 #define CACHE_TYPE            CACHE_TYPE_RO
-#define CACHE_SET_TAGID(set)  TAG_VERTEX_BUFFER
+#define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
 #define CACHE_LOG2NNWAY       2
 #define CACHE_LOG2NSETS       6
 #include <cache-api.h>
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 5c95d112ac..d14f1abbe7 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -131,7 +131,10 @@ extern boolean Debug;
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
 #define TAG_TEXTURE_TILE      19
-#define TAG_INSTRUCTION_FETCH 20
+#define TAG_DCACHE0           20
+#define TAG_DCACHE1           21
+#define TAG_DCACHE2           22
+#define TAG_DCACHE3           23
 
 
-- 
cgit v1.2.3


From fb68daceec312a90910dad3882e6ef57c370b7fd Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 21 Feb 2008 16:41:12 -0800
Subject: Cell: Remove unnecessary include files

---
 src/gallium/drivers/cell/spu/spu_exec.c         | 2 --
 src/gallium/drivers/cell/spu/spu_vertex_fetch.c | 2 --
 2 files changed, 4 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index cf81bee8fd..fff0114a23 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -50,8 +50,6 @@
  *   Brian Paul
  */
 
-#include <libmisc.h>
-#include <spu_mfcio.h>
 #include <transpose_matrix4x4.h>
 #include <simdmath/ceilf4.h>
 #include <simdmath/cosf4.h>
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index f7e4e653e3..219fd90cc0 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -32,8 +32,6 @@
   *   Ian Romanick <idr@us.ibm.com>
   */
 
-#include <spu_mfcio.h>
-
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
-- 
cgit v1.2.3


From a63fd641a01a50e1be51664bf863e01ddaf61d3e Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 22 Feb 2008 16:27:39 -0800
Subject: cell: Trivial compiler warning clean-ups.

---
 src/gallium/drivers/cell/ppu/cell_draw_arrays.c   | 1 +
 src/gallium/drivers/cell/ppu/cell_vertex_shader.c | 1 -
 src/gallium/drivers/cell/spu/spu_exec.c           | 6 ++++--
 src/gallium/drivers/cell/spu/spu_main.c           | 5 +++--
 4 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index cbd387f014..c839fb4d12 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -38,6 +38,7 @@
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
 #include "cell_state.h"
+#include "cell_flush.h"
 
 #include "draw/draw_context.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
index 42cc47cbfe..17141924be 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -55,7 +55,6 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    struct cell_command_vs *const vs = &cell_global.command[0].vs;
    uint64_t *batch;
    struct cell_array_info *array_info;
-   struct cell_shader_info *shader_info;
    unsigned i, j;
    struct cell_attribute_fetch_code *cf;
 
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index fff0114a23..1560c0f157 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -149,6 +149,7 @@ spu_exec_machine_init(struct spu_exec_machine *mach,
    const qword zero = si_il(0);
    const qword not_zero = si_il(~0);
 
+   (void) numSamplers;
    mach->Samplers = samplers;
    mach->Processor = processor;
    mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
@@ -657,9 +658,10 @@ fetch_texel( struct spu_sampler *sampler,
    qword rgba[4];
    qword out[4];
 
-   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, 
+			(float (*)[4]) rgba);
 
-   _transpose_matrix4x4(out, rgba);
+   _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba);
    r->q = out[0];
    g->q = out[1];
    b->q = out[2];
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 1136dba62d..cc4bafdb3a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -38,6 +38,7 @@
 #include "spu_tile.h"
 //#include "spu_test.h"
 #include "spu_vertex_shader.h"
+#include "spu_dcache.h"
 #include "cell/common.h"
 #include "pipe/p_defines.h"
 
@@ -434,7 +435,7 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
          break;
       case CELL_CMD_STATE_UNIFORMS:
-         draw.constants = (float (*)[4]) (uintptr_t) buffer[pos + 1];
+         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
          pos += 2;
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
@@ -583,7 +584,7 @@ main(main_param_t speid, main_param_t argp)
    one_time_init();
 
    if (Debug)
-      printf("SPU: main() speid=%lu\n", speid);
+      printf("SPU: main() speid=%lu\n", (unsigned long) speid);
 
    mfc_get(&spu.init,  /* dest */
            (unsigned int) argp, /* src */
-- 
cgit v1.2.3


From 2efa7e9489541b6a86c3d46e3d58cbf5bf399189 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 22 Feb 2008 17:51:55 -0800
Subject: cell: Fix off-by-one error in spu_dcache_fetch_unaligned

This time the off-by-one error caused an extra qword to be fetched
under certain circumstances when the source ea was not qword aligned.
---
 src/gallium/drivers/cell/spu/spu_dcache.c | 50 ++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 18 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 3baeaea998..a1701d80d1 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -49,43 +49,57 @@
 
 /**
  * Fetch between arbitrary number of bytes from an unaligned address
+ *
+ * \param dst   Destination data buffer
+ * \param ea    Main memory effective address of source data
+ * \param size  Number of bytes to read
+ *
+ * \warning
+ * As is hinted by the type of the \c dst pointer, this function writes
+ * multiples of 16-bytes.
  */
 void
 spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size)
 {
    const int shift = ea & 0x0f;
-   const unsigned aligned_start_ea = ea & ~0x0f;
-   const unsigned aligned_end_ea = ROUNDUP16(ea + size);
-   const unsigned num_entries = (aligned_end_ea - aligned_start_ea) / 16;
+   const unsigned read_size = ROUNDUP16(size + shift);
+   const unsigned last_read = ROUNDUP16(ea + size);
+   const qword *const last_write = dst + (ROUNDUP16(size) / 16);
    unsigned i;
 
 
    if (shift == 0) {
       /* Data is already aligned.  Fetch directly into the destination buffer.
        */
-      for (i = 0; i < num_entries; i++) {
-         dst[i] = cache_rd(data, ea + (i * 16));
+      for (i = 0; i < size; i += 16) {
+         *(dst++) = cache_rd(data, ea + i);
       }
    } else {
-      qword tmp[2] ALIGN16_ATTRIB;
-
+      qword hi;
 
-      tmp[0] = cache_rd(data, (ea & ~0x0f));
-      for (i = 0; i < (num_entries & ~1); i++) {
-         const unsigned curr = i & 1;
-         const unsigned next = curr ^ 1;
 
-         tmp[next] = cache_rd(data, (ea & ~0x0f) + (next * 16));
-
-         dst[i] = si_or((qword) spu_slqwbyte(tmp[curr], shift),
-                        (qword) spu_rlmaskqwbyte(tmp[next], shift - 16));
+      /* Please exercise extreme caution when modifying this code.  This code
+       * must not read past the end of the page containing the source data,
+       * and it must not write more than ((size + 15) / 16) qwords to the
+       * destination buffer.
+       */
+      ea &= ~0x0f;
+      hi = cache_rd(data, ea);
+      for (i = 16; i < read_size; i += 16) {
+         qword lo = cache_rd(data, ea + i);
+
+         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift),
+                          (qword) spu_rlmaskqwbyte(lo, shift - 16));
+         hi = lo;
       }
 
-      if (i < num_entries) {
-         dst[i] = si_or((qword) spu_slqwbyte(tmp[(i & 1)], shift),
-                        si_il(0));
+      if (dst != last_write) {
+         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), si_il(0));
       }
    }
+   
+   ASSERT((ea + i) == last_read);
+   ASSERT(dst == last_write);
 }
 
 
-- 
cgit v1.2.3


From 1c50ea2cd9ab8752793c99b4a7a2a6656bdde1ac Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 27 Feb 2008 13:40:23 -0800
Subject: cell: Use unified data cache for textures too

---
 src/gallium/drivers/cell/spu/spu_main.c    |   2 +
 src/gallium/drivers/cell/spu/spu_main.h    |   3 +-
 src/gallium/drivers/cell/spu/spu_texture.c | 184 +++++++++++------------------
 3 files changed, 72 insertions(+), 117 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index cc4bafdb3a..59300028d4 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -286,6 +286,8 @@ cmd_state_texture(const struct cell_command_texture *texture)
       { spu.texture.width, spu.texture.height, 0.0, 0.0};
    spu.tex_size_mask = (vector unsigned int)
       { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
+   spu.tex_size_x_mask = spu_splats(spu.texture.width - 1);
+   spu.tex_size_y_mask = spu_splats(spu.texture.height - 1);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index d14f1abbe7..a13edd1702 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,6 +107,8 @@ struct spu_global
 
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
 
    vector float (*sample_texture)(vector float texcoord);
 
@@ -130,7 +132,6 @@ extern boolean Debug;
 #define TAG_INDEX_BUFFER      16
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
-#define TAG_TEXTURE_TILE      19
 #define TAG_DCACHE0           20
 #define TAG_DCACHE1           21
 #define TAG_DCACHE2           22
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 3962aaa4a9..67eb08196a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -31,19 +31,7 @@
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_colorpack.h"
-
-
-/**
- * Number of texture tiles to cache.
- * Note that this will probably be the largest consumer of SPU local store/
- * memory for this driver!
- */
-#define CACHE_SIZE 16
-
-static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
-
-static vector unsigned int tex_tile_xy[CACHE_SIZE];
-
+#include "spu_dcache.h"
 
 
 /**
@@ -52,78 +40,60 @@ static vector unsigned int tex_tile_xy[CACHE_SIZE];
 void
 invalidate_tex_cache(void)
 {
-   /* XXX memset? */
-   uint i;
-   for (i = 0; i < CACHE_SIZE; i++) {
-      tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U });
-   }
+   spu_dcache_mark_dirty((unsigned) spu.texture.start,
+                         4 * spu.texture.width * spu.texture.height);
 }
 
 
-/**
- * Return the cache pos/index which corresponds to tile (tx,ty)
- */
-static INLINE uint
-cache_pos(vector unsigned int txty)
+static uint
+get_texel(vec_uint4 coordinate)
 {
-   uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE;
-   return pos;
+   vec_uint4 tmp;
+   unsigned x = spu_extract(coordinate, 0);
+   unsigned y = spu_extract(coordinate, 1);
+   const unsigned tiles_per_row = spu.texture.width / TILE_SIZE;
+   unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) 
+                                            + (x / TILE_SIZE));
+   unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE)
+                                + (x % TILE_SIZE));
+
+   spu_dcache_fetch_unaligned((qword *) & tmp,
+                              spu.texture.start + tile_offset + texel_offset,
+                              4);
+   return spu_extract(tmp, 0);
 }
 
 
-/**
- * Make sure the tile for texel (i,j) is present, return its position/index
- * in the cache.
- */
-static uint
-get_tex_tile(vector unsigned int ij)
+static void
+get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
-   /* tile address: tx,ty */
-   const vector unsigned int txty = spu_rlmask(ij, -5);  /* divide by 32 */
-   const uint pos = cache_pos(txty);
-
-   if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) ||
-       (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) {
-
-      /* texture cache miss, fetch tile from main memory */
-      const uint tiles_per_row = spu.texture.width / TILE_SIZE;
-      const uint bytes_per_tile = sizeof(tile_t);
-      const void *src = (const ubyte *) spu.texture.start
-         + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile;
-
-      printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
-             spu.init.id,
-             spu_extract(txty,0),
-             spu_extract(txty,1),
-             pos,
-             spu_extract(tex_tile_xy[pos],0),
-             spu_extract(tex_tile_xy[pos],1));
-
-      ASSERT_ALIGN16(tex_tiles[pos].ui);
-      ASSERT_ALIGN16(src);
-
-      mfc_get(tex_tiles[pos].ui,  /* dest */
-              (unsigned int) src,
-              bytes_per_tile,      /* size */
-              TAG_TEXTURE_TILE,
-              0, /* tid */
-              0  /* rid */);
-
-      wait_on_mask(1 << TAG_TEXTURE_TILE);
-
-      tex_tile_xy[pos] = txty;
-   }
-   else {
-#if 0
-      printf("SPU %u: tex cache HIT at %d, %d\n",
-             spu.init.id, tx, ty);
-#endif
-   }
-
-   return pos;
+   const unsigned texture_ea = (uintptr_t) spu.texture.start;
+   vec_uint4 tile_x = spu_rlmask(x, -5);
+   vec_uint4 tile_y = spu_rlmask(y, -5);
+   const qword offset_x = si_andi((qword) x, 0x1f);
+   const qword offset_y = si_andi((qword) y, 0x1f);
+
+   const qword tiles_per_row = (qword) spu_splats(spu.texture.width / TILE_SIZE);
+   const qword tile_size = (qword) spu_splats(sizeof(tile_t));
+
+   qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
+   tile_offset = si_mpy((qword) tile_offset, tile_size);
+
+   qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
+   texel_offset = si_mpyui(texel_offset, 4);
+   
+   vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
+   
+   spu_dcache_fetch_unaligned((qword *) & texels[0],
+                              texture_ea + spu_extract(offset, 0), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[1],
+                              texture_ea + spu_extract(offset, 1), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[2],
+                              texture_ea + spu_extract(offset, 2), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[3],
+                              texture_ea + spu_extract(offset, 3), 4);
 }
 
-
 /**
  * Get texture sample at texcoord.
  * XXX this is extremely primitive for now.
@@ -134,9 +104,7 @@ sample_texture_nearest(vector float texcoord)
    vector float tc = spu_mul(texcoord, spu.tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
    itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
-   uint pos = get_tex_tile(itc);
-   uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
+   uint texel = get_texel(itc);
    return spu_unpack_A8R8G8B8(texel);
 }
 
@@ -144,49 +112,33 @@ sample_texture_nearest(vector float texcoord)
 vector float
 sample_texture_bilinear(vector float texcoord)
 {
-   static const vector unsigned int offset10 = {1, 0, 0, 0};
-   static const vector unsigned int offset01 = {0, 1, 0, 0};
+   static const vec_uint4 offset_x = {0, 0, 1, 1};
+   static const vec_uint4 offset_y = {0, 1, 0, 1};
 
    vector float tc = spu_mul(texcoord, spu.tex_size);
    tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
 
    /* integer texcoords S,T: */
-   vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
-   vector unsigned int itc01 = spu_add(itc00, offset01);
-   vector unsigned int itc10 = spu_add(itc00, offset10);
-   vector unsigned int itc11 = spu_add(itc10, offset01);
-
-   /* mask (GL_REPEAT) */
-   itc00 = spu_and(itc00, spu.tex_size_mask);
-   itc01 = spu_and(itc01, spu.tex_size_mask);
-   itc10 = spu_and(itc10, spu.tex_size_mask);
-   itc11 = spu_and(itc11, spu.tex_size_mask);
-
-   /* intra tile addr */
-   vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
-   vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1);
-   vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
-   vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
-
-   /* get tile cache positions */
-   uint pos00 = get_tex_tile(itc00);
-   uint pos01, pos10, pos11;
-   if ((spu_extract(ij00, 0) < TILE_SIZE-1) &&
-       (spu_extract(ij00, 1) < TILE_SIZE-1)) {
-      /* all texels are in the same tile */
-      pos01 = pos10 = pos11 = pos00;
-   }
-   else {
-      pos01 = get_tex_tile(itc01);
-      pos10 = get_tex_tile(itc10);
-      pos11 = get_tex_tile(itc11);
-   }
-
-   /* get texels from tiles and convert to float[4] */
-   vector float texel00 = spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
-   vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
-   vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
-   vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
+
+   vec_uint4 texels[4];
+   
+   vec_uint4 x = spu_splats(spu_extract(itc, 0));
+   vec_uint4 y = spu_splats(spu_extract(itc, 1));
+
+   x = spu_add(x, offset_x);
+   y = spu_add(y, offset_y);
+
+   x = spu_and(x, spu.tex_size_x_mask);
+   y = spu_and(y, spu.tex_size_y_mask);
+
+   get_four_texels(x, y, texels);
+
+   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
+   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
+   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
+   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
+
 
    /* Compute weighting factors in [0,1]
     * Multiply texcoord by 1024, AND with 1023, convert back to float.
-- 
cgit v1.2.3


From 647213804582a6b6cfd4fbfeb1c9874ef53307f3 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 13 Mar 2008 11:19:50 -0700
Subject: Replicate TXP changes in the SPU version of TGSI exec

Replicate changes from commit ba75e82b6ebaf88dd2e4a8f764b2d296d715bf8a
in spu_exec.c
---
 src/gallium/drivers/cell/spu/spu_exec.c | 45 ++++++++++-----------------------
 1 file changed, 14 insertions(+), 31 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 1560c0f157..061fbebf61 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -672,7 +672,7 @@ fetch_texel( struct spu_sampler *sampler,
 static void
 exec_tex(struct spu_exec_machine *mach,
          const struct tgsi_full_instruction *inst,
-         boolean biasLod)
+         boolean biasLod, boolean projected)
 {
    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
    union spu_exec_channel r[8];
@@ -686,17 +686,9 @@ exec_tex(struct spu_exec_machine *mach,
 
       FETCH(&r[0], 0, CHAN_X);
 
-      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
-      case TGSI_EXTSWIZZLE_W:
+      if (projected) {
          FETCH(&r[1], 0, CHAN_W);
          r[0].q = micro_div(r[0].q, r[1].q);
-         break;
-
-      case TGSI_EXTSWIZZLE_ONE:
-         break;
-
-      default:
-         assert (0);
       }
 
       if (biasLod) {
@@ -718,19 +710,11 @@ exec_tex(struct spu_exec_machine *mach,
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
-      case TGSI_EXTSWIZZLE_W:
+      if (projected) {
          FETCH(&r[3], 0, CHAN_W);
          r[0].q = micro_div(r[0].q, r[3].q);
          r[1].q = micro_div(r[1].q, r[3].q);
          r[2].q = micro_div(r[2].q, r[3].q);
-         break;
-
-      case TGSI_EXTSWIZZLE_ONE:
-         break;
-
-      default:
-         assert (0);
       }
 
       if (biasLod) {
@@ -752,19 +736,11 @@ exec_tex(struct spu_exec_machine *mach,
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
-      case TGSI_EXTSWIZZLE_W:
+      if (projected) {
          FETCH(&r[3], 0, CHAN_W);
          r[0].q = micro_div(r[0].q, r[3].q);
          r[1].q = micro_div(r[1].q, r[3].q);
          r[2].q = micro_div(r[2].q, r[3].q);
-         break;
-
-      case TGSI_EXTSWIZZLE_ONE:
-         break;
-
-      default:
-         assert (0);
       }
 
       if (biasLod) {
@@ -1450,14 +1426,14 @@ exec_instruction(
       /* simple texture lookup */
       /* src[0] = texcoord */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE);
+      exec_tex(mach, inst, FALSE, FALSE);
       break;
 
    case TGSI_OPCODE_TXB:
       /* Texture lookup with lod bias */
       /* src[0] = texcoord (src[0].w = load bias) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE);
+      exec_tex(mach, inst, TRUE, FALSE);
       break;
 
    case TGSI_OPCODE_TXD:
@@ -1473,7 +1449,14 @@ exec_instruction(
       /* Texture lookup with explit LOD */
       /* src[0] = texcoord (src[0].w = load bias) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE);
+      exec_tex(mach, inst, TRUE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXP:
+      /* Texture lookup with projection
+      /* src[0] = texcoord (src[0].w = projection) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE, TRUE);
       break;
 
    case TGSI_OPCODE_UP2H:
-- 
cgit v1.2.3


From 1936e4bdfd776f78f9fe44f77ce66066fd166360 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 17 Mar 2008 15:45:52 -0700
Subject: cell: Initial code-gen for alpha / stencil / depth testing

Alpha test is currently broken because all per-fragment testing occurs
before alpha is calculated.

Stencil test is currently broken because the Z-clear code asserts if
there is a stencil buffer.
---
 src/gallium/drivers/cell/common.h                  |   10 +
 src/gallium/drivers/cell/ppu/Makefile              |    1 +
 src/gallium/drivers/cell/ppu/cell_context.h        |   25 +-
 src/gallium/drivers/cell/ppu/cell_pipe_state.c     |   37 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |   24 +-
 .../drivers/cell/ppu/cell_state_per_fragment.c     | 1020 ++++++++++++++++++++
 .../drivers/cell/ppu/cell_state_per_fragment.h     |   35 +
 src/gallium/drivers/cell/spu/Makefile              |    1 +
 src/gallium/drivers/cell/spu/spu_main.c            |   25 +-
 src/gallium/drivers/cell/spu/spu_main.h            |   16 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |  191 ++++
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   32 +
 src/gallium/drivers/cell/spu/spu_render.c          |    4 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |   23 +-
 src/gallium/drivers/cell/spu/spu_ztest.h           |  135 ---
 15 files changed, 1409 insertions(+), 170 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_per_fragment_op.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_per_fragment_op.h
 delete mode 100644 src/gallium/drivers/cell/spu/spu_ztest.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 9a4004535e..fe93fd8e1a 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -104,6 +104,16 @@
 
 
+/**
+ */
+struct cell_command_depth_stencil_alpha_test {
+   uint64_t base;               /**< Effective address of code start. */
+   unsigned size;               /**< Size in bytes of test code. */
+   unsigned read_depth;         /**< Flag: should depth be read? */
+   unsigned read_stencil;       /**< Flag: should stencil be read? */
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index d38fa6ce07..0389a9554c 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -27,6 +27,7 @@ SOURCES = \
 	cell_flush.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
+	cell_state_per_fragment.c \
 	cell_state_shader.c \
 	cell_pipe_state.c \
 	cell_screen.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index b221424323..9e79db0ace 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -57,16 +57,37 @@ struct cell_fragment_shader_state
 };
 
 
+struct cell_blend_state {
+   struct pipe_blend_state base;
+
+   /**
+    * Generated code to perform alpha blending
+    */
+   struct spe_function code;
+};
+
+
+struct cell_depth_stencil_alpha_state {
+   struct pipe_depth_stencil_alpha_state   base;
+
+   /**
+    * Generated code to perform alpha, stencil, and depth testing on the SPE
+    */
+   struct spe_function code;
+
+};
+
+
 struct cell_context
 {
    struct pipe_context pipe;
 
    struct cell_winsys *winsys;
 
-   const struct pipe_blend_state *blend;
+   const struct cell_blend_state *blend;
    const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
    uint num_samplers;
-   const struct pipe_depth_stencil_alpha_state   *depth_stencil;
+   const struct cell_depth_stencil_alpha_state   *depth_stencil;
    const struct pipe_rasterizer_state *rasterizer;
    const struct cell_vertex_shader_state *vs;
    const struct cell_fragment_shader_state *fs;
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 025ed3bbbf..66ede99d13 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -36,6 +36,7 @@
 #include "cell_context.h"
 #include "cell_state.h"
 #include "cell_texture.h"
+#include "cell_state_per_fragment.h"
 
 
@@ -43,7 +44,12 @@ static void *
 cell_create_blend_state(struct pipe_context *pipe,
                         const struct pipe_blend_state *blend)
 {
-   return mem_dup(blend, sizeof(*blend));
+   struct cell_blend_state *cb = MALLOC(sizeof(struct cell_blend_state));
+
+   (void) memcpy(cb, blend, sizeof(*blend));
+   cb->code.store = NULL;
+
+   return cb;
 }
 
 
@@ -54,7 +60,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 
    draw_flush(cell->draw);
 
-   cell->blend = (const struct pipe_blend_state *)blend;
+   cell->blend = (const struct cell_blend_state *)blend;
 
    cell->dirty |= CELL_NEW_BLEND;
 }
@@ -63,7 +69,10 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 static void
 cell_delete_blend_state(struct pipe_context *pipe, void *blend)
 {
-   FREE(blend);
+   struct cell_blend_state *cb = (struct cell_blend_state *) blend;
+   
+   spe_release_func(& cb->code);
+   FREE(cb);
 }
 
 
@@ -87,7 +96,13 @@ static void *
 cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
                  const struct pipe_depth_stencil_alpha_state *depth_stencil)
 {
-   return mem_dup(depth_stencil, sizeof(*depth_stencil));
+   struct cell_depth_stencil_alpha_state *cdsa =
+       MALLOC(sizeof(struct cell_depth_stencil_alpha_state));
+
+   (void) memcpy(cdsa, depth_stencil, sizeof(*depth_stencil));
+   cdsa->code.store = NULL;
+
+   return cdsa;
 }
 
 
@@ -96,12 +111,16 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
                                     void *depth_stencil)
 {
    struct cell_context *cell = cell_context(pipe);
+   struct cell_depth_stencil_alpha_state *cdsa =
+       (struct cell_depth_stencil_alpha_state *) depth_stencil;
 
    draw_flush(cell->draw);
 
-   cell->depth_stencil
-      = (const struct pipe_depth_stencil_alpha_state *) depth_stencil;
+   if (cdsa->code.store == NULL) {
+      cell_generate_depth_stencil_test(cdsa);
+   }
 
+   cell->depth_stencil = cdsa;
    cell->dirty |= CELL_NEW_DEPTH_STENCIL;
 }
 
@@ -109,7 +128,11 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
 static void
 cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
 {
-   FREE(depth);
+   struct cell_depth_stencil_alpha_state *cdsa =
+       (struct cell_depth_stencil_alpha_state *) depth;
+
+   spe_release_func(& cdsa->code);
+   FREE(cdsa);
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 670eb26bdd..c8d5fdf709 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -71,9 +71,27 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL,
-                     cell->depth_stencil,
-                     sizeof(struct pipe_depth_stencil_alpha_state));
+      struct cell_command_depth_stencil_alpha_test dsat;
+      
+
+      dsat.base = (intptr_t) cell->depth_stencil->code.store;
+      dsat.size = (char *) cell->depth_stencil->code.csr
+	  - (char *) cell->depth_stencil->code.store;
+      dsat.read_depth = TRUE;
+      dsat.read_stencil = FALSE;
+
+      {
+	 uint32_t *p = cell->depth_stencil->code.store;
+
+	 printf("\t.text\n");
+	 for (/* empty */; p < cell->depth_stencil->code.csr; p++) {
+	    printf("\t.long\t0x%04x\n", *p);
+	 }
+	 fflush(stdout);
+      }
+
+      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat,
+		     sizeof(dsat));
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
new file mode 100644
index 0000000000..60b64438d8
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -0,0 +1,1020 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ * Generate code to perform all per-fragment operations.
+ *
+ * Code generated by these functions perform both alpha, depth, and stencil
+ * testing as well as alpha blending.
+ *
+ * \note
+ * Occlusion query is not supported, but this is the right place to add that
+ * support.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "cell_context.h"
+
+#include "rtasm/rtasm_ppc_spe.h"
+
+
+/**
+ * Generate code to perform alpha testing.
+ *
+ * The code generated by this function uses the register specificed by
+ * \c mask as both an input and an output.
+ *
+ * \param dsa    Current alpha-test state
+ * \param f      Function to which code should be appended
+ * \param mask   Index of register containing active fragment mask
+ * \param alphas Index of register containing per-fragment alpha values
+ *
+ * \note Emits a maximum of 6 instructions.
+ */
+static void
+emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
+                struct spe_function *f, int mask, int alphas)
+{
+   /* If the alpha function is either NEVER or ALWAYS, there is no need to
+    * load the reference value into a register.  ALWAYS is a fairly common
+    * case, and this optimization saves 2 instructions.
+    */
+   if (dsa->alpha.enabled
+       && (dsa->alpha.func != PIPE_FUNC_NEVER)
+       && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      int ref = spe_allocate_available_register(f);
+      int tmp_a = spe_allocate_available_register(f);
+      int tmp_b = spe_allocate_available_register(f);
+      union {
+	 float f;
+	 unsigned u;
+      } ref_val;
+      boolean complement = FALSE;
+
+      ref_val.f = dsa->alpha.ref;
+
+      spe_il(f, ref, ref_val.u & 0x0000ffff);
+      spe_ilh(f, ref, ref_val.u >> 16);
+
+      switch (dsa->alpha.func) {
+      case PIPE_FUNC_NOTEQUAL:
+	 complement = TRUE;
+	 /* FALLTHROUGH */
+
+      case PIPE_FUNC_EQUAL:
+	 spe_fceq(f, tmp_a, ref, alphas);
+	 break;
+
+      case PIPE_FUNC_LEQUAL:
+	 complement = TRUE;
+	 /* FALLTHROUGH */
+
+      case PIPE_FUNC_GREATER:
+	 spe_fcgt(f, tmp_a, ref, alphas);
+	 break;
+
+      case PIPE_FUNC_LESS:
+	 complement = TRUE;
+	 /* FALLTHROUGH */
+
+      case PIPE_FUNC_GEQUAL:
+	 spe_fcgt(f, tmp_a, ref, alphas);
+	 spe_fceq(f, tmp_b, ref, alphas);
+	 spe_or(f, tmp_a, tmp_b, tmp_a);
+	 break;
+
+      case PIPE_FUNC_ALWAYS:
+      case PIPE_FUNC_NEVER:
+      default:
+	 assert(0);
+	 break;
+      }
+
+      if (complement) {
+	 spe_andc(f, mask, mask, tmp_a);
+      } else {
+	 spe_and(f, mask, mask, tmp_a);
+      }
+
+      spe_release_register(f, ref);
+      spe_release_register(f, tmp_a);
+      spe_release_register(f, tmp_b);
+   } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) {
+      spe_il(f, mask, 0);
+   }
+}
+
+
+/**
+ * \param dsa        Current depth-test state
+ * \param f          Function to which code should be appended
+ * \param m          Mask of allocated / free SPE registers
+ * \param mask       Index of register to contain depth-pass mask
+ * \param stored     Index of register containing values from depth buffer
+ * \param calculated Index of register containing per-fragment depth values
+ *
+ * \return
+ * If the calculated depth comparison mask is the actual mask, \c FALSE is
+ * returned.  If the calculated depth comparison mask is the compliment of
+ * the actual mask, \c TRUE is returned.
+ *
+ * \note Emits a maximum of 3 instructions.
+ */
+static boolean
+emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
+                struct spe_function *f, int mask, int stored, int calculated)
+{
+   unsigned func = (dsa->depth.enabled)
+       ? dsa->depth.func : PIPE_FUNC_ALWAYS;
+   int tmp = spe_allocate_available_register(f);
+   boolean compliment = FALSE;
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask, 0);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      compliment = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_EQUAL:
+      spe_ceq(f, mask, calculated, stored);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      compliment = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GREATER:
+      spe_clgt(f, mask, calculated, stored);
+      break;
+
+   case PIPE_FUNC_LESS:
+      compliment = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GEQUAL:
+      spe_clgt(f, mask, calculated, stored);
+      spe_ceq(f, tmp, calculated, stored);
+      spe_or(f, mask, mask, tmp);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      spe_il(f, mask, ~0);
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   spe_release_register(f, tmp);
+   return compliment;
+}
+
+
+/**
+ * \note Emits a maximum of 5 instructions.
+ */
+static void
+emit_stencil_op(struct spe_function *f,
+                int out, int in, int mask, unsigned op, unsigned ref)
+{
+   const int clamp = spe_allocate_available_register(f);
+   const int tmp = spe_allocate_available_register(f);
+
+   switch(op) {
+   case PIPE_STENCIL_OP_KEEP:
+      assert(0);
+   case PIPE_STENCIL_OP_ZERO:
+      spe_il(f, out, 0);
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      spe_il(f, out, ref);
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      spe_il(f, clamp, 0x0ff);
+      spe_ai(f, out, in, 1);
+      spe_cgti(f, tmp, out, clamp);
+      spe_selb(f, out, out, clamp, tmp);
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      spe_il(f, clamp, 0);
+      spe_ai(f, out, in, -1);
+      spe_cgti(f, tmp, out, clamp);
+      spe_selb(f, out, clamp, out, tmp);
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      spe_ai(f, out, in, 1);
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      spe_ai(f, out, in, -1);
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      spe_nor(f, out, in, in);
+      break;
+   default:
+      assert(0);
+   }
+
+   spe_release_register(f, tmp);
+   spe_release_register(f, clamp);
+
+   spe_selb(f, out, in, out, mask);
+}
+
+
+/**
+ * \param dsa        Depth / stencil test state
+ * \param face       0 for front face, 1 for back face
+ * \param f          Function to append instructions to
+ * \param reg_mask   Mask of allocated registers
+ * \param mask       Register containing mask of fragments passing the
+ *                   alpha test
+ * \param depth_mask Register containing mask of fragments passing the
+ *                   depth test
+ * \param depth_compliment  Is \c depth_mask the compliment of the actual mask?
+ * \param stencil    Register containing values from stencil buffer
+ * \param depth_pass Register to store mask of fragments passing stencil test
+ *                   and depth test
+ * 
+ * \note
+ * Emits a maximum of 10 + (3 * 5) = 25 instructions.
+ */
+static int
+emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
+                  unsigned face, 
+                  struct spe_function *f,
+                  int mask,
+                  int depth_mask,
+                  boolean depth_complement,
+                  int stencil,
+                  int depth_pass)
+{
+   int stencil_fail = spe_allocate_available_register(f);
+   int depth_fail = spe_allocate_available_register(f);
+   int stencil_mask = spe_allocate_available_register(f);
+   int stencil_pass = spe_allocate_available_register(f);
+   int face_stencil = spe_allocate_available_register(f);
+   int stencil_src = stencil;
+   const unsigned ref = (dsa->stencil[face].ref_value
+                         & dsa->stencil[face].value_mask);
+   boolean complement = FALSE;
+   int stored = spe_allocate_available_register(f);
+   int tmp = spe_allocate_available_register(f);
+
+
+   if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+       && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+       && (dsa->stencil[face].value_mask != 0x0ff)) {
+      spe_andi(f, stored, stencil, dsa->stencil[face].value_mask);
+   }
+
+
+   switch (dsa->stencil[face].func) {
+   case PIPE_FUNC_NEVER:
+      spe_il(f, stencil_mask, 0);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_EQUAL:
+      spe_ceqi(f, stencil_mask, stored, ref);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GREATER:
+      spe_clgti(f, stencil_mask, stored, ref);
+      break;
+
+   case PIPE_FUNC_LESS:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GEQUAL:
+      spe_clgti(f, stencil_mask, stored, ref);
+      spe_ceqi(f, tmp, stored, ref);
+      spe_or(f, stencil_mask, stencil_mask, tmp);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* See comment below. */
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   spe_release_register(f, stored);
+   spe_release_register(f, tmp);
+
+
+   /* ALWAYS is a very common stencil-test, so some effort is applied to
+    * optimize that case.  The stencil-pass mask is the same as the input
+    * fragment mask.  This makes the stencil-test (above) a no-op, and the
+    * input fragment mask can be "renamed" the stencil-pass mask.
+    */
+   if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) {
+      spe_release_register(f, stencil_pass);
+      stencil_pass = mask;
+   } else {
+      if (complement) {
+         spe_andc(f, stencil_pass, mask, stencil_mask);
+      } else {
+         spe_and(f, stencil_pass, mask, stencil_mask);
+      }
+   }
+
+   if (depth_complement) {
+      spe_andc(f, depth_pass, stencil_pass, depth_mask);
+   } else {
+      spe_and(f, depth_pass, stencil_pass, depth_mask);
+   }
+
+
+   /* Conditionally emit code to update the stencil value under various
+    * condititons.  Note that there is no need to generate code under the
+    * following circumstances:
+    * 
+    * - Stencil write mask is zero.
+    * - For stencil-fail if the stencil test is ALWAYS
+    * - For depth-fail if the stencil test is NEVER
+    * - For depth-pass if the stencil test is NEVER
+    * - Any of the 3 conditions if the operation is KEEP
+    */
+   if (dsa->stencil[face].write_mask != 0) {
+      if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+          && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) {
+         if (complement) {
+            spe_and(f, stencil_fail, mask, stencil_mask);
+         } else {
+            spe_andc(f, stencil_fail, mask, stencil_mask);
+         }
+
+         emit_stencil_op(f, face_stencil, stencil_src, stencil_fail,
+                         dsa->stencil[face].fail_op,
+                         dsa->stencil[face].ref_value);
+
+         stencil_src = face_stencil;
+      }
+
+      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+          && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) {
+         if (depth_complement) {
+            spe_and(f, depth_fail, stencil_pass, depth_mask);
+         } else {
+            spe_andc(f, depth_fail, stencil_pass, depth_mask);
+         }
+
+         emit_stencil_op(f, face_stencil, stencil_src, depth_fail,
+                         dsa->stencil[face].zfail_op,
+                         dsa->stencil[face].ref_value);
+         stencil_src = face_stencil;
+      }
+
+      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+          && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) {
+         emit_stencil_op(f, face_stencil, stencil_src, depth_pass,
+                         dsa->stencil[face].zpass_op,
+                         dsa->stencil[face].ref_value);
+         stencil_src = face_stencil;
+      }
+   }
+
+   spe_release_register(f, stencil_fail);
+   spe_release_register(f, depth_fail);
+   spe_release_register(f, stencil_mask);
+   if (stencil_pass != mask) {
+      spe_release_register(f, stencil_pass);
+   }
+
+   /* If all of the stencil operations were KEEP or the stencil write mask was
+    * zero, "stencil_src" will still be set to "stencil".  In this case
+    * release the "face_stencil" register.  Otherwise apply the stencil write
+    * mask to select bits from the calculated stencil value and the previous
+    * stencil value.
+    */
+   if (stencil_src == stencil) {
+      spe_release_register(f, face_stencil);
+   } else if (dsa->stencil[face].write_mask != 0x0ff) {
+      int tmp = spe_allocate_available_register(f);
+      
+      spe_il(f, tmp, dsa->stencil[face].write_mask);
+      spe_selb(f, stencil_src, stencil, stencil_src, tmp);
+
+      spe_release_register(f, tmp);
+   }
+
+   return stencil_src;
+}
+
+
+void
+cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
+{
+   struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base;
+   struct spe_function *const f = &cdsa->code;
+
+   /* This code generates a maximum of 6 (alpha test) + 3 (depth test)
+    * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round
+    * up to 64 to make it a happy power-of-two.
+    */
+   spe_init_func(f, 4 * 64);
+
+
+   /* Allocate registers for the function's input parameters.  Cleverly (and
+    * clever code is usually dangerous, but I couldn't resist) the generated
+    * function returns a structure.  Returned structures start with register
+    * 3, and the structure fields are ordered to match up exactly with the
+    * input parameters.
+    */
+   int mask = spe_allocate_register(f, 3);
+   int depth = spe_allocate_register(f, 4);
+   int stencil = spe_allocate_register(f, 5);
+   int zvals = spe_allocate_register(f, 6);
+   int frag_a = spe_allocate_register(f, 7);
+   int facing = spe_allocate_register(f, 8);
+
+   int depth_mask = spe_allocate_available_register(f);
+
+   boolean depth_complement;
+
+
+   emit_alpha_test(dsa, f, mask, frag_a);
+
+   depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals);
+
+   if (dsa->stencil[0].enabled) {
+      const int front_depth_pass = spe_allocate_available_register(f);
+      int front_stencil = emit_stencil_test(dsa, 0, f, mask,
+                                            depth_mask, depth_complement,
+                                            stencil, front_depth_pass);
+
+      if (dsa->stencil[1].enabled) {
+         const int back_depth_pass = spe_allocate_available_register(f);
+         int back_stencil = emit_stencil_test(dsa, 1, f, mask,
+                                              depth_mask,  depth_complement,
+                                              stencil, back_depth_pass);
+
+         /* If the front facing stencil value and the back facing stencil
+          * value are stored in the same register, there is no need to select
+          * a value based on the facing.  This can happen if the stencil value
+          * was not modified due to the write masks being zero, the stencil
+          * operations being KEEP, etc.
+          */
+         if (front_stencil != back_stencil) {
+            spe_selb(f, stencil, back_stencil, front_stencil, facing);
+         }
+         
+         if (back_stencil != stencil) { 
+            spe_release_register(f, back_stencil);
+         }
+
+         if (front_stencil != stencil) { 
+            spe_release_register(f, front_stencil);
+         }
+
+         spe_selb(f, mask, back_depth_pass, front_depth_pass, facing);
+
+         spe_release_register(f, back_depth_pass);
+      } else {
+         if (front_stencil != stencil) { 
+            spe_or(f, stencil, front_stencil, front_stencil);
+            spe_release_register(f, front_stencil);
+         }
+      }
+
+      spe_release_register(f, front_depth_pass);
+   } else if (dsa->depth.enabled) {
+      if (depth_complement) {
+         spe_andc(f, mask, mask, depth_mask);
+      } else {
+         spe_and(f, mask, mask, depth_mask);
+      }
+   }
+
+   if (dsa->depth.writemask) {
+         spe_selb(f, depth, depth, zvals, mask);
+   }
+
+   spe_bi(f, 0, 0, 0);
+}
+
+
+/**
+ * \note Emits a maximum of 3 instructions
+ */
+static int
+emit_alpha_factor_calculation(struct spe_function *f,
+                              unsigned factor, float const_alpha,
+                              int src_alpha, int dst_alpha)
+{
+   union {
+      float f;
+      unsigned u;
+   } alpha;
+   int factor_reg;
+   int tmp;
+
+
+   alpha.f = const_alpha;
+
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      factor_reg = -1;
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_or(f, factor_reg, src_alpha, src_alpha);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      factor_reg = dst_alpha;
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      factor_reg = -1;
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      const_alpha = 1.0 - const_alpha;
+      /* FALLTHROUGH */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_il(f, factor_reg, alpha.u & 0x0ffff);
+      spe_ilh(f, factor_reg, alpha.u >> 16);
+      break;
+
+   case PIPE_BLENDFACTOR_ZERO:
+      factor_reg = -1;
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor_reg, tmp, src_alpha);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor_reg, tmp, dst_alpha);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      assert(0);
+      factor_reg = -1;
+      break;
+   }
+
+   return factor_reg;
+}
+
+
+/**
+ * \note Emits a maximum of 5 instructions
+ */
+static void
+emit_color_factor_calculation(struct spe_function *f,
+                              unsigned sF, unsigned mask,
+			      const struct pipe_blend_color *blend_color,
+                              const int *src,
+                              const int *dst,
+                              int *factor)
+{
+   union {
+      float f[4];
+      unsigned u[4];
+   } color;
+   int tmp;
+   unsigned i;
+
+
+   color.f[0] = blend_color->color[0];
+   color.f[1] = blend_color->color[1];
+   color.f[2] = blend_color->color[2];
+   color.f[3] = blend_color->color[3];
+
+   factor[0] = -1;
+   factor[1] = -1;
+   factor[2] = -1;
+   factor[3] = -1;
+
+   switch (sF) {
+   case PIPE_BLENDFACTOR_ONE:
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_or(f, factor[i], src[i], src[i]);
+         }
+      }
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_or(f, factor[0], src[3], src[3]);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      factor[0] = dst[3];
+      factor[1] = dst[3];
+      factor[2] = dst[3];
+      break;
+
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      factor[0] = dst[0];
+      factor[1] = dst[1];
+      factor[2] = dst[2];
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      /* Alpha saturate means min(As, 1-Ad).
+       */
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, tmp, tmp, dst[3]);
+      spe_fcgt(f, factor[0], tmp, src[3]);
+      spe_selb(f, factor[0], src[3], tmp, factor[0]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      color.f[0] = 1.0 - color.f[0];
+      color.f[1] = 1.0 - color.f[1];
+      color.f[2] = 1.0 - color.f[2];
+      /* FALLTHROUGH */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      for (i = 0; i < 3; i++) {
+	 factor[i] = spe_allocate_available_register(f);
+
+	 spe_il(f, factor[i], color.u[i] & 0x0ffff);
+	 spe_ilh(f, factor[i], color.u[i] >> 16);
+      }
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      color.f[3] = 1.0 - color.f[3];
+      /* FALLTHROUGH */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_il(f, factor[0], color.u[3] & 0x0ffff);
+      spe_ilh(f, factor[0], color.u[3] >> 16);
+      break;
+
+   case PIPE_BLENDFACTOR_ZERO:
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      tmp = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_fs(f, factor[i], tmp, src[i]);
+         }
+      }
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, src[3]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, dst[3]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      tmp = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_fs(f, factor[i], tmp, dst[i]);
+         }
+      }
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      assert(0);
+   }
+}
+
+
+static void
+emit_blend_calculation(struct spe_function *f,
+                       unsigned func, unsigned sF, unsigned dF,
+                       int src, int src_factor, int dst, int dst_factor)
+{
+   int tmp = spe_allocate_available_register(f);
+
+   switch (func) {
+   case PIPE_BLEND_ADD:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            /* Do nothing. */
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fa(f, src, src, dst);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_or(f, src, dst, dst);
+         }
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fma(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_SUBTRACT:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            /* Do nothing. */
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fs(f, src, src, dst);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, dst);
+         }
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fms(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, src);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fs(f, src, dst, src);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_or(f, src, dst, dst);
+         }
+      } else {
+         spe_fm(f, tmp, src, src_factor);
+         spe_fms(f, src, src, dst_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_MIN:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, dst, src, tmp);
+      break;
+
+   case PIPE_BLEND_MAX:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, src, dst, tmp);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   spe_release_register(f, tmp);
+}
+
+
+/**
+ * Generate code to perform alpha blending on the SPE
+ */
+void
+cell_generate_alpha_blend(struct cell_blend_state *cb,
+			  const struct pipe_blend_color *blend_color)
+{
+   struct pipe_blend_state *const b = &cb->base;
+   struct spe_function *const f = &cb->code;
+
+   /* This code generates a maximum of 3 (source alpha factor)
+    * + 3 (destination alpha factor) + (3 * 5) (source color factor)
+    * + (3 * 5) (destination color factor) + (4 * 2) (blend equation)
+    * + 4 (fragment mask) + 1 (return) = 49 instlructions.  Round up to 64 to
+    * make it a happy power-of-two.
+    */
+   spe_init_func(f, 4 * 64);
+
+
+   const int frag[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+   const int pixel[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+   const int mask = spe_allocate_register(f, 11);
+   unsigned func[4];
+   unsigned sF[4];
+   unsigned dF[4];
+   unsigned i;
+   int src_factor[4];
+   int dst_factor[4];
+
+
+   /* Does the selected blend mode make use of the source / destination
+    * color (RGB) blend factors?
+    */
+   boolean need_color_factor = b->blend_enable
+       && (b->rgb_func != PIPE_BLEND_MIN)
+       && (b->rgb_func != PIPE_BLEND_MAX);
+
+   /* Does the selected blend mode make use of the source / destination
+    * alpha blend factors?
+    */
+   boolean need_alpha_factor = b->blend_enable
+       && (b->alpha_func != PIPE_BLEND_MIN)
+       && (b->alpha_func != PIPE_BLEND_MAX);
+
+
+   sF[0] = b->rgb_src_factor;
+   sF[1] = sF[0];
+   sF[2] = sF[0];
+   sF[3] = (b->alpha_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)
+       ? PIPE_BLENDFACTOR_ONE : b->alpha_src_factor;
+
+   dF[0] = b->rgb_dst_factor;
+   dF[1] = dF[0];
+   dF[2] = dF[0];
+   dF[3] = b->rgb_dst_factor;
+
+
+   /* If alpha writing is enabled and the alpha blend mode requires use of
+    * the alpha factor, calculate the alpha factor.
+    */
+   if (((b->colormask & 8) != 0) && need_alpha_factor) {
+      src_factor[3] = emit_alpha_factor_calculation(f, sF[3],
+						    blend_color->color[3],
+                                                    frag[3], pixel[3]);
+
+      /* If the alpha destination blend factor is the same as the alpha source
+       * blend factor, re-use the previously calculated value.
+       */
+      dst_factor[3] = (dF[3] == sF[3])
+          ? src_factor[3]
+          : emit_alpha_factor_calculation(f, dF[3],
+					  blend_color->color[3],
+                                          frag[3], pixel[3]);
+   }
+   
+
+   if (sF[0] == sF[3]) {
+      src_factor[0] = src_factor[3];
+      src_factor[1] = src_factor[3];
+      src_factor[2] = src_factor[3];
+   } else if (sF[0] == dF[3]) {
+      src_factor[0] = dst_factor[3];
+      src_factor[1] = dst_factor[3];
+      src_factor[2] = dst_factor[3];
+   } else if (need_color_factor) {
+      emit_color_factor_calculation(f,
+                                    b->rgb_src_factor,
+                                    b->colormask,
+                                    blend_color,
+                                    frag, pixel, src_factor);
+   }
+
+
+   if (dF[0] == sF[3]) {
+      dst_factor[0] = src_factor[3];
+      dst_factor[1] = src_factor[3];
+      dst_factor[2] = src_factor[3];
+   } else if (dF[0] == dF[3]) {
+      dst_factor[0] = dst_factor[3];
+      dst_factor[1] = dst_factor[3];
+      dst_factor[2] = dst_factor[3];
+   } else if (dF[0] == sF[0]) {
+      dst_factor[0] = src_factor[0];
+      dst_factor[1] = src_factor[1];
+      dst_factor[2] = src_factor[2];
+   } else if (need_color_factor) {
+      emit_color_factor_calculation(f,
+                                    b->rgb_dst_factor,
+                                    b->colormask,
+                                    blend_color,
+                                    frag, pixel, dst_factor);
+   }
+
+   
+
+   func[0] = b->rgb_func;
+   func[1] = func[0];
+   func[2] = func[0];
+   func[3] = b->alpha_func;
+
+   for (i = 0; i < 4; ++i) {
+      if ((b->colormask & (1U << i)) != 0) {
+         emit_blend_calculation(f,
+                                func[i], sF[i], dF[i],
+                                frag[i], src_factor[i],
+                                pixel[i], dst_factor[i]);
+         spe_selb(f, frag[i], pixel[i], frag[i], mask);
+      } else {
+         spe_or(f, frag[i], pixel[i], pixel[i]);
+      }
+   }
+
+   spe_bi(f, 0, 0, 0);
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
new file mode 100644
index 0000000000..541c3b3be0
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
@@ -0,0 +1,35 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CELL_STATE_PER_FRAGMENT_H
+#define CELL_STATE_PER_FRAGMENT_H
+
+extern void
+cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa);
+
+extern void
+cell_generate_alpha_blend(struct cell_blend_state *cb,
+			  const struct pipe_blend_color *blend_color);
+
+#endif /* CELL_STATE_PER_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index c071de1900..115ca8cd90 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -19,6 +19,7 @@ SOURCES = \
 	spu_main.c \
 	spu_blend.c \
 	spu_dcache.c \
+	spu_per_fragment_op.c \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 59300028d4..8e46f6e471 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -58,6 +58,9 @@ struct spu_vs_context draw;
 static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX]
     ALIGN16_ATTRIB;
 
+static unsigned char depth_stencil_code_buffer[4 * 64]
+    ALIGN16_ATTRIB;
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -248,14 +251,26 @@ cmd_state_blend(const struct pipe_blend_state *state)
 
 
 static void
-cmd_state_depth_stencil(const struct pipe_depth_stencil_alpha_state *state)
+cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state)
 {
    if (Debug)
       printf("SPU %u: DEPTH_STENCIL: ztest %d\n",
              spu.init.id,
-             state->depth.enabled);
+             state->read_depth);
+
+   ASSERT_ALIGN16(state->base);
+
+   mfc_get(depth_stencil_code_buffer,
+	   (unsigned int) state->base,  /* src */
+	   ROUNDUP16(state->size),
+	   TAG_BATCH_BUFFER,
+	   0, /* tid */
+	   0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
 
-   memcpy(&spu.depth_stencil, state, sizeof(*state));
+   spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
+   spu.read_depth = state->read_depth;
+   spu.read_stencil = state->read_stencil;
 }
 
 
@@ -415,9 +430,9 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
          break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
-         cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
+         cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *)
                                  &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8);
          break;
       case CELL_CMD_STATE_SAMPLER:
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index a13edd1702..444e218645 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -56,6 +56,17 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
+struct spu_frag_test_results {
+   qword mask;
+   qword depth;
+   qword stencil;
+};
+
+typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask,
+    qword pixel_depth, qword pixel_stencil, qword frag_depth,
+    qword frag_alpha, qword facing);
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -79,8 +90,9 @@ struct spu_global
    struct cell_init_info init;
 
    struct spu_framebuffer fb;
-   struct pipe_blend_state blend_stencil;
-   struct pipe_depth_stencil_alpha_state depth_stencil;
+   boolean read_depth;
+   boolean read_stencil;
+   frag_test_func frag_test;
    struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
new file mode 100644
index 0000000000..d42b522b41
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -0,0 +1,191 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file spu_per_fragment_op.c
+ * SPU implementation various per-fragment operations.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_format.h"
+#include "spu_main.h"
+#include "spu_per_fragment_op.h"
+
+#define ZERO 0x80
+
+static void
+read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
+             enum pipe_format depth_format, qword *depth,
+             qword *stencil)
+{
+   const int ix = x / 2;
+   const int iy = y / 2;
+
+   switch (depth_format) {
+   case PIPE_FORMAT_Z16_UNORM: {
+      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
+
+      const qword shuf_vec = (qword) {
+         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7
+      };
+
+
+      /* At even X values we want the first 4 shorts, and at odd X values we
+       * want the second 4 shorts.
+       */
+      qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3));
+      qword bias_mask = si_fsmbi(0x3333);
+      qword sv = si_a(shuf_vec, si_and(bias_mask, bias));
+
+      *depth = si_shufb(*ptr, *ptr, sv);
+      *stencil = si_il(0);
+      break;
+   }
+
+
+   case PIPE_FORMAT_Z32_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+
+      *depth = *ptr;
+      *stencil = si_il(0);
+      break;
+   }
+      
+
+   case PIPE_FORMAT_Z24S8_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword mask = si_fsmbi(0x7777);
+
+      *depth = si_and(*ptr, mask);
+      *stencil = si_rotmai(si_andc(*ptr, mask), -24);
+      break;
+   }
+
+
+   default:
+      assert(0);
+      break;
+   }
+}
+
+
+static void
+write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
+              enum pipe_format depth_format,
+              qword depth, qword stencil)
+{
+   const int ix = x / 2;
+   const int iy = y / 2;
+
+   (void) stencil;
+
+   switch (depth_format) {
+   case PIPE_FORMAT_Z16_UNORM: {
+      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
+
+      qword sv = ((ix & 0x01) == 0) 
+          ? (qword) { 2, 3, 6, 7, 10, 11, 14, 15,
+                      24, 25, 26, 27, 28, 29, 30, 31 }
+          : (qword) { 16, 17, 18, 19, 20 , 21, 22, 23,
+                      2, 3, 6, 7, 10, 11, 14, 15 };
+      *ptr = si_shufb(depth, *ptr, sv);
+      break;
+   }
+
+
+   case PIPE_FORMAT_Z32_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      *ptr = depth;
+      break;
+   }
+
+
+   case PIPE_FORMAT_Z24S8_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword mask = si_fsmbi(0x7777);
+
+      stencil = si_rotmai(stencil, 24);
+      *ptr = si_selb(stencil, depth, mask);
+      break;
+   }
+
+
+   default:
+      assert(0);
+      break;
+   }
+}
+
+
+qword
+spu_do_depth_stencil(int x, int y,
+                     qword frag_mask, qword frag_depth, qword frag_alpha,
+                     qword facing)
+{
+   struct spu_frag_test_results  result;
+   qword pixel_depth;
+   qword pixel_stencil;
+
+   /* All of this preable code (everthing before the call to frag_test) should
+    * be generated on the PPU and upload to the SPU.
+    */
+   if (spu.read_depth || spu.read_stencil) {
+      read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
+                   &pixel_depth, &pixel_stencil);
+   }
+   
+   switch (spu.fb.depth_format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu)));
+      frag_depth = si_cfltu(frag_depth, 0);
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu)));
+      frag_depth = si_cfltu(frag_depth, 0);
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu)));
+      frag_depth = si_cfltu(frag_depth, 0);
+      break;
+   default:
+      assert(0);
+      break;
+   }
+
+   result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
+                             frag_depth, frag_alpha, facing);
+
+
+   /* This code (everthing after the call to frag_test) should
+    * be generated on the PPU and upload to the SPU.
+    */
+   if (spu.read_depth || spu.read_stencil) {
+      write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
+                    result.depth, result.stencil);
+   }
+
+   return result.mask;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
new file mode 100644
index 0000000000..6571258699
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -0,0 +1,32 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SPU_PER_FRAGMENT_OP
+#define SPU_PER_FRAGMENT_OP
+
+extern qword
+spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth,
+		     qword frag_alpha, qword facing);
+
+#endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 20e77aa2e6..6df59abd36 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
 static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
-   if (spu.depth_stencil.depth.enabled) {
+   if (spu.read_depth) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
          //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
 wait_put_cz_tiles(void)
 {
    wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-   if (spu.depth_stencil.depth.enabled) {
+   if (spu.read_depth) {
       wait_on_mask(1 << TAG_WRITE_TILE_Z);
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index be9624cf7d..81823f2463 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -38,8 +38,7 @@
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
-
-#include "spu_ztest.h"
+#include "spu_per_fragment_op.h"
 
 
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
@@ -264,16 +263,12 @@ do_depth_test(int x, int y, mask_t quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      int ix = (x - setup.cliprect_minx) / 4;
-      int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z16_test_less(zvals.v, &spu.ztile.us8[iy][ix], x>>1, quadmask);
-   }
-   else {
-      int ix = (x - setup.cliprect_minx) / 2;
-      int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
-   }
+   mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx,
+					y - setup.cliprect_miny,
+					(qword) quadmask, 
+					(qword) zvals.v,
+					(qword) spu_splats((unsigned char) 0x0ffu),
+					(qword) spu_splats((unsigned int) 0x01u));
 
    if (spu_extract(spu_orx(mask), 0))
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
@@ -299,7 +294,7 @@ emit_quad( int x, int y, mask_t mask )
    sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
 
-   if (spu.depth_stencil.depth.enabled) {
+   if (spu.read_depth) {
       mask = do_depth_test(x, y, mask);
    }
 
@@ -434,7 +429,7 @@ static void flush_spans( void )
    }
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
-   if (spu.depth_stencil.depth.enabled) {
+   if (spu.read_depth) {
       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
diff --git a/src/gallium/drivers/cell/spu/spu_ztest.h b/src/gallium/drivers/cell/spu/spu_ztest.h
deleted file mode 100644
index ce8ad00339..0000000000
--- a/src/gallium/drivers/cell/spu/spu_ztest.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-/**
- * Zbuffer/depth test code.
- */
-
-
-#ifndef SPU_ZTEST_H
-#define SPU_ZTEST_H
-
-
-#ifdef __SPU__
-#include <spu_intrinsics.h>
-#endif
-
-
-
-/**
- * Perform Z testing for a 16-bit/value Z buffer.
- *
- * \param zvals  vector of four fragment zvalues as floats
- * \param zbuf   ptr to vector of ushort[8] zbuffer values.  Note that this
- *               contains the Z values for 2 quads, 8 pixels.
- * \param x      x coordinate of quad (only lsbit is significant)
- * \param inMask indicates which fragments in the quad are alive
- * \return new mask indicating which fragments are alive after ztest
- */
-static INLINE vector unsigned int
-spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
-                  uint x, vector unsigned int inMask)
-{
-#define ZERO 0x80
-   vector unsigned int zvals_ui4, zbuf_ui4, mask;
-
-   /* convert floats to uints in [0, 65535] */
-   zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
-   zvals_ui4 = spu_rlmask(zvals_ui4, -16);  /* right shift 16 */
-
-   /* XXX this conditional could be removed with a bit of work */
-   if (x & 1) {
-      /* convert zbuffer values from ushorts to uints */
-      /* gather lower four ushorts */
-      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
-                             (vector unsigned int) *zbuf,
-                             ((vector unsigned char) {
-                                ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
-                                ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15}));
-      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
-      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
-      /* mask &= inMask */
-      mask = spu_and(mask, inMask);
-      /* zbuf = mask ? zval : zbuf */
-      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
-      /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
-      *zbuf = (vector unsigned short)
-         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     ((vector unsigned char) {
-                        16, 17, 18, 19, 20, 21, 22, 23,
-                        2, 3, 6, 7, 10, 11, 14, 15}));
-   }
-   else {
-      /* convert zbuffer values from ushorts to uints */
-      /* gather upper four ushorts */
-      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
-                             (vector unsigned int) *zbuf,
-                             ((vector unsigned char) {
-                                ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
-                                ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7}));
-      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
-      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
-      /* mask &= inMask */
-      mask = spu_and(mask, inMask);
-      /* zbuf = mask ? zval : zbuf */
-      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
-      /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
-      *zbuf = (vector unsigned short)
-         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     ((vector unsigned char) {
-                        2, 3, 6, 7, 10, 11, 14, 15,
-                        24, 25, 26, 27, 28, 29, 30, 31}));
-   }
-   return mask;
-#undef ZERO
-}
-
-
-/**
- * As above, but Zbuffer values as 32-bit uints
- */
-static INLINE vector unsigned int
-spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
-                  vector unsigned int inMask)
-{
-   vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
-
-   /* convert floats to uints in [0, 0xffffffff] */
-   zvals_ui4 = spu_convtu(zvals, 32);
-   /* mask = (zbuf < zvals_ui4) ? ~0 : 0 */
-   mask = spu_cmpgt(zbuf, zvals_ui4);
-   /* mask &= inMask */
-   mask = spu_and(mask, inMask);
-   /* zbuf = mask ? zval : zbuf */
-   *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
-
-   return mask;
-}
-
-
-#endif /* SPU_ZTEST_H */
-- 
cgit v1.2.3


From 9f93e6701902312edf48821bb4c0558c3d62aaa3 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 17 Mar 2008 16:09:28 -0700
Subject: cell: Don't segfault when unbinding alpha / stencil / depth test
 state

---
 src/gallium/drivers/cell/ppu/cell_pipe_state.c |  2 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 17 ++++++++++++-----
 src/gallium/drivers/cell/spu/spu_main.c        | 23 ++++++++++++++++-------
 3 files changed, 29 insertions(+), 13 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 66ede99d13..c880760e4b 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -116,7 +116,7 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
 
    draw_flush(cell->draw);
 
-   if (cdsa->code.store == NULL) {
+   if ((cdsa != NULL) && (cdsa->code.store == NULL)) {
       cell_generate_depth_stencil_test(cdsa);
    }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index c8d5fdf709..e2cc9de48a 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -74,11 +74,18 @@ cell_emit_state(struct cell_context *cell)
       struct cell_command_depth_stencil_alpha_test dsat;
       
 
-      dsat.base = (intptr_t) cell->depth_stencil->code.store;
-      dsat.size = (char *) cell->depth_stencil->code.csr
-	  - (char *) cell->depth_stencil->code.store;
-      dsat.read_depth = TRUE;
-      dsat.read_stencil = FALSE;
+      if (cell->depth_stencil != NULL) {
+	 dsat.base = (intptr_t) cell->depth_stencil->code.store;
+	 dsat.size = (char *) cell->depth_stencil->code.csr
+	     - (char *) cell->depth_stencil->code.store;
+	 dsat.read_depth = TRUE;
+	 dsat.read_stencil = FALSE;
+      } else {
+	 dsat.base = 0;
+	 dsat.size = 0;
+	 dsat.read_depth = FALSE;
+	 dsat.read_stencil = FALSE;
+      }
 
       {
 	 uint32_t *p = cell->depth_stencil->code.store;
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 8e46f6e471..b8bb8e9449 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -260,13 +260,22 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
 
    ASSERT_ALIGN16(state->base);
 
-   mfc_get(depth_stencil_code_buffer,
-	   (unsigned int) state->base,  /* src */
-	   ROUNDUP16(state->size),
-	   TAG_BATCH_BUFFER,
-	   0, /* tid */
-	   0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
+   if (state->size != 0) {
+      mfc_get(depth_stencil_code_buffer,
+	      (unsigned int) state->base,  /* src */
+	      ROUNDUP16(state->size),
+	      TAG_BATCH_BUFFER,
+	      0, /* tid */
+	      0  /* rid */);
+      wait_on_mask(1 << TAG_BATCH_BUFFER);
+   } else {
+      /* If there is no code, emit a return instruction.
+       */
+      depth_stencil_code_buffer[0] = 0x35;
+      depth_stencil_code_buffer[1] = 0x00;
+      depth_stencil_code_buffer[2] = 0x00;
+      depth_stencil_code_buffer[3] = 0x00;
+   }
 
    spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
    spu.read_depth = state->read_depth;
-- 
cgit v1.2.3


From 5fdaebc51c5433ebc73f89690fd435dae2fcef60 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 18 Mar 2008 10:26:45 -0700
Subject: cell: Minor changes to make stencil not crash

I'm not sure these are quite correct.  The reflect demo doesn't assert
anymore, but it doesn't produce correct results either.  SPE-based
vertex shader code needs to be disabled for relfect to run.
---
 src/gallium/drivers/cell/spu/spu_main.c | 12 +++++++++---
 src/gallium/drivers/cell/spu/spu_tile.h |  4 ++--
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index b8bb8e9449..122cf337a6 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -218,12 +218,18 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
    spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
    spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
 
-   if (spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM)
+   switch (spu.fb.depth_format) {
+   case PIPE_FORMAT_Z32_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:
       spu.fb.zsize = 4;
-   else if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM)
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
       spu.fb.zsize = 2;
-   else
+      break;
+   default:
       spu.fb.zsize = 0;
+      break;
+   }
 
    if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
       spu.color_shuffle = ((vector unsigned char) {
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
index 3105b848fd..1b5491112d 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.h
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -56,13 +56,13 @@ clear_c_tile(tile_t *ctile)
 static INLINE void
 clear_z_tile(tile_t *ztile)
 {
-   if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
+   if (spu.fb.zsize == 2) {
       memset16((ushort*) ztile->us,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
    }
    else {
-      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
+      ASSERT(spu.fb.zsize != 0);
       memset32((uint*) ztile->ui,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
-- 
cgit v1.2.3


From 17b234ae3319d8a36afc44d0cceb30fea6b42d67 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 18 Mar 2008 11:47:37 -0700
Subject: cell: Fix depth read / write for s8z24.

Stencil is still broken.
---
 src/gallium/drivers/cell/spu/spu_main.c            |  1 +
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 25 ++++++++++++++++++++--
 2 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 122cf337a6..937962285d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -221,6 +221,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
    switch (spu.fb.depth_format) {
    case PIPE_FORMAT_Z32_UNORM:
    case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
       spu.fb.zsize = 4;
       break;
    case PIPE_FORMAT_Z16_UNORM:
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index d42b522b41..06d68f5604 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -76,6 +76,16 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       
 
    case PIPE_FORMAT_Z24S8_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword mask = si_fsmbi(0xEEEE);
+
+      *depth = si_rotmai(si_and(*ptr, mask), -8);
+      *stencil = si_andc(*ptr, mask);
+      break;
+   }
+
+
+   case PIPE_FORMAT_S8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       qword mask = si_fsmbi(0x7777);
 
@@ -124,10 +134,20 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 
 
    case PIPE_FORMAT_Z24S8_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword mask = si_fsmbi(0xEEEE);
+
+      depth = si_shli(depth, 8);
+      *ptr = si_selb(stencil, depth, mask);
+      break;
+   }
+
+
+   case PIPE_FORMAT_S8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       qword mask = si_fsmbi(0x7777);
 
-      stencil = si_rotmai(stencil, 24);
+      stencil = si_shli(stencil, 24);
       *ptr = si_selb(stencil, depth, mask);
       break;
    }
@@ -167,11 +187,12 @@ spu_do_depth_stencil(int x, int y,
       frag_depth = si_cfltu(frag_depth, 0);
       break;
    case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
       frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu)));
       frag_depth = si_cfltu(frag_depth, 0);
       break;
    default:
-      assert(0);
+      ASSERT(0);
       break;
    }
 
-- 
cgit v1.2.3


From fa69a6e1bb7a1fe96848456255e5370f1904706d Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 18 Mar 2008 15:59:06 -0700
Subject: cell: Correctly load stencil for PIPE_FORMAT_S8Z24_UNORM

---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 06d68f5604..b4cffeeb32 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -87,10 +87,9 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 
    case PIPE_FORMAT_S8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      qword mask = si_fsmbi(0x7777);
 
-      *depth = si_and(*ptr, mask);
-      *stencil = si_rotmai(si_andc(*ptr, mask), -24);
+      *depth = si_and(*ptr, si_fsmbi(0x7777));
+      *stencil = si_andi(si_roti(*ptr, 8), 0x0ff);
       break;
    }
 
-- 
cgit v1.2.3


From 2b21bde3b1fa6fe357a3a5adc6249e89d6915524 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 19 Mar 2008 17:29:39 -0700
Subject: cell: Use code-gen for alpha blend

So far this is only tested when GL_BLEND is disabled.
---
 src/gallium/drivers/cell/common.h              | 10 +++++
 src/gallium/drivers/cell/ppu/cell_pipe_state.c | 11 +++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 17 +++++--
 src/gallium/drivers/cell/spu/Makefile          |  1 -
 src/gallium/drivers/cell/spu/spu_blend.c       | 62 --------------------------
 src/gallium/drivers/cell/spu/spu_blend.h       | 37 ---------------
 src/gallium/drivers/cell/spu/spu_main.c        | 50 ++++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_main.h        | 17 ++++++-
 src/gallium/drivers/cell/spu/spu_tri.c         | 56 +++++++++++++++--------
 9 files changed, 129 insertions(+), 132 deletions(-)
 delete mode 100644 src/gallium/drivers/cell/spu/spu_blend.c
 delete mode 100644 src/gallium/drivers/cell/spu/spu_blend.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index fe93fd8e1a..d59e4f7036 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -114,6 +114,16 @@ struct cell_command_depth_stencil_alpha_test {
 };
 
 
+/**
+ * Upload code to perform framebuffer blend operation
+ */
+struct cell_command_blend {
+   uint64_t base;               /**< Effective address of code start. */
+   unsigned size;               /**< Size in bytes of test code. */
+   unsigned read_fb;            /**< Flag: should framebuffer be read? */
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index c880760e4b..4ca8c153b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -54,14 +54,19 @@ cell_create_blend_state(struct pipe_context *pipe,
 
 
 static void
-cell_bind_blend_state(struct pipe_context *pipe, void *blend)
+cell_bind_blend_state(struct pipe_context *pipe, void *state)
 {
    struct cell_context *cell = cell_context(pipe);
+   struct cell_blend_state *blend = (struct cell_blend_state *) state;
+
 
    draw_flush(cell->draw);
 
-   cell->blend = (const struct cell_blend_state *)blend;
+   if ((blend != NULL) && (blend->code.store == NULL)) {
+      cell_generate_depth_stencil_test(blend);
+   }
 
+   cell->blend = blend;
    cell->dirty |= CELL_NEW_BLEND;
 }
 
@@ -70,7 +75,7 @@ static void
 cell_delete_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_blend_state *cb = (struct cell_blend_state *) blend;
-   
+
    spe_release_func(& cb->code);
    FREE(cb);
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 31cc938f07..5709b48f12 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -65,9 +65,20 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & CELL_NEW_BLEND) {
-      emit_state_cmd(cell, CELL_CMD_STATE_BLEND,
-                     cell->blend,
-                     sizeof(struct pipe_blend_state));
+      struct cell_command_blend blend;
+
+      if (cell->blend != NULL) {
+         blend.base = (intptr_t) cell->blend->code.store;
+         blend.size = (char *) cell->blend->code.csr
+             - (char *) cell->blend->code.store;
+         blend.read_fb = TRUE;
+      } else {
+         blend.base = 0;
+         blend.size = 0;
+         blend.read_fb = FALSE;
+      }
+
+      emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend));
    }
 
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 115ca8cd90..8e83610790 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -17,7 +17,6 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
-	spu_blend.c \
 	spu_dcache.c \
 	spu_per_fragment_op.c \
 	spu_render.c \
diff --git a/src/gallium/drivers/cell/spu/spu_blend.c b/src/gallium/drivers/cell/spu/spu_blend.c
deleted file mode 100644
index 23ec0eeb45..0000000000
--- a/src/gallium/drivers/cell/spu/spu_blend.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#include "spu_main.h"
-#include "spu_blend.h"
-#include "spu_colorpack.h"
-
-
-void
-blend_quad(uint itx, uint ity, vector float colors[4])
-{
-   /* simple SRC_ALPHA, ONE_MINUS_SRC_ALPHA blending */
-   vector float fbc00 = spu_unpack_color(spu.ctile.ui[ity][itx]);
-   vector float fbc01 = spu_unpack_color(spu.ctile.ui[ity][itx+1]);
-   vector float fbc10 = spu_unpack_color(spu.ctile.ui[ity+1][itx]);
-   vector float fbc11 = spu_unpack_color(spu.ctile.ui[ity+1][itx+1]);
-
-   vector float alpha00 = spu_splats(spu_extract(colors[0], 3));
-   vector float alpha01 = spu_splats(spu_extract(colors[1], 3));
-   vector float alpha10 = spu_splats(spu_extract(colors[2], 3));
-   vector float alpha11 = spu_splats(spu_extract(colors[3], 3));
-
-   vector float one_minus_alpha00 = spu_sub(spu_splats(1.0f), alpha00);
-   vector float one_minus_alpha01 = spu_sub(spu_splats(1.0f), alpha01);
-   vector float one_minus_alpha10 = spu_sub(spu_splats(1.0f), alpha10);
-   vector float one_minus_alpha11 = spu_sub(spu_splats(1.0f), alpha11);
-
-   colors[0] = spu_add(spu_mul(colors[0], alpha00),
-                       spu_mul(fbc00, one_minus_alpha00));
-   colors[1] = spu_add(spu_mul(colors[1], alpha01),
-                       spu_mul(fbc01, one_minus_alpha01));
-   colors[2] = spu_add(spu_mul(colors[2], alpha10),
-                       spu_mul(fbc10, one_minus_alpha10));
-   colors[3] = spu_add(spu_mul(colors[3], alpha11),
-                       spu_mul(fbc11, one_minus_alpha11));
-}
-
diff --git a/src/gallium/drivers/cell/spu/spu_blend.h b/src/gallium/drivers/cell/spu/spu_blend.h
deleted file mode 100644
index 2b594b578b..0000000000
--- a/src/gallium/drivers/cell/spu/spu_blend.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef SPU_BLEND_H
-#define SPU_BLEND_H
-
-
-extern void
-blend_quad(uint itx, uint ity, vector float colors[4]);
-
-
-#endif /* SPU_BLEND_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 937962285d..41bebf5362 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -61,6 +61,25 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX]
 static unsigned char depth_stencil_code_buffer[4 * 64]
     ALIGN16_ATTRIB;
 
+static unsigned char fb_blend_code_buffer[4 * 64]
+    ALIGN16_ATTRIB;
+
+static struct spu_blend_results
+default_blend(qword frag_r, qword frag_g, qword frag_b, qword frag_a,
+              qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+              qword frag_mask)
+{
+   struct spu_blend_results result;
+
+   result.r = si_selb(pixel_r, frag_r, frag_mask);
+   result.g = si_selb(pixel_g, frag_g, frag_mask);
+   result.b = si_selb(pixel_b, frag_b, frag_mask);
+   result.a = si_selb(pixel_a, frag_a, frag_mask);
+
+   return result;
+}
+
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -246,14 +265,31 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 
 
 static void
-cmd_state_blend(const struct pipe_blend_state *state)
+cmd_state_blend(const struct cell_command_blend *state)
 {
    if (Debug)
       printf("SPU %u: BLEND: enabled %d\n",
              spu.init.id,
-             state->blend_enable);
+             (state->size != 0));
+
+   ASSERT_ALIGN16(state->base);
 
-   memcpy(&spu.blend, state, sizeof(*state));
+   if (state->size != 0) {
+      mfc_get(fb_blend_code_buffer,
+              (unsigned int) state->base,  /* src */
+              ROUNDUP16(state->size),
+              TAG_BATCH_BUFFER,
+              0, /* tid */
+              0  /* rid */);
+      wait_on_mask(1 << TAG_BATCH_BUFFER);
+      spu.blend = (blend_func) fb_blend_code_buffer;
+      spu.read_fb = state->read_fb;
+   } else {
+      /* If there is no code, use the default;
+       */
+      spu.blend = default_blend;
+      spu.read_fb = FALSE;
+   }
 }
 
 
@@ -441,9 +477,8 @@ cmd_batch(uint opcode)
          pos += 1;
          break;
       case CELL_CMD_STATE_BLEND:
-         cmd_state_blend((struct pipe_blend_state *)
-                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
+         cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8);
          break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *)
@@ -587,6 +622,9 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
+
+   spu.blend = default_blend;
+   spu.read_fb = FALSE;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 444e218645..56d0968676 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -67,6 +67,18 @@ typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask,
     qword frag_alpha, qword facing);
 
 
+struct spu_blend_results {
+   qword r;
+   qword g;
+   qword b;
+   qword a;
+};
+
+typedef struct spu_blend_results (*blend_func)(
+    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
+    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+    qword frag_mask);
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -93,7 +105,10 @@ struct spu_global
    boolean read_depth;
    boolean read_stencil;
    frag_test_func frag_test;
-   struct pipe_blend_state blend;
+   
+   boolean read_fb;
+   blend_func blend;
+
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 81823f2463..c9f8cadcda 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -29,10 +29,10 @@
  * Triangle rendering within a tile.
  */
 
+#include <transpose_matrix4x4.h>
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
-#include "spu_blend.h"
 #include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -326,27 +326,45 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(1, (float) x, (float) y, colors);
       }
 
-#if 1
-      if (spu.blend.blend_enable)
-         blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
-#endif
 
-      if (spu_extract(mask, 0))
-         spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
-      if (spu_extract(mask, 1))
-         spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
-      if (spu_extract(mask, 2))
-         spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
-      if (spu_extract(mask, 3))
-         spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
+      /* Read the current framebuffer values.
+       *
+       * Ignore read_fb for now.  In the future we can use this to avoid
+       * reading the framebuffer if read_fb is false and the fragment mask is
+       * all 0xffffffff.  This is the common case, so it is probably worth
+       * the effort.  We'll have to profile to determine whether or not the
+       * extra conditional branches hurt overall performance.
+       */
+      vec_float4 aos_pix[4] = {
+         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
+         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
+         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
+         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+      };
 
-#if 0
-      /* SIMD_Z with swizzled color buffer (someday) */
-      vector unsigned int uicolors = *((vector unsigned int *) &colors);
-      spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
-#endif
-   }
+      qword soa_pix[4];
+      qword soa_frag[4];
 
+      /* Convert pixel and fragment data from AoS to SoA format.
+       */
+      _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
+      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+
+      const struct spu_blend_results result =
+          (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
+                       soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
+                       (qword) mask);
+
+
+      /* Convert final pixel data from SoA to AoS format.
+       */
+      _transpose_matrix4x4(aos_pix, (const vec_float4 *) &result);
+
+      spu.ctile.ui[iy+0][ix+0] = spu_pack_color_shuffle(aos_pix[0], shuffle);
+      spu.ctile.ui[iy+0][ix+1] = spu_pack_color_shuffle(aos_pix[1], shuffle);
+      spu.ctile.ui[iy+1][ix+0] = spu_pack_color_shuffle(aos_pix[2], shuffle);
+      spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(aos_pix[3], shuffle);
+   }
 #endif
 }
 
-- 
cgit v1.2.3


From df1d6e2410dbc6af66ca416124587918b9764ee8 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 20 Mar 2008 17:36:05 -0700
Subject: cell: Fix bus error when there is no depth buffer

---
 src/gallium/drivers/cell/spu/spu_tri.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index c9f8cadcda..c4272d6e93 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -261,6 +261,9 @@ do_depth_test(int x, int y, mask_t quadmask)
    float4 zvals;
    mask_t mask;
 
+   if (spu.fb.depth_format == PIPE_FORMAT_NONE)
+      return quadmask;
+
    zvals.v = eval_z((float) x, (float) y);
 
    mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx,
-- 
cgit v1.2.3


From 2902c164a22b6bcb6a42d7cd7fa82b608875093b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 21 Mar 2008 10:23:52 -0700
Subject: cell: Remove unnecessary default_blend work-around

I suspect that there was some other bug in the blend code-gen that
made this work-around necessary.
---
 src/gallium/drivers/cell/spu/spu_main.c | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 41bebf5362..0a490ab277 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -64,21 +64,6 @@ static unsigned char depth_stencil_code_buffer[4 * 64]
 static unsigned char fb_blend_code_buffer[4 * 64]
     ALIGN16_ATTRIB;
 
-static struct spu_blend_results
-default_blend(qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-              qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-              qword frag_mask)
-{
-   struct spu_blend_results result;
-
-   result.r = si_selb(pixel_r, frag_r, frag_mask);
-   result.g = si_selb(pixel_g, frag_g, frag_mask);
-   result.b = si_selb(pixel_b, frag_b, frag_mask);
-   result.a = si_selb(pixel_a, frag_a, frag_mask);
-
-   return result;
-}
-
 
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
@@ -285,9 +270,6 @@ cmd_state_blend(const struct cell_command_blend *state)
       spu.blend = (blend_func) fb_blend_code_buffer;
       spu.read_fb = state->read_fb;
    } else {
-      /* If there is no code, use the default;
-       */
-      spu.blend = default_blend;
       spu.read_fb = FALSE;
    }
 }
@@ -622,9 +604,6 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
-
-   spu.blend = default_blend;
-   spu.read_fb = FALSE;
 }
 
 
-- 
cgit v1.2.3


From 600499cf888fee9a91ff3106beca939ea0c7b2bd Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 21 Mar 2008 11:15:49 -0700
Subject: cell: Change code-gen for CONST_COLOR blend factor

Previously the constant color blend factor was compiled into the
generated code.  This meant that the code had to be regenerated each
time the constant color was changed.  This doesn't fit with the model
used in Gallium.

As-is, the code could be better.  The constant color is loaded for
every quad processed, even if it is not used.  Also, if a lot of (1-x)
blend factors are used, 1.0 will be loaded and reloaded into registers
many times.
---
 src/gallium/drivers/cell/ppu/cell_pipe_state.c     |  2 +-
 .../drivers/cell/ppu/cell_state_per_fragment.c     | 93 +++++++++++-----------
 .../drivers/cell/ppu/cell_state_per_fragment.h     |  3 +-
 src/gallium/drivers/cell/spu/spu_main.h            |  2 +
 src/gallium/drivers/cell/spu/spu_tri.c             |  2 +
 5 files changed, 53 insertions(+), 49 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 86fcdcff1f..d956d6fad4 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -63,7 +63,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *state)
    draw_flush(cell->draw);
 
    if ((blend != NULL) && (blend->code.store == NULL)) {
-      cell_generate_alpha_blend(blend, &cell->blend_color);
+      cell_generate_alpha_blend(blend);
    }
 
    cell->blend = blend;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index f353aeab0a..c750b1d89d 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -588,19 +588,13 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
  */
 static int
 emit_alpha_factor_calculation(struct spe_function *f,
-                              unsigned factor, float const_alpha,
-                              int src_alpha, int dst_alpha)
+                              unsigned factor,
+                              int src_alpha, int dst_alpha, int const_alpha)
 {
-   union {
-      float f;
-      unsigned u;
-   } alpha;
    int factor_reg;
    int tmp;
 
 
-   alpha.f = const_alpha;
-
    switch (factor) {
    case PIPE_BLENDFACTOR_ONE:
       factor_reg = -1;
@@ -621,13 +615,17 @@ emit_alpha_factor_calculation(struct spe_function *f,
       break;
 
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      const_alpha = 1.0 - const_alpha;
-      /* FALLTHROUGH */
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
       factor_reg = spe_allocate_available_register(f);
 
-      spe_il(f, factor_reg, alpha.u & 0x0ffff);
-      spe_ilh(f, factor_reg, alpha.u >> 16);
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor_reg, tmp, const_alpha);
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor_reg = const_alpha;
       break;
 
    case PIPE_BLENDFACTOR_ZERO:
@@ -674,24 +672,15 @@ emit_alpha_factor_calculation(struct spe_function *f,
 static void
 emit_color_factor_calculation(struct spe_function *f,
                               unsigned sF, unsigned mask,
-                              const struct pipe_blend_color *blend_color,
                               const int *src,
                               const int *dst,
+                              const int *const_color,
                               int *factor)
 {
-   union {
-      float f[4];
-      unsigned u[4];
-   } color;
    int tmp;
    unsigned i;
 
 
-   color.f[0] = blend_color->color[0];
-   color.f[1] = blend_color->color[1];
-   color.f[2] = blend_color->color[2];
-   color.f[3] = blend_color->color[3];
-
    factor[0] = -1;
    factor[1] = -1;
    factor[2] = -1;
@@ -748,29 +737,40 @@ emit_color_factor_calculation(struct spe_function *f,
       break;
 
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      color.f[0] = 1.0 - color.f[0];
-      color.f[1] = 1.0 - color.f[1];
-      color.f[2] = 1.0 - color.f[2];
-      /* FALLTHROUGH */
-   case PIPE_BLENDFACTOR_CONST_COLOR:
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
       for (i = 0; i < 3; i++) {
          factor[i] = spe_allocate_available_register(f);
 
-         spe_il(f, factor[i], color.u[i] & 0x0ffff);
-         spe_ilh(f, factor[i], color.u[i] >> 16);
+         spe_fs(f, factor[i], tmp, const_color[i]);
+      }
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      for (i = 0; i < 3; i++) {
+         factor[i] = const_color[i];
       }
       break;
 
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      color.f[3] = 1.0 - color.f[3];
-      /* FALLTHROUGH */
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
       factor[0] = spe_allocate_available_register(f);
       factor[1] = factor[0];
       factor[2] = factor[0];
 
-      spe_il(f, factor[0], color.u[3] & 0x0ffff);
-      spe_ilh(f, factor[0], color.u[3] >> 16);
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, const_color[3]);
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor[0] = const_color[3];
+      factor[1] = factor[0];
+      factor[2] = factor[0];
       break;
 
    case PIPE_BLENDFACTOR_ZERO:
@@ -945,8 +945,7 @@ emit_blend_calculation(struct spe_function *f,
  * Generate code to perform alpha blending on the SPE
  */
 void
-cell_generate_alpha_blend(struct cell_blend_state *cb,
-                          const struct pipe_blend_color *blend_color)
+cell_generate_alpha_blend(struct cell_blend_state *cb)
 {
    struct pipe_blend_state *const b = &cb->base;
    struct spe_function *const f = &cb->code;
@@ -972,7 +971,13 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
       spe_allocate_register(f, 9),
       spe_allocate_register(f, 10),
    };
-   const int mask = spe_allocate_register(f, 11);
+   const int const_color[4] = {
+      spe_allocate_register(f, 11),
+      spe_allocate_register(f, 12),
+      spe_allocate_register(f, 13),
+      spe_allocate_register(f, 14),
+   };
+   const int mask = spe_allocate_register(f, 15);
    unsigned func[4];
    unsigned sF[4];
    unsigned dF[4];
@@ -1053,8 +1058,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
     * the alpha factor, calculate the alpha factor.
     */
    if (((b->colormask & 8) != 0) && need_alpha_factor) {
-      src_factor[3] = emit_alpha_factor_calculation(f, sF[3],
-                                                    blend_color->color[3],
+      src_factor[3] = emit_alpha_factor_calculation(f, sF[3], const_color[3],
                                                     frag[3], pixel[3]);
 
       /* If the alpha destination blend factor is the same as the alpha source
@@ -1062,8 +1066,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
        */
       dst_factor[3] = (dF[3] == sF[3])
           ? src_factor[3]
-          : emit_alpha_factor_calculation(f, dF[3],
-                                          blend_color->color[3],
+          : emit_alpha_factor_calculation(f, dF[3], const_color[3],
                                           frag[3], pixel[3]);
    }
 
@@ -1080,8 +1083,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
       emit_color_factor_calculation(f,
                                     b->rgb_src_factor,
                                     b->colormask,
-                                    blend_color,
-                                    frag, pixel, src_factor);
+                                    frag, pixel, const_color, src_factor);
    }
 
 
@@ -1101,8 +1103,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
       emit_color_factor_calculation(f,
                                     b->rgb_dst_factor,
                                     b->colormask,
-                                    blend_color,
-                                    frag, pixel, dst_factor);
+                                    frag, pixel, const_color, dst_factor);
    }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
index 541c3b3be0..f699247f9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
@@ -29,7 +29,6 @@ extern void
 cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa);
 
 extern void
-cell_generate_alpha_blend(struct cell_blend_state *cb,
-			  const struct pipe_blend_color *blend_color);
+cell_generate_alpha_blend(struct cell_blend_state *cb);
 
 #endif /* CELL_STATE_PER_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 56d0968676..49f5d99674 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -77,6 +77,7 @@ struct spu_blend_results {
 typedef struct spu_blend_results (*blend_func)(
     qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+    qword const_r, qword const_g, qword const_b, qword const_a,
     qword frag_mask);
 
 struct spu_framebuffer {
@@ -108,6 +109,7 @@ struct spu_global
    
    boolean read_fb;
    blend_func blend;
+   qword const_blend_color[4] ALIGN16_ATTRIB;
 
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index c4272d6e93..e6a1ce01df 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -356,6 +356,8 @@ emit_quad( int x, int y, mask_t mask )
       const struct spu_blend_results result =
           (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
                        soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
+                       spu.const_blend_color[0], spu.const_blend_color[1],
+                       spu.const_blend_color[2], spu.const_blend_color[3],
                        (qword) mask);
 
 
-- 
cgit v1.2.3


From 92126cea846959bb2152905a7712753d1114bd6b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 26 Mar 2008 10:45:32 -0700
Subject: cell: Implement code-gen for logic op

This also implements code-gen for the float-to-packed color
conversion.  It's currently hardcoded for A8R8G8B8, but that can
easily be fixed as soon as other color depths are supported by the
Cell driver.
---
 src/gallium/drivers/cell/common.h                  |  11 +-
 src/gallium/drivers/cell/ppu/cell_context.h        |   2 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  17 ++
 .../drivers/cell/ppu/cell_state_per_fragment.c     | 261 ++++++++++++++++++++-
 .../drivers/cell/ppu/cell_state_per_fragment.h     |   4 +
 src/gallium/drivers/cell/spu/spu_main.c            |  19 ++
 src/gallium/drivers/cell/spu/spu_main.h            |   9 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  59 +++--
 8 files changed, 349 insertions(+), 33 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d59e4f7036..b0928fefd2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -92,8 +92,9 @@
 #define CELL_CMD_STATE_BIND_VS       18
 #define CELL_CMD_STATE_BLEND         19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
-#define CELL_CMD_VS_EXECUTE          21
-#define CELL_CMD_FLUSH_BUFFER_RANGE  22
+#define CELL_CMD_STATE_LOGICOP       21
+#define CELL_CMD_VS_EXECUTE          22
+#define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
 
 #define CELL_NUM_BUFFERS 4
@@ -124,6 +125,12 @@ struct cell_command_blend {
 };
 
 
+struct cell_command_logicop {
+   uint64_t base;               /**< Effective address of code start. */
+   unsigned size;               /**< Size in bytes of test code. */
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 9e79db0ace..0442abddc1 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -92,6 +92,8 @@ struct cell_context
    const struct cell_vertex_shader_state *vs;
    const struct cell_fragment_shader_state *fs;
 
+   struct spe_function logic_op;
+
    struct pipe_blend_color blend_color;
    struct pipe_clip_state clip;
    struct pipe_constant_buffer constants[2];
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 5709b48f12..5c1310d0b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -50,6 +50,23 @@ emit_state_cmd(struct cell_context *cell, uint cmd,
 void
 cell_emit_state(struct cell_context *cell)
 {
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) {
+      struct cell_command_logicop logicop;
+
+      if (cell->logic_op.store != NULL) {
+	 spe_release_func(& cell->logic_op);
+      }
+
+      cell_generate_logic_op(& cell->logic_op,
+			     & cell->blend->base,
+			     cell->framebuffer.cbufs[0]);
+
+      logicop.base = (intptr_t) cell->logic_op.store;
+      logicop.size = 64 * 4;
+      emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop,
+		     sizeof(logicop));
+   }
+
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index c750b1d89d..f10025bd7c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -977,7 +977,6 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
       spe_allocate_register(f, 13),
       spe_allocate_register(f, 14),
    };
-   const int mask = spe_allocate_register(f, 15);
    unsigned func[4];
    unsigned sF[4];
    unsigned dF[4];
@@ -1114,9 +1113,6 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
                                 func[i], sF[i], dF[i],
                                 frag[i], src_factor[i],
                                 pixel[i], dst_factor[i]);
-         spe_selb(f, frag[i], pixel[i], frag[i], mask);
-      } else {
-         spe_or(f, frag[i], pixel[i], pixel[i]);
       }
    }
 
@@ -1146,3 +1142,260 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
    }
 #endif
 }
+
+
+int PC_OFFSET(const struct spe_function *f, const void *d)
+{
+   const intptr_t pc = (intptr_t) f->csr;
+   const intptr_t ea = ~0x0f & (intptr_t) d;
+
+   return (ea - pc) >> 2;
+}
+
+
+/**
+ * Generate code to perform color conversion and logic op
+ *
+ * \bug
+ * The code generated by this function should also perform dithering.
+ *
+ * \bug
+ * The code generated by this function should also perform color-write
+ * masking.
+ *
+ * \bug
+ * This routine is hard-coded to only work with ARGB8 data.
+ */
+void
+cell_generate_logic_op(struct spe_function *f, struct pipe_blend_state *blend,
+                       struct pipe_surface *surf)
+{
+   const unsigned logic_op = (blend->logicop_enable)
+       ? blend->logicop_func : PIPE_LOGICOP_COPY;
+
+   /* This code generates a maximum of 37 instructions.  An additional 32
+    * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
+    * to 64 to make it a happy power-of-two.
+    */
+   spe_init_func(f, 4 * 64);
+
+
+   /* Pixel colors in framebuffer format in AoS layout.
+    */
+   const int pixel[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+
+   /* Fragment colors stored as floats in SoA layout.
+    */
+   const int frag[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+
+   const int mask = spe_allocate_register(f, 11);
+
+
+   /* Short-circuit the noop and invert cases.
+    */
+   if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->colormask == 0)) {
+      spe_bi(f, 0, 0, 0);
+      return;
+   } else if (logic_op == PIPE_LOGICOP_INVERT) {
+      spe_nor(f, pixel[0], pixel[0], pixel[0]);
+      spe_nor(f, pixel[1], pixel[1], pixel[1]);
+      spe_nor(f, pixel[2], pixel[2], pixel[2]);
+      spe_nor(f, pixel[3], pixel[3], pixel[3]);
+      spe_bi(f, 0, 0, 0);
+      return;
+   }
+
+
+   const int tmp[4] = {
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+   };
+
+   const int shuf_xpose_hi = spe_allocate_available_register(f);
+   const int shuf_xpose_lo = spe_allocate_available_register(f);
+   const int shuf_color = spe_allocate_available_register(f);
+
+
+   /* Pointer to the begining of the function's private data area.
+    */
+   uint32_t *const data = ((uint32_t *) f->store) + (64 - 8);
+
+
+   /* Convert fragment colors to framebuffer format in AoS layout.
+    */
+   data[0] = 0x00010203;
+   data[1] = 0x10111213;
+   data[2] = 0x04050607;
+   data[3] = 0x14151617;
+
+   data[4] = 0x0c000408;
+   data[5] = 0x80808080;
+   data[6] = 0x80808080;
+   data[7] = 0x80808080;
+
+   spe_ilh(f, tmp[0], 0x0808);
+   spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0));
+   spe_lqr(f, shuf_color, PC_OFFSET(f, data+4));
+   spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]);
+
+   spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi);
+   spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo);
+   spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi);
+   spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo);
+
+   spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi);
+   spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo);
+   spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi);
+   spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo);
+
+   spe_cfltu(f, frag[0], frag[0], 32);
+   spe_cfltu(f, frag[1], frag[1], 32);
+   spe_cfltu(f, frag[2], frag[2], 32);
+   spe_cfltu(f, frag[3], frag[3], 32);
+
+   spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color);
+   spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color);
+   spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color);
+   spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color);
+
+
+   /* If logic op is enabled, perform the requested logical operation on the
+    * converted fragment colors and the pixel colors.
+    */
+   switch (logic_op) {
+   case PIPE_LOGICOP_CLEAR:
+      spe_il(f, frag[0], 0);
+      spe_il(f, frag[1], 0);
+      spe_il(f, frag[2], 0);
+      spe_il(f, frag[3], 0);
+      break;
+   case PIPE_LOGICOP_NOR:
+      spe_nor(f, frag[0], frag[0], pixel[0]);
+      spe_nor(f, frag[1], frag[1], pixel[1]);
+      spe_nor(f, frag[2], frag[2], pixel[2]);
+      spe_nor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      spe_andc(f, frag[0], pixel[0], frag[0]);
+      spe_andc(f, frag[1], pixel[1], frag[1]);
+      spe_andc(f, frag[2], pixel[2], frag[2]);
+      spe_andc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      spe_nor(f, frag[0], frag[0], frag[0]);
+      spe_nor(f, frag[1], frag[1], frag[1]);
+      spe_nor(f, frag[2], frag[2], frag[2]);
+      spe_nor(f, frag[3], frag[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      spe_andc(f, frag[0], frag[0], pixel[0]);
+      spe_andc(f, frag[1], frag[1], pixel[1]);
+      spe_andc(f, frag[2], frag[2], pixel[2]);
+      spe_andc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_XOR:
+      spe_xor(f, frag[0], frag[0], pixel[0]);
+      spe_xor(f, frag[1], frag[1], pixel[1]);
+      spe_xor(f, frag[2], frag[2], pixel[2]);
+      spe_xor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_NAND:
+      spe_nand(f, frag[0], frag[0], pixel[0]);
+      spe_nand(f, frag[1], frag[1], pixel[1]);
+      spe_nand(f, frag[2], frag[2], pixel[2]);
+      spe_nand(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND:
+      spe_and(f, frag[0], frag[0], pixel[0]);
+      spe_and(f, frag[1], frag[1], pixel[1]);
+      spe_and(f, frag[2], frag[2], pixel[2]);
+      spe_and(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      spe_eqv(f, frag[0], frag[0], pixel[0]);
+      spe_eqv(f, frag[1], frag[1], pixel[1]);
+      spe_eqv(f, frag[2], frag[2], pixel[2]);
+      spe_eqv(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      spe_orc(f, frag[0], pixel[0], frag[0]);
+      spe_orc(f, frag[1], pixel[1], frag[1]);
+      spe_orc(f, frag[2], pixel[2], frag[2]);
+      spe_orc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY:
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      spe_orc(f, frag[0], frag[0], pixel[0]);
+      spe_orc(f, frag[1], frag[1], pixel[1]);
+      spe_orc(f, frag[2], frag[2], pixel[2]);
+      spe_orc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR:
+      spe_or(f, frag[0], frag[0], pixel[0]);
+      spe_or(f, frag[1], frag[1], pixel[1]);
+      spe_or(f, frag[2], frag[2], pixel[2]);
+      spe_or(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_SET:
+      spe_il(f, frag[0], ~0);
+      spe_il(f, frag[1], ~0);
+      spe_il(f, frag[2], ~0);
+      spe_il(f, frag[3], ~0);
+      break;
+
+   /* These two cases are short-circuited above.
+    */
+   case PIPE_LOGICOP_INVERT:
+   case PIPE_LOGICOP_NOOP:
+   default:
+      assert(0);
+   }
+
+
+   /* Apply fragment mask.
+    */
+   spe_ilh(f, tmp[0], 0x0000);
+   spe_ilh(f, tmp[1], 0x0404);
+   spe_ilh(f, tmp[2], 0x0808);
+   spe_ilh(f, tmp[3], 0x0c0c);
+
+   spe_shufb(f, tmp[0], mask, mask, tmp[0]);
+   spe_shufb(f, tmp[1], mask, mask, tmp[1]);
+   spe_shufb(f, tmp[2], mask, mask, tmp[2]);
+   spe_shufb(f, tmp[3], mask, mask, tmp[3]);
+
+   spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]);
+   spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]);
+   spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]);
+   spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]);
+
+   spe_bi(f, 0, 0, 0);
+
+#if 0
+   {
+      const uint32_t *p = f->store;
+      unsigned i;
+
+      printf("# %u instructions\n", f->csr - f->store);
+
+      printf("\t.text\n");
+      for (i = 0; i < 64; i++) {
+         printf("\t.long\t0x%04x\n", p[i]);
+      }
+      fflush(stdout);
+   }
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
index f699247f9e..ab4de96c69 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
@@ -31,4 +31,8 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa);
 extern void
 cell_generate_alpha_blend(struct cell_blend_state *cb);
 
+extern void
+cell_generate_logic_op(struct spe_function *f, struct pipe_blend_state *blend,
+    struct pipe_surface *surf);
+
 #endif /* CELL_STATE_PER_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 0a490ab277..fccff01e10 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -64,6 +64,9 @@ static unsigned char depth_stencil_code_buffer[4 * 64]
 static unsigned char fb_blend_code_buffer[4 * 64]
     ALIGN16_ATTRIB;
 
+static unsigned char logicop_code_buffer[4 * 64]
+    ALIGN16_ATTRIB;
+
 
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
@@ -513,6 +516,22 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
       }
+      case CELL_CMD_STATE_LOGICOP: {
+         struct cell_command_logicop *code =
+             (struct cell_command_logicop *) &buffer[pos+1];
+
+              mfc_get(logicop_code_buffer,
+                      (unsigned int) code->base,  /* src */
+                      code->size,
+                      TAG_BATCH_BUFFER,
+                      0, /* tid */
+                      0  /* rid */);
+         wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+	 spu.logicop = (logicop_func) logicop_code_buffer;
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
+         break;
+      }
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 49f5d99674..c20452931a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -77,9 +77,14 @@ struct spu_blend_results {
 typedef struct spu_blend_results (*blend_func)(
     qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword const_r, qword const_g, qword const_b, qword const_a,
+    qword const_r, qword const_g, qword const_b, qword const_a);
+
+typedef struct spu_blend_results (*logicop_func)(
+    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword frag_mask);
 
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -111,6 +116,8 @@ struct spu_global
    blend_func blend;
    qword const_blend_color[4] ALIGN16_ATTRIB;
 
+   logicop_func logicop;
+
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index e6a1ce01df..95c629a8aa 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -305,7 +305,6 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      const vector unsigned char shuffle = spu.color_shuffle;
       vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
@@ -330,45 +329,53 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
+      /* Convert fragment data from AoS to SoA format.
+       */
+      qword soa_frag[4];
+      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+
       /* Read the current framebuffer values.
-       *
-       * Ignore read_fb for now.  In the future we can use this to avoid
-       * reading the framebuffer if read_fb is false and the fragment mask is
-       * all 0xffffffff.  This is the common case, so it is probably worth
-       * the effort.  We'll have to profile to determine whether or not the
-       * extra conditional branches hurt overall performance.
        */
-      vec_float4 aos_pix[4] = {
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+      const qword pix[4] = {
+         (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]),
+         (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]),
+         (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]),
+         (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]),
       };
 
       qword soa_pix[4];
-      qword soa_frag[4];
 
-      /* Convert pixel and fragment data from AoS to SoA format.
-       */
-      _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
-      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+      if (spu.read_fb) {
+         /* Convert pixel data from AoS to SoA format.
+          */
+         vec_float4 aos_pix[4] = {
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+         };
+
+         _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
+      }
 
-      const struct spu_blend_results result =
+
+      struct spu_blend_results result =
           (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
                        soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
                        spu.const_blend_color[0], spu.const_blend_color[1],
-                       spu.const_blend_color[2], spu.const_blend_color[3],
-                       (qword) mask);
+                       spu.const_blend_color[2], spu.const_blend_color[3]);
 
 
       /* Convert final pixel data from SoA to AoS format.
        */
-      _transpose_matrix4x4(aos_pix, (const vec_float4 *) &result);
-
-      spu.ctile.ui[iy+0][ix+0] = spu_pack_color_shuffle(aos_pix[0], shuffle);
-      spu.ctile.ui[iy+0][ix+1] = spu_pack_color_shuffle(aos_pix[1], shuffle);
-      spu.ctile.ui[iy+1][ix+0] = spu_pack_color_shuffle(aos_pix[2], shuffle);
-      spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(aos_pix[3], shuffle);
+      result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
+                              result.r, result.g, result.b, result.a,
+                              (qword) mask);
+
+      spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0);
+      spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
+      spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
+      spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
    }
 #endif
 }
-- 
cgit v1.2.3


From 979358c47115d8ea50001832372f8043a60a5b80 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 27 Mar 2008 15:26:22 -0600
Subject: cell: fix unclosed comment

---
 src/gallium/drivers/cell/spu/spu_exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 061fbebf61..48edc62f49 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -1453,7 +1453,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_TXP:
-      /* Texture lookup with projection
+      /* Texture lookup with projection */
       /* src[0] = texcoord (src[0].w = projection) */
       /* src[1] = sampler unit */
       exec_tex(mach, inst, TRUE, TRUE);
-- 
cgit v1.2.3


From 39038c11699bbc9baab744542e96d54e91cb452a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 27 Mar 2008 17:41:55 -0600
Subject: gallium: replace PIPE_ATTRIB_MAX with PIPE_MAX_ATTRIBS

The later follows the naming scheme of other limits.
Keep the old definition until all possible usage is updated.
---
 src/gallium/auxiliary/draw/draw_context.c        |  4 ++--
 src/gallium/auxiliary/draw/draw_private.h        | 12 ++++++------
 src/gallium/auxiliary/draw/draw_pt_fetch_emit.c  |  2 +-
 src/gallium/auxiliary/draw/draw_vf.c             |  4 ++--
 src/gallium/auxiliary/draw/draw_vf.h             |  2 +-
 src/gallium/auxiliary/draw/draw_vs_exec.c        |  4 ++--
 src/gallium/auxiliary/draw/draw_vs_llvm.c        |  4 ++--
 src/gallium/auxiliary/draw/draw_vs_sse.c         |  4 ++--
 src/gallium/drivers/cell/ppu/cell_context.h      |  6 +++---
 src/gallium/drivers/cell/ppu/cell_draw_arrays.c  |  4 ++--
 src/gallium/drivers/cell/ppu/cell_state_vertex.c |  4 ++--
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |  2 +-
 src/gallium/drivers/cell/spu/spu_main.c          |  4 ++--
 src/gallium/drivers/cell/spu/spu_vertex_shader.c |  4 ++--
 src/gallium/drivers/cell/spu/spu_vertex_shader.h |  8 ++++----
 src/gallium/drivers/failover/fo_context.h        |  4 ++--
 src/gallium/drivers/failover/fo_state_emit.c     |  4 ++--
 src/gallium/drivers/i915simple/i915_context.c    |  4 ++--
 src/gallium/drivers/i915simple/i915_context.h    |  2 +-
 src/gallium/drivers/i965simple/brw_clip.h        |  2 +-
 src/gallium/drivers/i965simple/brw_context.h     | 12 ++++++------
 src/gallium/drivers/i965simple/brw_state.c       |  2 +-
 src/gallium/drivers/i965simple/brw_wm.h          |  4 ++--
 src/gallium/drivers/softpipe/sp_context.h        |  6 +++---
 src/gallium/drivers/softpipe/sp_draw_arrays.c    |  4 ++--
 src/gallium/drivers/softpipe/sp_quad_fs.c        |  4 ++--
 src/gallium/drivers/softpipe/sp_state_vertex.c   |  4 ++--
 src/gallium/include/pipe/p_state.h               |  3 ++-
 src/mesa/state_tracker/st_draw.c                 |  2 +-
 29 files changed, 63 insertions(+), 62 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 903cc26766..81858e01ca 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -238,7 +238,7 @@ draw_set_vertex_buffer(struct draw_context *draw,
                        const struct pipe_vertex_buffer *buffer)
 {
    draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
-   assert(attr < PIPE_ATTRIB_MAX);
+   assert(attr < PIPE_MAX_ATTRIBS);
    draw->vertex_buffer[attr] = *buffer;
 }
 
@@ -249,7 +249,7 @@ draw_set_vertex_element(struct draw_context *draw,
                         const struct pipe_vertex_element *element)
 {
    draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
-   assert(attr < PIPE_ATTRIB_MAX);
+   assert(attr < PIPE_MAX_ATTRIBS);
    draw->vertex_element[attr] = *element;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 7007ee22c4..8eb2f515cb 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -228,8 +228,8 @@ struct draw_context
    /* pipe state that we need: */
    const struct pipe_rasterizer_state *rasterizer;
    struct pipe_viewport_state viewport;
-   struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
-   struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
    struct draw_vertex_shader *vertex_shader;
 
    uint num_vs_outputs;  /**< convenience, from vertex_shader */
@@ -242,7 +242,7 @@ struct draw_context
       unsigned eltSize;
 
       /** vertex arrays */
-      const void *vbuffer[PIPE_ATTRIB_MAX];
+      const void *vbuffer[PIPE_MAX_ATTRIBS];
 
       /** constant buffer (for vertex shader) */
       const void *constants;
@@ -275,9 +275,9 @@ struct draw_context
    /* Vertex fetch internal state
     */
    struct {
-      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
-      unsigned pitch[PIPE_ATTRIB_MAX];
-      fetch_func fetch[PIPE_ATTRIB_MAX];
+      const ubyte *src_ptr[PIPE_MAX_ATTRIBS];
+      unsigned pitch[PIPE_MAX_ATTRIBS];
+      fetch_func fetch[PIPE_MAX_ATTRIBS];
       unsigned nr_attrs;
       full_fetch_func fetch_func;
       pt_fetch_func pt_fetch;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 64ef83d800..9b098bc173 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -78,7 +78,7 @@ struct fetch_emit_middle_end {
       unsigned pitch;
       void (*fetch)( const void *from, float *attrib);
       void (*emit)( const float *attrib, float **out );
-   } fetch[PIPE_ATTRIB_MAX];
+   } fetch[PIPE_MAX_ATTRIBS];
    
    unsigned nr_fetch;
    unsigned hw_vertex_size;
diff --git a/src/gallium/auxiliary/draw/draw_vf.c b/src/gallium/auxiliary/draw/draw_vf.c
index f4e29a6293..7bb34ace7a 100644
--- a/src/gallium/auxiliary/draw/draw_vf.c
+++ b/src/gallium/auxiliary/draw/draw_vf.c
@@ -158,7 +158,7 @@ draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
    unsigned offset = 0;
    unsigned i, j;
 
-   assert(nr < PIPE_ATTRIB_MAX);
+   assert(nr < PIPE_MAX_ATTRIBS);
 
    for (j = 0, i = 0; i < nr; i++) {
       const unsigned format = map[i].format;
@@ -390,7 +390,7 @@ struct draw_vertex_fetch *draw_vf_create( void )
    struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
    unsigned i;
 
-   for (i = 0; i < PIPE_ATTRIB_MAX; i++)
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
       vf->attr[i].vf = vf;
 
    vf->identity[0] = 0.0;
diff --git a/src/gallium/auxiliary/draw/draw_vf.h b/src/gallium/auxiliary/draw/draw_vf.h
index 011c8f0ff1..7555d1bd58 100644
--- a/src/gallium/auxiliary/draw/draw_vf.h
+++ b/src/gallium/auxiliary/draw/draw_vf.h
@@ -169,7 +169,7 @@ struct draw_vf_attr
 
 struct draw_vertex_fetch
 {
-   struct draw_vf_attr attr[PIPE_ATTRIB_MAX];
+   struct draw_vf_attr attr[PIPE_MAX_ATTRIBS];
    unsigned attr_count;
    unsigned vertex_stride;
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 4e2fa72707..487d0ea7f4 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -99,8 +99,8 @@ vs_exec_run( struct draw_vertex_shader *shader,
    struct tgsi_exec_machine *machine = &draw->machine;
    unsigned int j;
 
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
+   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index bd983f2ddf..d29cb18efe 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -107,8 +107,8 @@ vs_llvm_run( struct draw_vertex_shader *base,
    struct tgsi_exec_machine *machine = &draw->machine;
    unsigned int j;
 
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
+   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index a4503c143e..bc910dc2d0 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -114,8 +114,8 @@ vs_sse_run( struct draw_vertex_shader *base,
    struct tgsi_exec_machine *machine = &draw->machine;
    unsigned int j;
 
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
+   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 0442abddc1..7f656a9744 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -103,8 +103,8 @@ struct cell_context
    struct cell_texture *texture[PIPE_MAX_SAMPLERS];
    uint num_textures;
    struct pipe_viewport_state viewport;
-   struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
-   struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
 
    ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS];
    ubyte *zsbuf_map;
@@ -141,7 +141,7 @@ struct cell_context
 
 
    struct spe_function attrib_fetch;
-   unsigned attrib_fetch_offsets[PIPE_ATTRIB_MAX];
+   unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index c839fb4d12..b896252f81 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -123,7 +123,7 @@ cell_draw_elements(struct pipe_context *pipe,
    /*
     * Map vertex buffers
     */
-   for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
       if (sp->vertex_buffer[i].buffer) {
          void *buf = pipe->winsys->buffer_map(pipe->winsys,
                                               sp->vertex_buffer[i].buffer,
@@ -151,7 +151,7 @@ cell_draw_elements(struct pipe_context *pipe,
    /*
     * unmap vertex/index buffers - will cause draw module to flush
     */
-   for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
       if (sp->vertex_buffer[i].buffer) {
          draw_set_mapped_vertex_buffer(draw, i, NULL);
          pipe->winsys->buffer_unmap(pipe->winsys, sp->vertex_buffer[i].buffer);
diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
index 563831b62d..37d25fb357 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_vertex.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
@@ -41,7 +41,7 @@ cell_set_vertex_element(struct pipe_context *pipe,
                             const struct pipe_vertex_element *attrib)
 {
    struct cell_context *cell = cell_context(pipe);
-   assert(index < PIPE_ATTRIB_MAX);
+   assert(index < PIPE_MAX_ATTRIBS);
    cell->vertex_element[index] = *attrib; /* struct copy */
    cell->dirty |= CELL_NEW_VERTEX;
 
@@ -55,7 +55,7 @@ cell_set_vertex_buffer(struct pipe_context *pipe,
                            const struct pipe_vertex_buffer *buffer)
 {
    struct cell_context *cell = cell_context(pipe);
-   assert(index < PIPE_ATTRIB_MAX);
+   assert(index < PIPE_MAX_ATTRIBS);
    cell->vertex_buffer[index] = *buffer; /* struct copy */
    cell->dirty |= CELL_NEW_VERTEX;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 4828a8023b..49d5443cde 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -263,7 +263,7 @@ void cell_update_vertex_fetch(struct draw_context *draw)
    struct cell_context *const cell =
        (struct cell_context *) draw->driver_private;
    struct spe_function *p = &cell->attrib_fetch;
-   unsigned function_index[PIPE_ATTRIB_MAX];
+   unsigned function_index[PIPE_MAX_ATTRIBS];
    unsigned unique_attr_formats;
    int out_ptr;
    int in_ptr;
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index fccff01e10..d7f46f8024 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -55,7 +55,7 @@ struct spu_global spu;
 
 struct spu_vs_context draw;
 
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX]
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
     ALIGN16_ATTRIB;
 
 static unsigned char depth_stencil_code_buffer[4 * 64]
@@ -361,7 +361,7 @@ cmd_state_vs_array_info(const struct cell_array_info *vs_info)
 {
    const unsigned attr = vs_info->attr;
 
-   ASSERT(attr < PIPE_ATTRIB_MAX);
+   ASSERT(attr < PIPE_MAX_ATTRIBS);
    draw.vertex_fetch.src_ptr[attr] = vs_info->base;
    draw.vertex_fetch.pitch[attr] = vs_info->pitch;
    draw.vertex_fetch.size[attr] = vs_info->size;
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index 8363efeeb6..3119a78c06 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -86,8 +86,8 @@ run_vertex_program(struct spu_vs_context *draw,
    struct spu_exec_machine *machine = &draw->machine;
    unsigned int j;
 
-   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_ATTRIB_MAX);
-   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_MAX_ATTRIBS);
+   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_MAX_ATTRIBS);
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
index 54a4b8d9b9..4c74f5e74d 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.h
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
@@ -16,10 +16,10 @@ struct spu_vs_context {
    struct pipe_viewport_state viewport;
 
    struct {
-      uint64_t src_ptr[PIPE_ATTRIB_MAX];
-      unsigned pitch[PIPE_ATTRIB_MAX];
-      unsigned size[PIPE_ATTRIB_MAX];
-      unsigned code_offset[PIPE_ATTRIB_MAX];
+      uint64_t src_ptr[PIPE_MAX_ATTRIBS];
+      unsigned pitch[PIPE_MAX_ATTRIBS];
+      unsigned size[PIPE_MAX_ATTRIBS];
+      unsigned code_offset[PIPE_MAX_ATTRIBS];
       unsigned nr_attrs;
       boolean dirty;
 
diff --git a/src/gallium/drivers/failover/fo_context.h b/src/gallium/drivers/failover/fo_context.h
index 8f3ad3ee79..4afe10c4b8 100644
--- a/src/gallium/drivers/failover/fo_context.h
+++ b/src/gallium/drivers/failover/fo_context.h
@@ -84,8 +84,8 @@ struct failover_context {
    struct pipe_scissor_state scissor;
    struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
    struct pipe_viewport_state viewport;
-   struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
-   struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
 
    void *sw_sampler_state[PIPE_MAX_SAMPLERS];
    void *hw_sampler_state[PIPE_MAX_SAMPLERS];
diff --git a/src/gallium/drivers/failover/fo_state_emit.c b/src/gallium/drivers/failover/fo_state_emit.c
index 3de931e04e..bb89f925e9 100644
--- a/src/gallium/drivers/failover/fo_state_emit.c
+++ b/src/gallium/drivers/failover/fo_state_emit.c
@@ -104,7 +104,7 @@ failover_state_emit( struct failover_context *failover )
    }
 
    if (failover->dirty & FO_NEW_VERTEX_BUFFER) {
-      for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+      for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
 	 if (failover->dirty_vertex_buffer & (1<<i)) {
 	    failover->sw->set_vertex_buffer( failover->sw, i, 
 					     &failover->vertex_buffer[i] );
@@ -113,7 +113,7 @@ failover_state_emit( struct failover_context *failover )
    }
 
    if (failover->dirty & FO_NEW_VERTEX_ELEMENT) {
-      for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+      for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
 	 if (failover->dirty_vertex_element & (1<<i)) {
 	    failover->sw->set_vertex_element( failover->sw, i, 
 					      &failover->vertex_element[i] );
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index 15ff2360b7..fee33d82de 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -65,7 +65,7 @@ i915_draw_elements( struct pipe_context *pipe,
    /*
     * Map vertex buffers
     */
-   for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
       if (i915->vertex_buffer[i].buffer) {
          void *buf
             = pipe->winsys->buffer_map(pipe->winsys,
@@ -96,7 +96,7 @@ i915_draw_elements( struct pipe_context *pipe,
    /*
     * unmap vertex/index buffers
     */
-   for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
       if (i915->vertex_buffer[i].buffer) {
          pipe->winsys->buffer_unmap(pipe->winsys, i915->vertex_buffer[i].buffer);
          draw_set_mapped_vertex_buffer(draw, i, NULL);
diff --git a/src/gallium/drivers/i915simple/i915_context.h b/src/gallium/drivers/i915simple/i915_context.h
index 746f18ba38..8e707ea574 100644
--- a/src/gallium/drivers/i915simple/i915_context.h
+++ b/src/gallium/drivers/i915simple/i915_context.h
@@ -232,7 +232,7 @@ struct i915_context
    struct pipe_scissor_state scissor;
    struct i915_texture *texture[PIPE_MAX_SAMPLERS];
    struct pipe_viewport_state viewport;
-   struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
 
    unsigned dirty;
 
diff --git a/src/gallium/drivers/i965simple/brw_clip.h b/src/gallium/drivers/i965simple/brw_clip.h
index a89d08b791..d70fc094ff 100644
--- a/src/gallium/drivers/i965simple/brw_clip.h
+++ b/src/gallium/drivers/i965simple/brw_clip.h
@@ -116,7 +116,7 @@ struct brw_clip_compile {
    unsigned last_mrf;
 
    unsigned header_position_offset;
-   unsigned offset[PIPE_ATTRIB_MAX];
+   unsigned offset[PIPE_MAX_ATTRIBS];
 };
 
 #define ATTR_SIZE  (4*4)
diff --git a/src/gallium/drivers/i965simple/brw_context.h b/src/gallium/drivers/i965simple/brw_context.h
index b83a13c3b6..0c96ba1732 100644
--- a/src/gallium/drivers/i965simple/brw_context.h
+++ b/src/gallium/drivers/i965simple/brw_context.h
@@ -433,17 +433,17 @@ struct brw_cached_batch_item {
 
 
-/* Protect against a future where PIPE_ATTRIB_MAX > 32.  Wouldn't life
+/* Protect against a future where PIPE_MAX_ATTRIBS > 32.  Wouldn't life
  * be easier if C allowed arrays of packed elements?
  */
-#define ATTRIB_BIT_DWORDS  ((PIPE_ATTRIB_MAX+31)/32)
+#define ATTRIB_BIT_DWORDS  ((PIPE_MAX_ATTRIBS+31)/32)
 
 
 struct brw_vertex_info {
-   unsigned varying;  /* varying:1[PIPE_ATTRIB_MAX] */
-   unsigned sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[PIPE_ATTRIB_MAX] */
+   unsigned varying;  /* varying:1[PIPE_MAX_ATTRIBS] */
+   unsigned sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[PIPE_MAX_ATTRIBS] */
 };
 
 
@@ -496,9 +496,9 @@ struct brw_context
       /* Arrays with buffer objects to copy non-bufferobj arrays into
        * for upload:
        */
-      const struct pipe_vertex_buffer *vbo_array[PIPE_ATTRIB_MAX];
+      const struct pipe_vertex_buffer *vbo_array[PIPE_MAX_ATTRIBS];
 
-      struct brw_vertex_element_state inputs[PIPE_ATTRIB_MAX];
+      struct brw_vertex_element_state inputs[PIPE_MAX_ATTRIBS];
 
 #define BRW_NR_UPLOAD_BUFS 17
 #define BRW_UPLOAD_INIT_SIZE (128*1024)
diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c
index f5efe9fc06..0d04a8a594 100644
--- a/src/gallium/drivers/i965simple/brw_state.c
+++ b/src/gallium/drivers/i965simple/brw_state.c
@@ -292,7 +292,7 @@ static void brw_set_vertex_element(struct pipe_context *pipe,
    /* flush ? */
    struct brw_context *brw = brw_context(pipe);
 
-   assert(index < PIPE_ATTRIB_MAX);
+   assert(index < PIPE_MAX_ATTRIBS);
    struct brw_vertex_element_state el;
    memset(&el, 0, sizeof(el));
 
diff --git a/src/gallium/drivers/i965simple/brw_wm.h b/src/gallium/drivers/i965simple/brw_wm.h
index a1ac0f504a..b29c4393f0 100644
--- a/src/gallium/drivers/i965simple/brw_wm.h
+++ b/src/gallium/drivers/i965simple/brw_wm.h
@@ -76,7 +76,7 @@ struct brw_wm_prog_key {
 
 #define PROGRAM_INTERNAL_PARAM
 #define MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS 1024 /* 72 for GL_ARB_f_p */
-#define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + PIPE_ATTRIB_MAX + 3)
+#define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + PIPE_MAX_ATTRIBS + 3)
 #define BRW_WM_MAX_GRF   128		/* hardware limit */
 #define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4)
 #define BRW_WM_MAX_REF   (BRW_WM_MAX_INSN * 12)
@@ -84,7 +84,7 @@ struct brw_wm_prog_key {
 #define BRW_WM_MAX_CONST 256
 #define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS
 
-#define PAYLOAD_DEPTH     (PIPE_ATTRIB_MAX)
+#define PAYLOAD_DEPTH     (PIPE_MAX_ATTRIBS)
 
 #define MAX_IFSN 32
 #define MAX_LOOP_DEPTH 32
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 19e6cfaf02..dc9d0e6d5d 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -77,8 +77,8 @@ struct softpipe_context {
    struct pipe_scissor_state scissor;
    struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
    struct pipe_viewport_state viewport;
-   struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
-   struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
    unsigned dirty;
 
    unsigned num_samplers;
@@ -92,7 +92,7 @@ struct softpipe_context {
    /*
     * Mapped vertex buffers
     */
-   ubyte *mapped_vbuffer[PIPE_ATTRIB_MAX];
+   ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS];
    
    /** Mapped constant buffers */
    void *mapped_constants[PIPE_SHADER_TYPES];
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 5b5a0fe573..ab54050d3f 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -125,7 +125,7 @@ softpipe_draw_elements(struct pipe_context *pipe,
    /*
     * Map vertex buffers
     */
-   for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
       if (sp->vertex_buffer[i].buffer) {
          void *buf
             = pipe->winsys->buffer_map(pipe->winsys,
@@ -153,7 +153,7 @@ softpipe_draw_elements(struct pipe_context *pipe,
    /*
     * unmap vertex/index buffers - will cause draw module to flush
     */
-   for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
       if (sp->vertex_buffer[i].buffer) {
          draw_set_mapped_vertex_buffer(draw, i, NULL);
          pipe->winsys->buffer_unmap(pipe->winsys, sp->vertex_buffer[i].buffer);
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 861285101f..c10ad80e01 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -185,8 +185,8 @@ struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe )
    uint i;
 
    /* allocate storage for program inputs/outputs, aligned to 16 bytes */
-   qss->inputs = MALLOC(PIPE_ATTRIB_MAX * sizeof(*qss->inputs) + 16);
-   qss->outputs = MALLOC(PIPE_ATTRIB_MAX * sizeof(*qss->outputs) + 16);
+   qss->inputs = MALLOC(PIPE_MAX_ATTRIBS * sizeof(*qss->inputs) + 16);
+   qss->outputs = MALLOC(PIPE_MAX_ATTRIBS * sizeof(*qss->outputs) + 16);
    qss->machine.Inputs = align16(qss->inputs);
    qss->machine.Outputs = align16(qss->outputs);
 
diff --git a/src/gallium/drivers/softpipe/sp_state_vertex.c b/src/gallium/drivers/softpipe/sp_state_vertex.c
index f01a10de3b..c054e76d9b 100644
--- a/src/gallium/drivers/softpipe/sp_state_vertex.c
+++ b/src/gallium/drivers/softpipe/sp_state_vertex.c
@@ -42,7 +42,7 @@ softpipe_set_vertex_element(struct pipe_context *pipe,
                             const struct pipe_vertex_element *attrib)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
-   assert(index < PIPE_ATTRIB_MAX);
+   assert(index < PIPE_MAX_ATTRIBS);
    softpipe->vertex_element[index] = *attrib; /* struct copy */
    softpipe->dirty |= SP_NEW_VERTEX;
 
@@ -56,7 +56,7 @@ softpipe_set_vertex_buffer(struct pipe_context *pipe,
                            const struct pipe_vertex_buffer *buffer)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
-   assert(index < PIPE_ATTRIB_MAX);
+   assert(index < PIPE_MAX_ATTRIBS);
    softpipe->vertex_buffer[index] = *buffer; /* struct copy */
    softpipe->dirty |= SP_NEW_VERTEX;
 
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index a2bd8c6aaa..2490412126 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -54,7 +54,8 @@ extern "C" {
 #define PIPE_MAX_SAMPLERS     8
 #define PIPE_MAX_CLIP_PLANES  6
 #define PIPE_MAX_CONSTANT    32
-#define PIPE_ATTRIB_MAX      32
+#define PIPE_MAX_ATTRIBS     32
+#define PIPE_ATTRIB_MAX      32 /* XXX obsolete - remove */
 #define PIPE_MAX_COLOR_BUFS   8
 #define PIPE_MAX_TEXTURE_LEVELS  16
 #define PIPE_MAX_FEEDBACK_ATTRIBS 16
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 20af90df7d..4aca3311b7 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -606,7 +606,7 @@ st_feedback_draw_vbo(GLcontext *ctx,
    /*
     * unmap vertex/index buffers
     */
-   for (i = 0; i < PIPE_ATTRIB_MAX; i++) {
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
       if (draw->vertex_buffer[i].buffer) {
          pipe->winsys->buffer_unmap(pipe->winsys,
                                     draw->vertex_buffer[i].buffer);
-- 
cgit v1.2.3


From 14452aee73e16f2ede075cf894e69d62cc539f5e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 31 Mar 2008 17:38:21 -0600
Subject: cell: initial work to support multi-texture

---
 src/gallium/drivers/cell/common.h              |  8 +++++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 18 +++++++---------
 src/gallium/drivers/cell/spu/spu_main.c        | 27 ++++++++++++++++--------
 src/gallium/drivers/cell/spu/spu_main.h        |  8 +++----
 src/gallium/drivers/cell/spu/spu_texture.c     | 29 ++++++++++++++++----------
 src/gallium/drivers/cell/spu/spu_tri.c         |  2 +-
 6 files changed, 55 insertions(+), 37 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c9e873b35c..298812fc20 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -66,6 +66,8 @@
 
 #define CELL_MAX_SPUS 6
 
+#define CELL_MAX_SAMPLERS 4
+
 #define TILE_SIZE 32
 
 
@@ -228,8 +230,10 @@ struct cell_command_release_verts
 
 struct cell_command_texture
 {
-   void *start;         /**< Address in main memory */
-   uint width, height;
+   struct {
+      void *start;         /**< Address in main memory */
+      ushort width, height;
+   } texture[CELL_MAX_SAMPLERS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 4c75caa025..4fbe1a21b8 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -129,17 +129,15 @@ cell_emit_state(struct cell_context *cell)
 
    if (cell->dirty & CELL_NEW_TEXTURE) {
       struct cell_command_texture texture;
-      if (cell->texture[0]) {
-         texture.start = cell->texture[0]->tiled_data;
-         texture.width = cell->texture[0]->base.width[0];
-         texture.height = cell->texture[0]->base.height[0];
+      uint i;
+      memset(&texture, 0, sizeof(texture));
+      for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
+         if (cell->texture[i]) {
+            texture.texture[i].start = cell->texture[i]->tiled_data;
+            texture.texture[i].width = cell->texture[i]->base.width[0];
+            texture.texture[i].height = cell->texture[i]->base.height[0];
+         }
       }
-      else {
-         texture.start = NULL;
-         texture.width = 0;
-         texture.height = 0;
-      }
-
       emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
                      &texture, sizeof(struct cell_command_texture));
    }
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index d7f46f8024..80fa5f7859 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -329,17 +329,26 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 static void
 cmd_state_texture(const struct cell_command_texture *texture)
 {
-   if (Debug)
-      printf("SPU %u: TEXTURE at %p  size %u x %u\n",
-             spu.init.id, texture->start, texture->width, texture->height);
+   uint i;
+
+   if (1||Debug) {
+      printf("SPU %u: TEXTURE\n", spu.init.id);
+      for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+         printf("  %d: at %p  size %u x %u\n", i, texture->texture[i].start,
+                texture->texture[i].width, texture->texture[i].height);
+      }
+   }
 
    memcpy(&spu.texture, texture, sizeof(*texture));
-   spu.tex_size = (vector float)
-      { spu.texture.width, spu.texture.height, 0.0, 0.0};
-   spu.tex_size_mask = (vector unsigned int)
-      { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
-   spu.tex_size_x_mask = spu_splats(spu.texture.width - 1);
-   spu.tex_size_y_mask = spu_splats(spu.texture.height - 1);
+   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+      const uint width = texture->texture[i].width;
+      const uint height = texture->texture[i].height;
+      spu.tex_size[i] = (vector float) { width, height, 0.0, 0.0};
+      spu.tex_size_mask[i] = (vector unsigned int)
+         { width - 1, height - 1, 0, 0 };
+      spu.tex_size_x_mask[i] = spu_splats(width - 1);
+      spu.tex_size_y_mask[i] = spu_splats(height - 1);
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index c20452931a..8a87787537 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -141,10 +141,10 @@ struct spu_global
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
 
-   vector float tex_size;
-   vector unsigned int tex_size_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+   vector float tex_size[CELL_MAX_SAMPLERS];
+   vector unsigned int tex_size_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
+   vector unsigned int tex_size_x_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
+   vector unsigned int tex_size_y_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
 
    vector float (*sample_texture)(vector float texcoord);
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 67eb08196a..91a6aec5ec 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -40,25 +40,29 @@
 void
 invalidate_tex_cache(void)
 {
-   spu_dcache_mark_dirty((unsigned) spu.texture.start,
-                         4 * spu.texture.width * spu.texture.height);
+   uint unit = 0;
+   uint bytes = 4 * spu.texture.texture[unit].width
+      * spu.texture.texture[unit].height;
+
+   spu_dcache_mark_dirty((unsigned) spu.texture.texture[unit].start, bytes);
 }
 
 
 static uint
 get_texel(vec_uint4 coordinate)
 {
+   const uint unit = 0;
    vec_uint4 tmp;
    unsigned x = spu_extract(coordinate, 0);
    unsigned y = spu_extract(coordinate, 1);
-   const unsigned tiles_per_row = spu.texture.width / TILE_SIZE;
+   const unsigned tiles_per_row = spu.texture.texture[unit].width / TILE_SIZE;
    unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) 
                                             + (x / TILE_SIZE));
    unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE)
                                 + (x % TILE_SIZE));
 
    spu_dcache_fetch_unaligned((qword *) & tmp,
-                              spu.texture.start + tile_offset + texel_offset,
+                              spu.texture.texture[unit].start + tile_offset + texel_offset,
                               4);
    return spu_extract(tmp, 0);
 }
@@ -67,13 +71,14 @@ get_texel(vec_uint4 coordinate)
 static void
 get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
-   const unsigned texture_ea = (uintptr_t) spu.texture.start;
+   const uint unit = 0;
+   const unsigned texture_ea = (uintptr_t) spu.texture.texture[unit].start;
    vec_uint4 tile_x = spu_rlmask(x, -5);
    vec_uint4 tile_y = spu_rlmask(y, -5);
    const qword offset_x = si_andi((qword) x, 0x1f);
    const qword offset_y = si_andi((qword) y, 0x1f);
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture.width / TILE_SIZE);
+   const qword tiles_per_row = (qword) spu_splats(spu.texture.texture[unit].width / TILE_SIZE);
    const qword tile_size = (qword) spu_splats(sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -101,9 +106,10 @@ get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 vector float
 sample_texture_nearest(vector float texcoord)
 {
-   vector float tc = spu_mul(texcoord, spu.tex_size);
+   const uint unit = 0;
+   vector float tc = spu_mul(texcoord, spu.tex_size[unit]);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc = spu_and(itc, spu.tex_size_mask[unit]);        /* mask (GL_REPEAT) */
    uint texel = get_texel(itc);
    return spu_unpack_A8R8G8B8(texel);
 }
@@ -112,10 +118,11 @@ sample_texture_nearest(vector float texcoord)
 vector float
 sample_texture_bilinear(vector float texcoord)
 {
+   const uint unit = 0;
    static const vec_uint4 offset_x = {0, 0, 1, 1};
    static const vec_uint4 offset_y = {0, 1, 0, 1};
 
-   vector float tc = spu_mul(texcoord, spu.tex_size);
+   vector float tc = spu_mul(texcoord, spu.tex_size[unit]);
    tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
 
    /* integer texcoords S,T: */
@@ -129,8 +136,8 @@ sample_texture_bilinear(vector float texcoord)
    x = spu_add(x, offset_x);
    y = spu_add(y, offset_y);
 
-   x = spu_and(x, spu.tex_size_x_mask);
-   y = spu_and(y, spu.tex_size_y_mask);
+   x = spu_and(x, spu.tex_size_x_mask[unit]);
+   y = spu_and(y, spu.tex_size_y_mask[unit]);
 
    get_four_texels(x, y, texels);
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 95c629a8aa..9f63317b1f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -309,7 +309,7 @@ emit_quad( int x, int y, mask_t mask )
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
-      if (spu.texture.start) {
+      if (spu.texture.texture[0].start) {
          /* texture mapping */
          vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
-- 
cgit v1.2.3


From e6c981f22c0b6469ef44e9d7a34113db34647fef Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 31 Mar 2008 21:09:02 -0600
Subject: cell: more work for multi-texture support

---
 src/gallium/drivers/cell/common.h              | 17 ++++++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 31 ++++++++++-----
 src/gallium/drivers/cell/spu/spu_main.c        | 55 +++++++++++++++-----------
 src/gallium/drivers/cell/spu/spu_main.h        | 18 ++++++---
 src/gallium/drivers/cell/spu/spu_texture.c     | 24 +++++------
 src/gallium/drivers/cell/spu/spu_tri.c         |  2 +-
 6 files changed, 90 insertions(+), 57 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 298812fc20..f430e88b9c 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -36,6 +36,7 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_util.h"
 #include "pipe/p_format.h"
+#include "pipe/p_state.h"
 
 
 /** The standard assert macro doesn't seem to work reliably */
@@ -228,12 +229,20 @@ struct cell_command_release_verts
 };
 
 
+struct cell_command_sampler
+{
+   uint64_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
+   uint unit;
+   struct pipe_sampler_state state;
+};
+
+
 struct cell_command_texture
 {
-   struct {
-      void *start;         /**< Address in main memory */
-      ushort width, height;
-   } texture[CELL_MAX_SAMPLERS];
+   uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint unit;
+   void *start;         /**< Address in main memory */
+   ushort width, height;
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 4fbe1a21b8..9cae67f091 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -121,25 +121,36 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
-      if (cell->sampler[0]) {
-         emit_state_cmd(cell, CELL_CMD_STATE_SAMPLER,
-                        cell->sampler[0], sizeof(struct pipe_sampler_state));
+      uint i;
+      for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+         if (cell->sampler[i]) {
+            struct cell_command_sampler *sampler
+               = cell_batch_alloc(cell, sizeof(*sampler));
+            sampler->opcode = CELL_CMD_STATE_SAMPLER;
+            sampler->unit = i;
+            sampler->state = *cell->sampler[i];
+         }
       }
    }
 
    if (cell->dirty & CELL_NEW_TEXTURE) {
-      struct cell_command_texture texture;
       uint i;
-      memset(&texture, 0, sizeof(texture));
       for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
+         struct cell_command_texture *texture
+            =  cell_batch_alloc(cell, sizeof(*texture));
+         texture->opcode = CELL_CMD_STATE_TEXTURE;
+         texture->unit = i;
          if (cell->texture[i]) {
-            texture.texture[i].start = cell->texture[i]->tiled_data;
-            texture.texture[i].width = cell->texture[i]->base.width[0];
-            texture.texture[i].height = cell->texture[i]->base.height[0];
+            texture->start = cell->texture[i]->tiled_data;
+            texture->width = cell->texture[i]->base.width[0];
+            texture->height = cell->texture[i]->base.height[0];
+         }
+         else {
+            texture->start = NULL;
+            texture->width = 1;
+            texture->height = 1;
          }
       }
-      emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
-                     &texture, sizeof(struct cell_command_texture));
    }
 
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 80fa5f7859..7f0473d198 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -312,13 +312,13 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
 
 
 static void
-cmd_state_sampler(const struct pipe_sampler_state *state)
+cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
    if (Debug)
-      printf("SPU %u: SAMPLER\n",
-             spu.init.id);
+      printf("SPU %u: SAMPLER [%u]\n",
+             spu.init.id, sampler->unit);
 
-   memcpy(&spu.sampler[0], state, sizeof(*state));
+   spu.sampler[sampler->unit] = sampler->state;
    if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR)
       spu.sample_texture = sample_texture_bilinear;
    else
@@ -329,26 +329,25 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 static void
 cmd_state_texture(const struct cell_command_texture *texture)
 {
-   uint i;
+   const uint unit = texture->unit;
+   const uint width = texture->width;
+   const uint height = texture->height;
 
-   if (1||Debug) {
-      printf("SPU %u: TEXTURE\n", spu.init.id);
-      for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-         printf("  %d: at %p  size %u x %u\n", i, texture->texture[i].start,
-                texture->texture[i].width, texture->texture[i].height);
-      }
+   if (Debug) {
+      printf("SPU %u: TEXTURE [%u] at %p  size %u x %u\n", spu.init.id,
+             texture->unit, texture->start,
+             texture->width, texture->height);
    }
 
-   memcpy(&spu.texture, texture, sizeof(*texture));
-   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-      const uint width = texture->texture[i].width;
-      const uint height = texture->texture[i].height;
-      spu.tex_size[i] = (vector float) { width, height, 0.0, 0.0};
-      spu.tex_size_mask[i] = (vector unsigned int)
+   spu.texture[unit].start = texture->start;
+   spu.texture[unit].width = width;
+   spu.texture[unit].height = height;
+
+   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
+   spu.texture[unit].tex_size_mask = (vector unsigned int)
          { width - 1, height - 1, 0, 0 };
-      spu.tex_size_x_mask[i] = spu_splats(width - 1);
-      spu.tex_size_y_mask[i] = spu_splats(height - 1);
-   }
+   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
+   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
 }
 
 
@@ -480,12 +479,20 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8);
          break;
       case CELL_CMD_STATE_SAMPLER:
-         cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8);
+         {
+            struct cell_command_sampler *sampler
+               = (struct cell_command_sampler *) &buffer[pos];
+            cmd_state_sampler(sampler);
+            pos += sizeof(*sampler) / 8;
+         }
          break;
       case CELL_CMD_STATE_TEXTURE:
-         cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8);
+         {
+            struct cell_command_texture *texture
+               = (struct cell_command_texture *) &buffer[pos];
+            cmd_state_texture(texture);
+            pos += sizeof(*texture) / 8;
+         }
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 8a87787537..2bfad3535a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -100,6 +100,17 @@ struct spu_framebuffer {
 } ALIGN16_ATTRIB;
 
 
+struct spu_texture
+{
+   void *start;
+   uint width, height;
+   vector float tex_size;
+   vector unsigned int tex_size_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+} ALIGN16_ATTRIB;
+
+
 /**
  * All SPU global/context state will be in singleton object of this type:
  */
@@ -119,7 +130,7 @@ struct spu_global
    logicop_func logicop;
 
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
-   struct cell_command_texture texture;
+   struct spu_texture texture[PIPE_MAX_SAMPLERS];
 
    struct vertex_info vertex_info;
 
@@ -141,11 +152,6 @@ struct spu_global
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
 
-   vector float tex_size[CELL_MAX_SAMPLERS];
-   vector unsigned int tex_size_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
-
    vector float (*sample_texture)(vector float texcoord);
 
 } ALIGN16_ATTRIB;
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 91a6aec5ec..4612501eb3 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -41,10 +41,10 @@ void
 invalidate_tex_cache(void)
 {
    uint unit = 0;
-   uint bytes = 4 * spu.texture.texture[unit].width
-      * spu.texture.texture[unit].height;
+   uint bytes = 4 * spu.texture[unit].width
+      * spu.texture[unit].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture.texture[unit].start, bytes);
+   spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
 }
 
 
@@ -55,14 +55,14 @@ get_texel(vec_uint4 coordinate)
    vec_uint4 tmp;
    unsigned x = spu_extract(coordinate, 0);
    unsigned y = spu_extract(coordinate, 1);
-   const unsigned tiles_per_row = spu.texture.texture[unit].width / TILE_SIZE;
+   const unsigned tiles_per_row = spu.texture[unit].width / TILE_SIZE;
    unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) 
                                             + (x / TILE_SIZE));
    unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE)
                                 + (x % TILE_SIZE));
 
    spu_dcache_fetch_unaligned((qword *) & tmp,
-                              spu.texture.texture[unit].start + tile_offset + texel_offset,
+                              spu.texture[unit].start + tile_offset + texel_offset,
                               4);
    return spu_extract(tmp, 0);
 }
@@ -72,13 +72,13 @@ static void
 get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
    const uint unit = 0;
-   const unsigned texture_ea = (uintptr_t) spu.texture.texture[unit].start;
+   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
    vec_uint4 tile_x = spu_rlmask(x, -5);
    vec_uint4 tile_y = spu_rlmask(y, -5);
    const qword offset_x = si_andi((qword) x, 0x1f);
    const qword offset_y = si_andi((qword) y, 0x1f);
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture.texture[unit].width / TILE_SIZE);
+   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].width / TILE_SIZE);
    const qword tile_size = (qword) spu_splats(sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -107,9 +107,9 @@ vector float
 sample_texture_nearest(vector float texcoord)
 {
    const uint unit = 0;
-   vector float tc = spu_mul(texcoord, spu.tex_size[unit]);
+   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.tex_size_mask[unit]);        /* mask (GL_REPEAT) */
+   itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
    uint texel = get_texel(itc);
    return spu_unpack_A8R8G8B8(texel);
 }
@@ -122,7 +122,7 @@ sample_texture_bilinear(vector float texcoord)
    static const vec_uint4 offset_x = {0, 0, 1, 1};
    static const vec_uint4 offset_y = {0, 1, 0, 1};
 
-   vector float tc = spu_mul(texcoord, spu.tex_size[unit]);
+   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
    tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
 
    /* integer texcoords S,T: */
@@ -136,8 +136,8 @@ sample_texture_bilinear(vector float texcoord)
    x = spu_add(x, offset_x);
    y = spu_add(y, offset_y);
 
-   x = spu_and(x, spu.tex_size_x_mask[unit]);
-   y = spu_and(y, spu.tex_size_y_mask[unit]);
+   x = spu_and(x, spu.texture[unit].tex_size_x_mask);
+   y = spu_and(y, spu.texture[unit].tex_size_y_mask);
 
    get_four_texels(x, y, texels);
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 9f63317b1f..17e337bbdf 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -309,7 +309,7 @@ emit_quad( int x, int y, mask_t mask )
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
-      if (spu.texture.texture[0].start) {
+      if (spu.texture[0].start) {
          /* texture mapping */
          vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
-- 
cgit v1.2.3


From f8c09464f801e97b24ccdb1ba70444c60d4235bd Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 1 Apr 2008 11:05:32 -0600
Subject: cell: enable #define CACHE_STATS to print a cache report upon exit

---
 src/gallium/drivers/cell/spu/spu_dcache.c | 18 ++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_dcache.h |  3 +++
 src/gallium/drivers/cell/spu/spu_main.c   |  2 ++
 3 files changed, 23 insertions(+)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index a1701d80d1..167404cdc5 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -36,6 +36,7 @@
 #define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
 #define CACHE_LOG2NNWAY       2
 #define CACHE_LOG2NSETS       6
+/*#define CACHE_STATS           1*/
 #include <cache-api.h>
 
 /* Yes folks, this is ugly.
@@ -123,3 +124,20 @@ spu_dcache_mark_dirty(unsigned ea, unsigned size)
           ? (entry & ~CACHELINE_VALID) : entry;
    }
 }
+
+
+/**
+ * Print cache utilization report
+ */
+void
+spu_dcache_report(void)
+{
+#ifdef CACHE_STATS
+   if (spu.init.id == 0) {
+      printf("SPU 0: Texture cache report:\n");
+      cache_pr_stats(data);
+   }
+#endif
+}
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h
index 7a06b8c25a..39a19eb31b 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.h
+++ b/src/gallium/drivers/cell/spu/spu_dcache.h
@@ -31,4 +31,7 @@ spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size);
 extern void
 spu_dcache_mark_dirty(unsigned ea, unsigned size);
 
+extern void
+spu_dcache_report(void);
+
 #endif /* SPU_DCACHE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 7f0473d198..5b5a570a3c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -629,6 +629,8 @@ main_loop(void)
 
    if (Debug)
       printf("SPU %u: Exit main loop\n", spu.init.id);
+
+   spu_dcache_report();
 }
 
 
-- 
cgit v1.2.3


From e7b23d36df1ab3ac5b54ef8e4e56c4fd46db8257 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 1 Apr 2008 11:35:53 -0600
Subject: cell: checkpoint: more multi-texture work

---
 src/gallium/drivers/cell/ppu/cell_texture.c |  8 +++++--
 src/gallium/drivers/cell/spu/spu_main.c     |  2 +-
 src/gallium/drivers/cell/spu/spu_main.h     |  2 +-
 src/gallium/drivers/cell/spu/spu_texture.c  |  6 ++---
 src/gallium/drivers/cell/spu/spu_texture.h  |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.c      | 34 +++++++++++++++++++++++++----
 6 files changed, 42 insertions(+), 14 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9c694e136d..c59d1f7f23 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -245,9 +245,13 @@ void
 cell_update_texture_mapping(struct cell_context *cell)
 {
    uint face = 0, level = 0, zslice = 0;
+   uint i;
+
+   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+      if (cell->texture[i])
+         cell_tile_texture(cell, cell->texture[i]);
+   }
 
-   if (cell->texture[0])
-      cell_tile_texture(cell, cell->texture[0]);
 #if 0
    if (cell->tex_surf && cell->tex_map) {
       pipe_surface_unmap(cell->tex_surf);
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 5b5a570a3c..a840d01596 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -333,7 +333,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint width = texture->width;
    const uint height = texture->height;
 
-   if (Debug) {
+   if (1||Debug) {
       printf("SPU %u: TEXTURE [%u] at %p  size %u x %u\n", spu.init.id,
              texture->unit, texture->start,
              texture->width, texture->height);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2bfad3535a..26e050cfc3 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -152,7 +152,7 @@ struct spu_global
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
 
-   vector float (*sample_texture)(vector float texcoord);
+   vector float (*sample_texture)(uint unit, vector float texcoord);
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 4612501eb3..58a426cc40 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -104,9 +104,8 @@ get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
  * XXX this is extremely primitive for now.
  */
 vector float
-sample_texture_nearest(vector float texcoord)
+sample_texture_nearest(uint unit, vector float texcoord)
 {
-   const uint unit = 0;
    vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
    itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
@@ -116,9 +115,8 @@ sample_texture_nearest(vector float texcoord)
 
 
 vector float
-sample_texture_bilinear(vector float texcoord)
+sample_texture_bilinear(uint unit, vector float texcoord)
 {
-   const uint unit = 0;
    static const vec_uint4 offset_x = {0, 0, 1, 1};
    static const vec_uint4 offset_y = {0, 1, 0, 1};
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 95eb87080f..f7c9738be8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -37,11 +37,11 @@ invalidate_tex_cache(void);
 
 
 extern vector float
-sample_texture_nearest(vector float texcoord);
+sample_texture_nearest(uint unit, vector float texcoord);
 
 
 extern vector float
-sample_texture_bilinear(vector float texcoord);
+sample_texture_bilinear(uint unit, vector float texcoord);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 17e337bbdf..51abcb1757 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -311,17 +311,43 @@ emit_quad( int x, int y, mask_t mask )
 
       if (spu.texture[0].start) {
          /* texture mapping */
+         const uint unit = 0;
          vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture(texcoords[0]);
+            colors[0] = spu.sample_texture(unit, texcoords[0]);
          if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture(texcoords[1]);
+            colors[1] = spu.sample_texture(unit, texcoords[1]);
          if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture(texcoords[2]);
+            colors[2] = spu.sample_texture(unit, texcoords[2]);
          if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture(texcoords[3]);
+            colors[3] = spu.sample_texture(unit, texcoords[3]);
+
+
+         if (spu.texture[1].start) {
+            /* multi-texture mapping */
+            const uint unit = 1;
+            vector float colors1[4];
+
+            eval_coeff(3, (float) x, (float) y, texcoords);
+
+            if (spu_extract(mask, 0))
+               colors1[0] = spu.sample_texture(unit, texcoords[0]);
+            if (spu_extract(mask, 1))
+               colors1[1] = spu.sample_texture(unit, texcoords[1]);
+            if (spu_extract(mask, 2))
+               colors1[2] = spu.sample_texture(unit, texcoords[2]);
+            if (spu_extract(mask, 3))
+               colors1[3] = spu.sample_texture(unit, texcoords[3]);
+
+            /* hack: modulate first texture by second */
+            colors[0] = spu_mul(colors[0], colors1[0]);
+            colors[1] = spu_mul(colors[1], colors1[1]);
+            colors[2] = spu_mul(colors[2], colors1[2]);
+            colors[3] = spu_mul(colors[3], colors1[3]);
+         }
+
       }
       else {
          /* simple shading */
-- 
cgit v1.2.3


From 9e7b730eb25f08065d718dee4a67c67d1d133b6e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 1 Apr 2008 14:52:25 -0600
Subject: cell: pass tex unit to get_texel()

---
 src/gallium/drivers/cell/spu/spu_texture.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 58a426cc40..ba0d57b959 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -49,9 +49,8 @@ invalidate_tex_cache(void)
 
 
 static uint
-get_texel(vec_uint4 coordinate)
+get_texel(uint unit, vec_uint4 coordinate)
 {
-   const uint unit = 0;
    vec_uint4 tmp;
    unsigned x = spu_extract(coordinate, 0);
    unsigned y = spu_extract(coordinate, 1);
@@ -109,7 +108,7 @@ sample_texture_nearest(uint unit, vector float texcoord)
    vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
    itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
-   uint texel = get_texel(itc);
+   uint texel = get_texel(unit, itc);
    return spu_unpack_A8R8G8B8(texel);
 }
 
-- 
cgit v1.2.3


From 9d1444092fbfd9f975cfb996695f0533a78410f7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 1 Apr 2008 14:55:31 -0600
Subject: cell: turn off some debug output

---
 src/gallium/drivers/cell/spu/spu_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index a840d01596..5b5a570a3c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -333,7 +333,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint width = texture->width;
    const uint height = texture->height;
 
-   if (1||Debug) {
+   if (Debug) {
       printf("SPU %u: TEXTURE [%u] at %p  size %u x %u\n", spu.init.id,
              texture->unit, texture->start,
              texture->width, texture->height);
-- 
cgit v1.2.3


From bccd3f138ccc717fad9073110d15e3321b590476 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 1 Apr 2008 15:42:42 -0600
Subject: cell: more multi-texture fixes (mostly working now)

---
 src/gallium/drivers/cell/spu/spu_main.c    |  6 +++---
 src/gallium/drivers/cell/spu/spu_main.h    |  4 +++-
 src/gallium/drivers/cell/spu/spu_texture.c |  5 ++---
 src/gallium/drivers/cell/spu/spu_tri.c     | 18 +++++++++---------
 4 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 5b5a570a3c..1ab1c40379 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -319,10 +319,10 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
              spu.init.id, sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR)
-      spu.sample_texture = sample_texture_bilinear;
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
    else
-      spu.sample_texture = sample_texture_nearest;
+      spu.sample_texture[sampler->unit] = sample_texture_nearest;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 26e050cfc3..e9e39cbeab 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -85,6 +85,8 @@ typedef struct spu_blend_results (*logicop_func)(
     qword frag_mask);
 
 
+typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -152,7 +154,7 @@ struct spu_global
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
 
-   vector float (*sample_texture)(uint unit, vector float texcoord);
+   sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index ba0d57b959..ceab246980 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -68,9 +68,8 @@ get_texel(uint unit, vec_uint4 coordinate)
 
 
 static void
-get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
-   const uint unit = 0;
    const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
    vec_uint4 tile_x = spu_rlmask(x, -5);
    vec_uint4 tile_y = spu_rlmask(y, -5);
@@ -136,7 +135,7 @@ sample_texture_bilinear(uint unit, vector float texcoord)
    x = spu_and(x, spu.texture[unit].tex_size_x_mask);
    y = spu_and(y, spu.texture[unit].tex_size_y_mask);
 
-   get_four_texels(x, y, texels);
+   get_four_texels(unit, x, y, texels);
 
    vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
    vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 51abcb1757..ab4ff8160a 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -316,13 +316,13 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture(unit, texcoords[0]);
+            colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
          if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture(unit, texcoords[1]);
+            colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
          if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture(unit, texcoords[2]);
+            colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
          if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture(unit, texcoords[3]);
+            colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
 
 
          if (spu.texture[1].start) {
@@ -330,16 +330,16 @@ emit_quad( int x, int y, mask_t mask )
             const uint unit = 1;
             vector float colors1[4];
 
-            eval_coeff(3, (float) x, (float) y, texcoords);
+            eval_coeff(2, (float) x, (float) y, texcoords);
 
             if (spu_extract(mask, 0))
-               colors1[0] = spu.sample_texture(unit, texcoords[0]);
+               colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
             if (spu_extract(mask, 1))
-               colors1[1] = spu.sample_texture(unit, texcoords[1]);
+               colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
             if (spu_extract(mask, 2))
-               colors1[2] = spu.sample_texture(unit, texcoords[2]);
+               colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
             if (spu_extract(mask, 3))
-               colors1[3] = spu.sample_texture(unit, texcoords[3]);
+               colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
 
             /* hack: modulate first texture by second */
             colors[0] = spu_mul(colors[0], colors1[0]);
-- 
cgit v1.2.3


From 217d37940771dd02ff1aa365105eca2c7a09d623 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 2 Apr 2008 14:01:42 -0600
Subject: cell: minor texture improvements

Precompute tiles_per_row.  Use ushort multiplies in a few places.  New comments.
---
 src/gallium/drivers/cell/spu/spu_main.c    |  2 ++
 src/gallium/drivers/cell/spu/spu_main.h    |  3 ++-
 src/gallium/drivers/cell/spu/spu_texture.c | 32 ++++++++++++++++++++----------
 3 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 1ab1c40379..e04ffeb9b1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -343,6 +343,8 @@ cmd_state_texture(const struct cell_command_texture *texture)
    spu.texture[unit].width = width;
    spu.texture[unit].height = height;
 
+   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+
    spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
    spu.texture[unit].tex_size_mask = (vector unsigned int)
          { width - 1, height - 1, 0, 0 };
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e9e39cbeab..e962e1426c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -105,7 +105,8 @@ struct spu_framebuffer {
 struct spu_texture
 {
    void *start;
-   uint width, height;
+   ushort width, height;
+   ushort tiles_per_row;
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
    vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index ceab246980..e9a2754e57 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -51,22 +51,25 @@ invalidate_tex_cache(void)
 static uint
 get_texel(uint unit, vec_uint4 coordinate)
 {
+   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
+   ushort x = spu_extract(coordinate, 0);
+   ushort y = spu_extract(coordinate, 1);
+   unsigned tile_offset = sizeof(tile_t)
+      * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
+   ushort texel_offset = (ushort) 4
+      * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
    vec_uint4 tmp;
-   unsigned x = spu_extract(coordinate, 0);
-   unsigned y = spu_extract(coordinate, 1);
-   const unsigned tiles_per_row = spu.texture[unit].width / TILE_SIZE;
-   unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) 
-                                            + (x / TILE_SIZE));
-   unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE)
-                                + (x % TILE_SIZE));
 
    spu_dcache_fetch_unaligned((qword *) & tmp,
-                              spu.texture[unit].start + tile_offset + texel_offset,
+                              texture_ea + tile_offset + texel_offset,
                               4);
    return spu_extract(tmp, 0);
 }
 
 
+/**
+ * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
+ */
 static void
 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
@@ -76,7 +79,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
    const qword offset_x = si_andi((qword) x, 0x1f);
    const qword offset_y = si_andi((qword) y, 0x1f);
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].width / TILE_SIZE);
+   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
    const qword tile_size = (qword) spu_splats(sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -97,6 +100,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
                               texture_ea + spu_extract(offset, 3), 4);
 }
 
+
 /**
  * Get texture sample at texcoord.
  * XXX this is extremely primitive for now.
@@ -126,17 +130,25 @@ sample_texture_bilinear(uint unit, vector float texcoord)
 
    vec_uint4 texels[4];
    
+   /* setup texcoords for quad:
+    *  +-----+-----+
+    *  |x0,y0|x1,y1|
+    *  +-----+-----+
+    *  |x2,y2|x3,y3|
+    *  +-----+-----+
+    */
    vec_uint4 x = spu_splats(spu_extract(itc, 0));
    vec_uint4 y = spu_splats(spu_extract(itc, 1));
-
    x = spu_add(x, offset_x);
    y = spu_add(y, offset_y);
 
+   /* GL_REPEAT wrap mode: */
    x = spu_and(x, spu.texture[unit].tex_size_x_mask);
    y = spu_and(y, spu.texture[unit].tex_size_y_mask);
 
    get_four_texels(unit, x, y, texels);
 
+   /* integer A8R8G8B8 to float texel conversion */
    vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
    vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
    vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
-- 
cgit v1.2.3


From a7504ad587ee8cbfa9958ad23321a691ce0823d3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 2 Apr 2008 14:30:28 -0600
Subject: cell: added some comments/ideas about better texture sampling

---
 src/gallium/drivers/cell/spu/spu_texture.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index e9a2754e57..5051774f00 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -48,9 +48,16 @@ invalidate_tex_cache(void)
 }
 
 
+/**
+ * XXX look into getting texels for all four pixels in a quad at once.
+ */
 static uint
 get_texel(uint unit, vec_uint4 coordinate)
 {
+   /*
+    * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as
+    * SIMD since X and Y are already in a SIMD register.
+    */
    const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
    ushort x = spu_extract(coordinate, 0);
    ushort y = spu_extract(coordinate, 1);
@@ -69,6 +76,16 @@ get_texel(uint unit, vec_uint4 coordinate)
 
 /**
  * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
+ *
+ * NOTE: in the typical case of bilinear filtering, the four texels
+ * are in a 2x2 group so we could get by with just two dcache fetches
+ * (two side-by-side texels per fetch).  But when bilinear filtering
+ * wraps around a texture edge, we'll probably need code like we have
+ * now.
+ * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time,
+ * it's quite likely that the four pixels in a quad will need some of the
+ * same texels.  So look into doing texture fetches for four pixels at
+ * a time.
  */
 static void
 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
@@ -103,7 +120,6 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 
 /**
  * Get texture sample at texcoord.
- * XXX this is extremely primitive for now.
  */
 vector float
 sample_texture_nearest(uint unit, vector float texcoord)
-- 
cgit v1.2.3


From 347d28fd20645674c3509b9fb8ebf8c31a24c239 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Sat, 31 May 2008 19:51:50 +0200
Subject: cell: Fix build after TGSI declaration interface changes.

---
 src/gallium/drivers/cell/spu/spu_exec.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 48edc62f49..69b0526120 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -830,13 +830,11 @@ exec_declaration(struct spu_exec_machine *mach,
          unsigned first, last, mask;
          interpolation_func interp;
 
-         assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
-
-         first = decl->u.DeclarationRange.First;
-         last = decl->u.DeclarationRange.Last;
+         first = decl->DeclarationRange.First;
+         last = decl->DeclarationRange.Last;
          mask = decl->Declaration.UsageMask;
 
-         switch( decl->Interpolation.Interpolate ) {
+         switch( decl->Declaration.Interpolate ) {
          case TGSI_INTERPOLATE_CONSTANT:
             interp = constant_interpolation;
             break;
-- 
cgit v1.2.3


From dc6068a8bcd66e2cbcf76962c70ba202e0078a49 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Mon, 2 Jun 2008 11:40:44 +0200
Subject: cell: SWZ no longer aliases MOV.

---
 src/gallium/drivers/cell/spu/spu_exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 69b0526120..3a80df427d 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -896,7 +896,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MOV:
-   /* TGSI_OPCODE_SWZ */
+   case TGSI_OPCODE_SWZ:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          STORE( &r[0], 0, chan_index );
-- 
cgit v1.2.3


From c208a2c791fa24c7c5887fc496738cbddbfafc72 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 28 Jul 2008 12:42:13 +0900
Subject: Merge tgsi/exec and tgsi/util directories.

---
 src/gallium/auxiliary/cso_cache/cso_context.c      |    2 +-
 src/gallium/auxiliary/draw/draw_pipe_aaline.c      |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_aapoint.c     |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_pstipple.c    |    4 +-
 src/gallium/auxiliary/draw/draw_private.h          |    4 +-
 src/gallium/auxiliary/draw/draw_vs_aos.c           |    8 +-
 src/gallium/auxiliary/draw/draw_vs_aos_io.c        |    6 +-
 src/gallium/auxiliary/draw/draw_vs_aos_machine.c   |    6 +-
 src/gallium/auxiliary/draw/draw_vs_exec.c          |    4 +-
 src/gallium/auxiliary/draw/draw_vs_llvm.c          |    2 +-
 src/gallium/auxiliary/draw/draw_vs_sse.c           |    4 +-
 src/gallium/auxiliary/gallivm/gallivm.cpp          |    4 +-
 src/gallium/auxiliary/gallivm/gallivm_cpu.cpp      |    4 +-
 src/gallium/auxiliary/gallivm/tgsitollvm.cpp       |   10 +-
 src/gallium/auxiliary/tgsi/Makefile                |   18 +-
 src/gallium/auxiliary/tgsi/SConscript              |   24 +-
 src/gallium/auxiliary/tgsi/exec/Makefile           |    2 -
 src/gallium/auxiliary/tgsi/exec/tgsi_exec.c        | 2522 --------------------
 src/gallium/auxiliary/tgsi/exec/tgsi_exec.h        |  253 --
 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c        | 2275 ------------------
 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h        |   49 -
 src/gallium/auxiliary/tgsi/tgsi_build.c            | 1324 ++++++++++
 src/gallium/auxiliary/tgsi/tgsi_build.h            |  332 +++
 src/gallium/auxiliary/tgsi/tgsi_dump.c             |  582 +++++
 src/gallium/auxiliary/tgsi/tgsi_dump.h             |   63 +
 src/gallium/auxiliary/tgsi/tgsi_dump_c.c           |  845 +++++++
 src/gallium/auxiliary/tgsi/tgsi_dump_c.h           |   49 +
 src/gallium/auxiliary/tgsi/tgsi_exec.c             | 2522 ++++++++++++++++++++
 src/gallium/auxiliary/tgsi/tgsi_exec.h             |  253 ++
 src/gallium/auxiliary/tgsi/tgsi_iterate.c          |   85 +
 src/gallium/auxiliary/tgsi/tgsi_iterate.h          |   76 +
 src/gallium/auxiliary/tgsi/tgsi_parse.c            |  332 +++
 src/gallium/auxiliary/tgsi/tgsi_parse.h            |  151 ++
 src/gallium/auxiliary/tgsi/tgsi_sanity.c           |  341 +++
 src/gallium/auxiliary/tgsi/tgsi_sanity.h           |   49 +
 src/gallium/auxiliary/tgsi/tgsi_scan.c             |  226 ++
 src/gallium/auxiliary/tgsi/tgsi_scan.h             |   74 +
 src/gallium/auxiliary/tgsi/tgsi_sse2.c             | 2275 ++++++++++++++++++
 src/gallium/auxiliary/tgsi/tgsi_sse2.h             |   49 +
 src/gallium/auxiliary/tgsi/tgsi_text.c             | 1221 ++++++++++
 src/gallium/auxiliary/tgsi/tgsi_text.h             |   47 +
 src/gallium/auxiliary/tgsi/tgsi_transform.c        |  199 ++
 src/gallium/auxiliary/tgsi/tgsi_transform.h        |   93 +
 src/gallium/auxiliary/tgsi/tgsi_util.c             |  300 +++
 src/gallium/auxiliary/tgsi/tgsi_util.h             |   96 +
 src/gallium/auxiliary/tgsi/util/tgsi_build.c       | 1324 ----------
 src/gallium/auxiliary/tgsi/util/tgsi_build.h       |  332 ---
 src/gallium/auxiliary/tgsi/util/tgsi_dump.c        |  582 -----
 src/gallium/auxiliary/tgsi/util/tgsi_dump.h        |   63 -
 src/gallium/auxiliary/tgsi/util/tgsi_dump_c.c      |  845 -------
 src/gallium/auxiliary/tgsi/util/tgsi_dump_c.h      |   49 -
 src/gallium/auxiliary/tgsi/util/tgsi_iterate.c     |   85 -
 src/gallium/auxiliary/tgsi/util/tgsi_iterate.h     |   76 -
 src/gallium/auxiliary/tgsi/util/tgsi_parse.c       |  332 ---
 src/gallium/auxiliary/tgsi/util/tgsi_parse.h       |  151 --
 src/gallium/auxiliary/tgsi/util/tgsi_sanity.c      |  341 ---
 src/gallium/auxiliary/tgsi/util/tgsi_sanity.h      |   49 -
 src/gallium/auxiliary/tgsi/util/tgsi_scan.c        |  226 --
 src/gallium/auxiliary/tgsi/util/tgsi_scan.h        |   74 -
 src/gallium/auxiliary/tgsi/util/tgsi_text.c        | 1221 ----------
 src/gallium/auxiliary/tgsi/util/tgsi_text.h        |   47 -
 src/gallium/auxiliary/tgsi/util/tgsi_transform.c   |  199 --
 src/gallium/auxiliary/tgsi/util/tgsi_transform.h   |   93 -
 src/gallium/auxiliary/tgsi/util/tgsi_util.c        |  300 ---
 src/gallium/auxiliary/tgsi/util/tgsi_util.h        |   96 -
 src/gallium/auxiliary/util/u_gen_mipmap.c          |    6 +-
 src/gallium/auxiliary/util/u_simple_shaders.c      |    6 +-
 src/gallium/drivers/cell/ppu/cell_context.h        |    2 +-
 src/gallium/drivers/cell/ppu/cell_state_shader.c   |    2 +-
 src/gallium/drivers/cell/spu/spu_exec.c            |    4 +-
 src/gallium/drivers/cell/spu/spu_exec.h            |    2 +-
 src/gallium/drivers/cell/spu/spu_util.c            |    4 +-
 src/gallium/drivers/i915simple/i915_context.h      |    2 +-
 .../drivers/i915simple/i915_fpc_translate.c        |    4 +-
 src/gallium/drivers/i915simple/i915_state.c        |    2 +-
 src/gallium/drivers/i965simple/brw_context.h       |    2 +-
 src/gallium/drivers/i965simple/brw_sf.c            |    2 +-
 src/gallium/drivers/i965simple/brw_shader_info.c   |    2 +-
 src/gallium/drivers/i965simple/brw_state.c         |    4 +-
 src/gallium/drivers/i965simple/brw_vs_emit.c       |    2 +-
 src/gallium/drivers/i965simple/brw_wm_decl.c       |    2 +-
 src/gallium/drivers/i965simple/brw_wm_glsl.c       |    2 +-
 src/gallium/drivers/softpipe/sp_fs_exec.c          |    4 +-
 src/gallium/drivers/softpipe/sp_fs_llvm.c          |    2 +-
 src/gallium/drivers/softpipe/sp_fs_sse.c           |    4 +-
 src/gallium/drivers/softpipe/sp_headers.h          |    2 +-
 src/gallium/drivers/softpipe/sp_state.h            |    2 +-
 src/gallium/drivers/softpipe/sp_state_fs.c         |    4 +-
 src/gallium/drivers/softpipe/sp_tex_sample.c       |    2 +-
 src/gallium/state_trackers/python/gallium.i        |    4 +-
 src/mesa/state_tracker/st_debug.c                  |    2 +-
 src/mesa/state_tracker/st_mesa_to_tgsi.c           |    6 +-
 src/mesa/state_tracker/st_program.c                |    2 +-
 93 files changed, 11680 insertions(+), 11682 deletions(-)
 delete mode 100644 src/gallium/auxiliary/tgsi/exec/Makefile
 delete mode 100644 src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
 delete mode 100644 src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
 delete mode 100755 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
 delete mode 100755 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_build.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_build.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_dump.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_dump.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_dump_c.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_dump_c.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_exec.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_exec.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_iterate.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_iterate.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_parse.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_parse.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_sanity.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_sanity.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_scan.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_scan.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_sse2.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_sse2.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_text.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_text.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_transform.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_transform.h
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_util.c
 create mode 100644 src/gallium/auxiliary/tgsi/tgsi_util.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_build.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_build.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_dump.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_dump.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_dump_c.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_dump_c.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_iterate.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_iterate.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_parse.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_parse.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_sanity.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_sanity.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_scan.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_scan.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_text.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_text.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_transform.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_transform.h
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_util.c
 delete mode 100644 src/gallium/auxiliary/tgsi/util/tgsi_util.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index af4af8ac1d..86e4d46a20 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -38,7 +38,7 @@
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 
 #include "cso_cache/cso_context.h"
 #include "cso_cache/cso_cache.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 3dd7ee19fd..991304b2c8 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -38,8 +38,8 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "tgsi/util/tgsi_transform.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_transform.h"
+#include "tgsi/tgsi_dump.h"
 
 #include "draw_context.h"
 #include "draw_private.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index 87fd303649..13b4401521 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -44,8 +44,8 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "tgsi/util/tgsi_transform.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_transform.h"
+#include "tgsi/tgsi_dump.h"
 
 #include "draw_context.h"
 #include "draw_vs.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index 1f63f94365..d3bd9baddd 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -40,8 +40,8 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "tgsi/util/tgsi_transform.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_transform.h"
+#include "tgsi/tgsi_dump.h"
 
 #include "draw_context.h"
 #include "draw_pipe.h"
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 7bd1e670b4..626a2e3e30 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -44,8 +44,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
 
-#include "tgsi/exec/tgsi_exec.h"
-#include "tgsi/util/tgsi_scan.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_scan.h"
 
 
 struct pipe_context;
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 1f926b3e85..441877d46f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -31,10 +31,10 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_util.h"
-#include "tgsi/exec/tgsi_exec.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
 
 #include "draw_vs.h"
 #include "draw_vs_aos.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index 8e834501a4..eda677cc62 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -28,9 +28,9 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_util.h"
-#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
 #include "draw_vs.h"
 #include "draw_vs_aos.h"
 #include "draw_vertex.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
index 6a54917ae3..e029b7b4bb 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
@@ -31,9 +31,9 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_util.h"
-#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
 #include "draw_vs.h"
 #include "draw_vs_aos.h"
 #include "draw_vertex.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 4501877efc..e26903d8cc 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -38,8 +38,8 @@
 #include "draw_context.h"
 #include "draw_vs.h"
 
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_scan.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
 
 
 struct exec_vertex_shader {
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index c63bd51a10..fc03473b91 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -38,7 +38,7 @@
 #include "draw_context.h"
 #include "draw_vs.h"
 
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 
 #ifdef MESA_LLVM
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index c3189c707d..61f0c084c3 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -45,8 +45,8 @@
 
 #include "rtasm/rtasm_cpu.h"
 #include "rtasm/rtasm_x86sse.h"
-#include "tgsi/exec/tgsi_sse2.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_sse2.h"
+#include "tgsi/tgsi_parse.h"
 
 #define SSE_MAX_VERTICES 4
 
diff --git a/src/gallium/auxiliary/gallivm/gallivm.cpp b/src/gallium/auxiliary/gallivm/gallivm.cpp
index 77900e342b..29adeea47d 100644
--- a/src/gallium/auxiliary/gallivm/gallivm.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm.cpp
@@ -42,8 +42,8 @@
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "tgsi/exec/tgsi_exec.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index 857c190f7b..cf5b978837 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -43,8 +43,8 @@
 #include "pipe/p_shader_tokens.h"
 #include "pipe/p_util.h"
 
-#include "tgsi/exec/tgsi_exec.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 98014bdaa1..b14e2affd6 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -10,11 +10,11 @@
 
 #include "pipe/p_shader_tokens.h"
 
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/exec/tgsi_exec.h"
-#include "tgsi/util/tgsi_util.h"
-#include "tgsi/util/tgsi_build.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_build.h"
+#include "tgsi/tgsi_dump.h"
 
 
 #include <llvm/Module.h>
diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
index 9c4b967651..bbeff1304d 100644
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -4,15 +4,15 @@ include $(TOP)/configs/current
 LIBNAME = tgsi
 
 C_SOURCES = \
-	exec/tgsi_exec.c \
-	exec/tgsi_sse2.c \
-	util/tgsi_iterate.c \
-	util/tgsi_build.c \
-	util/tgsi_dump.c \
-	util/tgsi_parse.c \
-	util/tgsi_scan.c \
-	util/tgsi_transform.c \
-	util/tgsi_util.c
+	tgsi_build.c \
+	tgsi_dump.c \
+	tgsi_exec.c \
+	tgsi_iterate.c \
+	tgsi_parse.c \
+	tgsi_scan.c \
+	tgsi_sse2.c \
+	tgsi_transform.c \
+	tgsi_util.c
 
 include ../../Makefile.template
 
diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript
index 3bbfa1be54..03982e2194 100644
--- a/src/gallium/auxiliary/tgsi/SConscript
+++ b/src/gallium/auxiliary/tgsi/SConscript
@@ -3,18 +3,18 @@ Import('*')
 tgsi = env.ConvenienceLibrary(
 	target = 'tgsi',
 	source = [
-		'exec/tgsi_exec.c',
-		'exec/tgsi_sse2.c',
-		'util/tgsi_build.c',
-		'util/tgsi_dump.c',
-		'util/tgsi_dump_c.c',
-		'util/tgsi_iterate.c',
-		'util/tgsi_parse.c',
-		'util/tgsi_sanity.c',
-		'util/tgsi_scan.c',
-		'util/tgsi_text.c',
-		'util/tgsi_transform.c',
-		'util/tgsi_util.c',
+		'tgsi_build.c',
+		'tgsi_dump.c',
+		'tgsi_dump_c.c',
+		'tgsi_exec.c',
+		'tgsi_iterate.c',
+		'tgsi_parse.c',
+		'tgsi_sanity.c',
+		'tgsi_scan.c',
+		'tgsi_sse2.c',
+		'tgsi_text.c',
+		'tgsi_transform.c',
+		'tgsi_util.c',
 	])
 
 auxiliaries.insert(0, tgsi)
diff --git a/src/gallium/auxiliary/tgsi/exec/Makefile b/src/gallium/auxiliary/tgsi/exec/Makefile
deleted file mode 100644
index 451911a354..0000000000
--- a/src/gallium/auxiliary/tgsi/exec/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-default:
-	cd .. ; make
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
deleted file mode 100644
index 001a4c4b15..0000000000
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ /dev/null
@@ -1,2522 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * TGSI interpretor/executor.
- *
- * Flow control information:
- *
- * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
- * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
- * care since a condition may be true for some quad components but false
- * for other components.
- *
- * We basically execute all statements (even if they're in the part of
- * an IF/ELSE clause that's "not taken") and use a special mask to
- * control writing to destination registers.  This is the ExecMask.
- * See store_dest().
- *
- * The ExecMask is computed from three other masks (CondMask, LoopMask and
- * ContMask) which are controlled by the flow control instructions (namely:
- * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
- *
- *
- * Authors:
- *   Michal Krol
- *   Brian Paul
- */
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_state.h"
-#include "pipe/p_util.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_util.h"
-#include "tgsi_exec.h"
-
-#define TILE_TOP_LEFT     0
-#define TILE_TOP_RIGHT    1
-#define TILE_BOTTOM_LEFT  2
-#define TILE_BOTTOM_RIGHT 3
-
-/*
- * Shorthand locations of various utility registers (_I = Index, _C = Channel)
- */
-#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
-#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
-#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
-#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
-#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
-#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
-#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
-#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
-#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
-#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
-#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
-#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
-#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
-#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
-#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
-#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
-#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
-#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
-#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
-#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
-#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
-#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
-#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
-#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
-#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
-#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
-#define TEMP_R0            TGSI_EXEC_TEMP_R0
-
-#define FOR_EACH_CHANNEL(CHAN)\
-   for (CHAN = 0; CHAN < 4; CHAN++)
-
-#define IS_CHANNEL_ENABLED(INST, CHAN)\
-   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
-
-#define IS_CHANNEL_ENABLED2(INST, CHAN)\
-   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
-
-#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
-   FOR_EACH_CHANNEL( CHAN )\
-      if (IS_CHANNEL_ENABLED( INST, CHAN ))
-
-#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
-   FOR_EACH_CHANNEL( CHAN )\
-      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
-
-
-/** The execution mask depends on the conditional mask and the loop mask */
-#define UPDATE_EXEC_MASK(MACH) \
-      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
-
-
-#define CHAN_X  0
-#define CHAN_Y  1
-#define CHAN_Z  2
-#define CHAN_W  3
-
-
-
-/**
- * Initialize machine state by expanding tokens to full instructions,
- * allocating temporary storage, setting up constants, etc.
- * After this, we can call tgsi_exec_machine_run() many times.
- */
-void 
-tgsi_exec_machine_bind_shader(
-   struct tgsi_exec_machine *mach,
-   const struct tgsi_token *tokens,
-   uint numSamplers,
-   struct tgsi_sampler *samplers)
-{
-   uint k;
-   struct tgsi_parse_context parse;
-   struct tgsi_exec_labels *labels = &mach->Labels;
-   struct tgsi_full_instruction *instructions;
-   struct tgsi_full_declaration *declarations;
-   uint maxInstructions = 10, numInstructions = 0;
-   uint maxDeclarations = 10, numDeclarations = 0;
-   uint instno = 0;
-
-#if 0
-   tgsi_dump(tokens, 0);
-#endif
-
-   mach->Tokens = tokens;
-   mach->Samplers = samplers;
-
-   k = tgsi_parse_init (&parse, mach->Tokens);
-   if (k != TGSI_PARSE_OK) {
-      debug_printf( "Problem parsing!\n" );
-      return;
-   }
-
-   mach->Processor = parse.FullHeader.Processor.Processor;
-   mach->ImmLimit = 0;
-   labels->count = 0;
-
-   declarations = (struct tgsi_full_declaration *)
-      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
-
-   if (!declarations) {
-      return;
-   }
-
-   instructions = (struct tgsi_full_instruction *)
-      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
-
-   if (!instructions) {
-      FREE( declarations );
-      return;
-   }
-
-   while( !tgsi_parse_end_of_tokens( &parse ) ) {
-      uint pointer = parse.Position;
-      uint i;
-
-      tgsi_parse_token( &parse );
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         /* save expanded declaration */
-         if (numDeclarations == maxDeclarations) {
-            declarations = REALLOC(declarations,
-                                   maxDeclarations
-                                   * sizeof(struct tgsi_full_declaration),
-                                   (maxDeclarations + 10)
-                                   * sizeof(struct tgsi_full_declaration));
-            maxDeclarations += 10;
-         }
-         memcpy(declarations + numDeclarations,
-                &parse.FullToken.FullDeclaration,
-                sizeof(declarations[0]));
-         numDeclarations++;
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         {
-            uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
-            assert( size % 4 == 0 );
-            assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
-
-            for( i = 0; i < size; i++ ) {
-               mach->Imms[mach->ImmLimit + i / 4][i % 4] = 
-		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
-            }
-            mach->ImmLimit += size / 4;
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         assert( labels->count < MAX_LABELS );
-
-         labels->labels[labels->count][0] = instno;
-         labels->labels[labels->count][1] = pointer;
-         labels->count++;
-
-         /* save expanded instruction */
-         if (numInstructions == maxInstructions) {
-            instructions = REALLOC(instructions,
-                                   maxInstructions
-                                   * sizeof(struct tgsi_full_instruction),
-                                   (maxInstructions + 10)
-                                   * sizeof(struct tgsi_full_instruction));
-            maxInstructions += 10;
-         }
-         memcpy(instructions + numInstructions,
-                &parse.FullToken.FullInstruction,
-                sizeof(instructions[0]));
-         numInstructions++;
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-   tgsi_parse_free (&parse);
-
-   if (mach->Declarations) {
-      FREE( mach->Declarations );
-   }
-   mach->Declarations = declarations;
-   mach->NumDeclarations = numDeclarations;
-
-   if (mach->Instructions) {
-      FREE( mach->Instructions );
-   }
-   mach->Instructions = instructions;
-   mach->NumInstructions = numInstructions;
-}
-
-
-void
-tgsi_exec_machine_init(
-   struct tgsi_exec_machine *mach )
-{
-   uint i;
-
-   mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps);
-   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
-
-   /* Setup constants. */
-   for( i = 0; i < 4; i++ ) {
-      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
-      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
-      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
-      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
-      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
-      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
-      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
-      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
-      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
-      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
-   }
-}
-
-
-void
-tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach)
-{
-   if (mach->Instructions) {
-      FREE(mach->Instructions);
-      mach->Instructions = NULL;
-      mach->NumInstructions = 0;
-   }
-   if (mach->Declarations) {
-      FREE(mach->Declarations);
-      mach->Declarations = NULL;
-      mach->NumDeclarations = 0;
-   }
-}
-
-
-static void
-micro_abs(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = fabsf( src->f[0] );
-   dst->f[1] = fabsf( src->f[1] );
-   dst->f[2] = fabsf( src->f[2] );
-   dst->f[3] = fabsf( src->f[3] );
-}
-
-static void
-micro_add(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] + src1->f[0];
-   dst->f[1] = src0->f[1] + src1->f[1];
-   dst->f[2] = src0->f[2] + src1->f[2];
-   dst->f[3] = src0->f[3] + src1->f[3];
-}
-
-static void
-micro_iadd(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] + src1->i[0];
-   dst->i[1] = src0->i[1] + src1->i[1];
-   dst->i[2] = src0->i[2] + src1->i[2];
-   dst->i[3] = src0->i[3] + src1->i[3];
-}
-
-static void
-micro_and(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] & src1->u[0];
-   dst->u[1] = src0->u[1] & src1->u[1];
-   dst->u[2] = src0->u[2] & src1->u[2];
-   dst->u[3] = src0->u[3] & src1->u[3];
-}
-
-static void
-micro_ceil(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = ceilf( src->f[0] );
-   dst->f[1] = ceilf( src->f[1] );
-   dst->f[2] = ceilf( src->f[2] );
-   dst->f[3] = ceilf( src->f[3] );
-}
-
-static void
-micro_cos(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = cosf( src->f[0] );
-   dst->f[1] = cosf( src->f[1] );
-   dst->f[2] = cosf( src->f[2] );
-   dst->f[3] = cosf( src->f[3] );
-}
-
-static void
-micro_ddx(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_ddy(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_div(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] / src1->f[0];
-   dst->f[1] = src0->f[1] / src1->f[1];
-   dst->f[2] = src0->f[2] / src1->f[2];
-   dst->f[3] = src0->f[3] / src1->f[3];
-}
-
-static void
-micro_udiv(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] / src1->u[0];
-   dst->u[1] = src0->u[1] / src1->u[1];
-   dst->u[2] = src0->u[2] / src1->u[2];
-   dst->u[3] = src0->u[3] / src1->u[3];
-}
-
-static void
-micro_eq(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_ieq(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
-}
-
-static void
-micro_exp2(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src)
-{
-   dst->f[0] = powf( 2.0f, src->f[0] );
-   dst->f[1] = powf( 2.0f, src->f[1] );
-   dst->f[2] = powf( 2.0f, src->f[2] );
-   dst->f[3] = powf( 2.0f, src->f[3] );
-}
-
-static void
-micro_f2it(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->i[0] = (int) src->f[0];
-   dst->i[1] = (int) src->f[1];
-   dst->i[2] = (int) src->f[2];
-   dst->i[3] = (int) src->f[3];
-}
-
-static void
-micro_f2ut(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->u[0] = (uint) src->f[0];
-   dst->u[1] = (uint) src->f[1];
-   dst->u[2] = (uint) src->f[2];
-   dst->u[3] = (uint) src->f[3];
-}
-
-static void
-micro_flr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = floorf( src->f[0] );
-   dst->f[1] = floorf( src->f[1] );
-   dst->f[2] = floorf( src->f[2] );
-   dst->f[3] = floorf( src->f[3] );
-}
-
-static void
-micro_frc(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = src->f[0] - floorf( src->f[0] );
-   dst->f[1] = src->f[1] - floorf( src->f[1] );
-   dst->f[2] = src->f[2] - floorf( src->f[2] );
-   dst->f[3] = src->f[3] - floorf( src->f[3] );
-}
-
-static void
-micro_ge(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_i2f(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = (float) src->i[0];
-   dst->f[1] = (float) src->i[1];
-   dst->f[2] = (float) src->i[2];
-   dst->f[3] = (float) src->i[3];
-}
-
-static void
-micro_lg2(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = logf( src->f[0] ) * 1.442695f;
-   dst->f[1] = logf( src->f[1] ) * 1.442695f;
-   dst->f[2] = logf( src->f[2] ) * 1.442695f;
-   dst->f[3] = logf( src->f[3] ) * 1.442695f;
-}
-
-static void
-micro_le(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_lt(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_ilt(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
-}
-
-static void
-micro_ult(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
-}
-
-static void
-micro_max(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
-   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
-   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
-   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
-}
-
-static void
-micro_imax(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
-}
-
-static void
-micro_umax(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
-}
-
-static void
-micro_min(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
-   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
-   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
-   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
-}
-
-static void
-micro_imin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
-}
-
-static void
-micro_umin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
-}
-
-static void
-micro_umod(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] % src1->u[0];
-   dst->u[1] = src0->u[1] % src1->u[1];
-   dst->u[2] = src0->u[2] % src1->u[2];
-   dst->u[3] = src0->u[3] % src1->u[3];
-}
-
-static void
-micro_mul(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] * src1->f[0];
-   dst->f[1] = src0->f[1] * src1->f[1];
-   dst->f[2] = src0->f[2] * src1->f[2];
-   dst->f[3] = src0->f[3] * src1->f[3];
-}
-
-static void
-micro_imul(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] * src1->i[0];
-   dst->i[1] = src0->i[1] * src1->i[1];
-   dst->i[2] = src0->i[2] * src1->i[2];
-   dst->i[3] = src0->i[3] * src1->i[3];
-}
-
-static void
-micro_imul64(
-   union tgsi_exec_channel *dst0,
-   union tgsi_exec_channel *dst1,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst1->i[0] = src0->i[0] * src1->i[0];
-   dst1->i[1] = src0->i[1] * src1->i[1];
-   dst1->i[2] = src0->i[2] * src1->i[2];
-   dst1->i[3] = src0->i[3] * src1->i[3];
-   dst0->i[0] = 0;
-   dst0->i[1] = 0;
-   dst0->i[2] = 0;
-   dst0->i[3] = 0;
-}
-
-static void
-micro_umul64(
-   union tgsi_exec_channel *dst0,
-   union tgsi_exec_channel *dst1,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst1->u[0] = src0->u[0] * src1->u[0];
-   dst1->u[1] = src0->u[1] * src1->u[1];
-   dst1->u[2] = src0->u[2] * src1->u[2];
-   dst1->u[3] = src0->u[3] * src1->u[3];
-   dst0->u[0] = 0;
-   dst0->u[1] = 0;
-   dst0->u[2] = 0;
-   dst0->u[3] = 0;
-}
-
-static void
-micro_movc(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2 )
-{
-   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
-   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
-   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
-   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
-}
-
-static void
-micro_neg(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = -src->f[0];
-   dst->f[1] = -src->f[1];
-   dst->f[2] = -src->f[2];
-   dst->f[3] = -src->f[3];
-}
-
-static void
-micro_ineg(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->i[0] = -src->i[0];
-   dst->i[1] = -src->i[1];
-   dst->i[2] = -src->i[2];
-   dst->i[3] = -src->i[3];
-}
-
-static void
-micro_not(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->u[0] = ~src->u[0];
-   dst->u[1] = ~src->u[1];
-   dst->u[2] = ~src->u[2];
-   dst->u[3] = ~src->u[3];
-}
-
-static void
-micro_or(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] | src1->u[0];
-   dst->u[1] = src0->u[1] | src1->u[1];
-   dst->u[2] = src0->u[2] | src1->u[2];
-   dst->u[3] = src0->u[3] | src1->u[3];
-}
-
-static void
-micro_pow(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->f[0] = powf( src0->f[0], src1->f[0] );
-   dst->f[1] = powf( src0->f[1], src1->f[1] );
-   dst->f[2] = powf( src0->f[2], src1->f[2] );
-   dst->f[3] = powf( src0->f[3], src1->f[3] );
-}
-
-static void
-micro_rnd(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = floorf( src->f[0] + 0.5f );
-   dst->f[1] = floorf( src->f[1] + 0.5f );
-   dst->f[2] = floorf( src->f[2] + 0.5f );
-   dst->f[3] = floorf( src->f[3] + 0.5f );
-}
-
-static void
-micro_shl(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] << src1->i[0];
-   dst->i[1] = src0->i[1] << src1->i[1];
-   dst->i[2] = src0->i[2] << src1->i[2];
-   dst->i[3] = src0->i[3] << src1->i[3];
-}
-
-static void
-micro_ishr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] >> src1->i[0];
-   dst->i[1] = src0->i[1] >> src1->i[1];
-   dst->i[2] = src0->i[2] >> src1->i[2];
-   dst->i[3] = src0->i[3] >> src1->i[3];
-}
-
-static void
-micro_trunc(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0 )
-{
-   dst->f[0] = (float) (int) src0->f[0];
-   dst->f[1] = (float) (int) src0->f[1];
-   dst->f[2] = (float) (int) src0->f[2];
-   dst->f[3] = (float) (int) src0->f[3];
-}
-
-static void
-micro_ushr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] >> src1->u[0];
-   dst->u[1] = src0->u[1] >> src1->u[1];
-   dst->u[2] = src0->u[2] >> src1->u[2];
-   dst->u[3] = src0->u[3] >> src1->u[3];
-}
-
-static void
-micro_sin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = sinf( src->f[0] );
-   dst->f[1] = sinf( src->f[1] );
-   dst->f[2] = sinf( src->f[2] );
-   dst->f[3] = sinf( src->f[3] );
-}
-
-static void
-micro_sqrt( union tgsi_exec_channel *dst,
-            const union tgsi_exec_channel *src )
-{
-   dst->f[0] = sqrtf( src->f[0] );
-   dst->f[1] = sqrtf( src->f[1] );
-   dst->f[2] = sqrtf( src->f[2] );
-   dst->f[3] = sqrtf( src->f[3] );
-}
-
-static void
-micro_sub(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] - src1->f[0];
-   dst->f[1] = src0->f[1] - src1->f[1];
-   dst->f[2] = src0->f[2] - src1->f[2];
-   dst->f[3] = src0->f[3] - src1->f[3];
-}
-
-static void
-micro_u2f(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = (float) src->u[0];
-   dst->f[1] = (float) src->u[1];
-   dst->f[2] = (float) src->u[2];
-   dst->f[3] = (float) src->u[3];
-}
-
-static void
-micro_xor(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] ^ src1->u[0];
-   dst->u[1] = src0->u[1] ^ src1->u[1];
-   dst->u[2] = src0->u[2] ^ src1->u[2];
-   dst->u[3] = src0->u[3] ^ src1->u[3];
-}
-
-static void
-fetch_src_file_channel(
-   const struct tgsi_exec_machine *mach,
-   const uint file,
-   const uint swizzle,
-   const union tgsi_exec_channel *index,
-   union tgsi_exec_channel *chan )
-{
-   switch( swizzle ) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
-      switch( file ) {
-      case TGSI_FILE_CONSTANT:
-         chan->f[0] = mach->Consts[index->i[0]][swizzle];
-         chan->f[1] = mach->Consts[index->i[1]][swizzle];
-         chan->f[2] = mach->Consts[index->i[2]][swizzle];
-         chan->f[3] = mach->Consts[index->i[3]][swizzle];
-         break;
-
-      case TGSI_FILE_INPUT:
-         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
-         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
-         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
-         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
-         break;
-
-      case TGSI_FILE_TEMPORARY:
-         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
-         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
-         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
-         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
-         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
-         break;
-
-      case TGSI_FILE_IMMEDIATE:
-         assert( index->i[0] < (int) mach->ImmLimit );
-         chan->f[0] = mach->Imms[index->i[0]][swizzle];
-         assert( index->i[1] < (int) mach->ImmLimit );
-         chan->f[1] = mach->Imms[index->i[1]][swizzle];
-         assert( index->i[2] < (int) mach->ImmLimit );
-         chan->f[2] = mach->Imms[index->i[2]][swizzle];
-         assert( index->i[3] < (int) mach->ImmLimit );
-         chan->f[3] = mach->Imms[index->i[3]][swizzle];
-         break;
-
-      case TGSI_FILE_ADDRESS:
-         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
-         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
-         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
-         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
-         break;
-
-      case TGSI_FILE_OUTPUT:
-         /* vertex/fragment output vars can be read too */
-         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
-         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
-         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
-         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
-         break;
-
-      default:
-         assert( 0 );
-      }
-      break;
-
-   case TGSI_EXTSWIZZLE_ZERO:
-      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
-      break;
-
-   default:
-      assert( 0 );
-   }
-}
-
-static void
-fetch_source(
-   const struct tgsi_exec_machine *mach,
-   union tgsi_exec_channel *chan,
-   const struct tgsi_full_src_register *reg,
-   const uint chan_index )
-{
-   union tgsi_exec_channel index;
-   uint swizzle;
-
-   index.i[0] =
-   index.i[1] =
-   index.i[2] =
-   index.i[3] = reg->SrcRegister.Index;
-
-   if (reg->SrcRegister.Indirect) {
-      union tgsi_exec_channel index2;
-      union tgsi_exec_channel indir_index;
-
-      index2.i[0] =
-      index2.i[1] =
-      index2.i[2] =
-      index2.i[3] = reg->SrcRegisterInd.Index;
-
-      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
-      fetch_src_file_channel(
-         mach,
-         reg->SrcRegisterInd.File,
-         swizzle,
-         &index2,
-         &indir_index );
-
-      index.i[0] += indir_index.i[0];
-      index.i[1] += indir_index.i[1];
-      index.i[2] += indir_index.i[2];
-      index.i[3] += indir_index.i[3];
-   }
-
-   if( reg->SrcRegister.Dimension ) {
-      switch( reg->SrcRegister.File ) {
-      case TGSI_FILE_INPUT:
-         index.i[0] *= 17;
-         index.i[1] *= 17;
-         index.i[2] *= 17;
-         index.i[3] *= 17;
-         break;
-      case TGSI_FILE_CONSTANT:
-         index.i[0] *= 4096;
-         index.i[1] *= 4096;
-         index.i[2] *= 4096;
-         index.i[3] *= 4096;
-         break;
-      default:
-         assert( 0 );
-      }
-
-      index.i[0] += reg->SrcRegisterDim.Index;
-      index.i[1] += reg->SrcRegisterDim.Index;
-      index.i[2] += reg->SrcRegisterDim.Index;
-      index.i[3] += reg->SrcRegisterDim.Index;
-
-      if (reg->SrcRegisterDim.Indirect) {
-         union tgsi_exec_channel index2;
-         union tgsi_exec_channel indir_index;
-
-         index2.i[0] =
-         index2.i[1] =
-         index2.i[2] =
-         index2.i[3] = reg->SrcRegisterDimInd.Index;
-
-         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
-         fetch_src_file_channel(
-            mach,
-            reg->SrcRegisterDimInd.File,
-            swizzle,
-            &index2,
-            &indir_index );
-
-         index.i[0] += indir_index.i[0];
-         index.i[1] += indir_index.i[1];
-         index.i[2] += indir_index.i[2];
-         index.i[3] += indir_index.i[3];
-      }
-   }
-
-   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
-   fetch_src_file_channel(
-      mach,
-      reg->SrcRegister.File,
-      swizzle,
-      &index,
-      chan );
-
-   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      micro_abs( chan, chan );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      micro_abs( chan, chan );
-      micro_neg( chan, chan );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      micro_neg( chan, chan );
-      break;
-
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
-   }
-
-   if (reg->SrcRegisterExtMod.Complement) {
-      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
-   }
-}
-
-static void
-store_dest(
-   struct tgsi_exec_machine *mach,
-   const union tgsi_exec_channel *chan,
-   const struct tgsi_full_dst_register *reg,
-   const struct tgsi_full_instruction *inst,
-   uint chan_index )
-{
-   union tgsi_exec_channel *dst;
-
-   switch( reg->DstRegister.File ) {
-   case TGSI_FILE_NULL:
-      return;
-
-   case TGSI_FILE_OUTPUT:
-      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
-                           + reg->DstRegister.Index].xyzw[chan_index];
-      break;
-
-   case TGSI_FILE_TEMPORARY:
-      assert(reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS);
-      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
-      break;
-
-   case TGSI_FILE_ADDRESS:
-      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
-      break;
-
-   default:
-      assert( 0 );
-      return;
-   }
-
-   switch (inst->Instruction.Saturate)
-   {
-   case TGSI_SAT_NONE:
-      if (mach->ExecMask & 0x1)
-         dst->i[0] = chan->i[0];
-      if (mach->ExecMask & 0x2)
-         dst->i[1] = chan->i[1];
-      if (mach->ExecMask & 0x4)
-         dst->i[2] = chan->i[2];
-      if (mach->ExecMask & 0x8)
-         dst->i[3] = chan->i[3];
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      /* XXX need to obey ExecMask here */
-      micro_max(dst, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      micro_min(dst, dst, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
-      break;
-
-   default:
-      assert( 0 );
-   }
-}
-
-#define FETCH(VAL,INDEX,CHAN)\
-    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
-
-#define STORE(VAL,INDEX,CHAN)\
-    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
-
-
-/**
- * Execute ARB-style KIL which is predicated by a src register.
- * Kill fragment if any of the four values is less than zero.
- */
-static void
-exec_kilp(struct tgsi_exec_machine *mach,
-          const struct tgsi_full_instruction *inst)
-{
-   uint uniquemask;
-   uint chan_index;
-   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
-   union tgsi_exec_channel r[1];
-
-   /* This mask stores component bits that were already tested. Note that
-    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
-
-   for (chan_index = 0; chan_index < 4; chan_index++)
-   {
-      uint swizzle;
-      uint i;
-
-      /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle (
-                        &inst->FullSrcRegisters[0],
-                        chan_index);
-
-      /* check if the component has not been already tested */
-      if (uniquemask & (1 << swizzle))
-         continue;
-      uniquemask |= 1 << swizzle;
-
-      FETCH(&r[0], 0, chan_index);
-      for (i = 0; i < 4; i++)
-         if (r[0].f[i] < 0.0f)
-            kilmask |= 1 << i;
-   }
-
-   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
-}
-
-
-/*
- * Fetch a texel using STR texture coordinates.
- */
-static void
-fetch_texel( struct tgsi_sampler *sampler,
-             const union tgsi_exec_channel *s,
-             const union tgsi_exec_channel *t,
-             const union tgsi_exec_channel *p,
-             float lodbias,  /* XXX should be float[4] */
-             union tgsi_exec_channel *r,
-             union tgsi_exec_channel *g,
-             union tgsi_exec_channel *b,
-             union tgsi_exec_channel *a )
-{
-   uint j;
-   float rgba[NUM_CHANNELS][QUAD_SIZE];
-
-   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
-
-   for (j = 0; j < 4; j++) {
-      r->f[j] = rgba[0][j];
-      g->f[j] = rgba[1][j];
-      b->f[j] = rgba[2][j];
-      a->f[j] = rgba[3][j];
-   }
-}
-
-
-static void
-exec_tex(struct tgsi_exec_machine *mach,
-         const struct tgsi_full_instruction *inst,
-         boolean biasLod,
-         boolean projected)
-{
-   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
-   union tgsi_exec_channel r[8];
-   uint chan_index;
-   float lodBias;
-
-   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
-
-   switch (inst->InstructionExtTexture.Texture) {
-   case TGSI_TEXTURE_1D:
-
-      FETCH(&r[0], 0, CHAN_X);
-
-      if (projected) {
-         FETCH(&r[1], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[1] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[1], 0, CHAN_W);
-         lodBias = r[2].f[0];
-      }
-      else
-         lodBias = 0.0;
-
-      fetch_texel(&mach->Samplers[unit],
-                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
-                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
-      break;
-
-   case TGSI_TEXTURE_2D:
-   case TGSI_TEXTURE_RECT:
-
-      FETCH(&r[0], 0, CHAN_X);
-      FETCH(&r[1], 0, CHAN_Y);
-      FETCH(&r[2], 0, CHAN_Z);
-
-      if (projected) {
-         FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[3], 0, CHAN_W);
-         lodBias = r[3].f[0];
-      }
-      else
-         lodBias = 0.0;
-
-      fetch_texel(&mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
-                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
-      break;
-
-   case TGSI_TEXTURE_3D:
-   case TGSI_TEXTURE_CUBE:
-
-      FETCH(&r[0], 0, CHAN_X);
-      FETCH(&r[1], 0, CHAN_Y);
-      FETCH(&r[2], 0, CHAN_Z);
-
-      if (projected) {
-         FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[3], 0, CHAN_W);
-         lodBias = r[3].f[0];
-      }
-      else
-         lodBias = 0.0;
-
-      fetch_texel(&mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], lodBias,
-                  &r[0], &r[1], &r[2], &r[3]);
-      break;
-
-   default:
-      assert (0);
-   }
-
-   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-      STORE( &r[chan_index], 0, chan_index );
-   }
-}
-
-
-/**
- * Evaluate a constant-valued coefficient at the position of the
- * current quad.
- */
-static void
-eval_constant_coef(
-   struct tgsi_exec_machine *mach,
-   unsigned attrib,
-   unsigned chan )
-{
-   unsigned i;
-
-   for( i = 0; i < QUAD_SIZE; i++ ) {
-      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
-   }
-}
-
-/**
- * Evaluate a linear-valued coefficient at the position of the
- * current quad.
- */
-static void
-eval_linear_coef(
-   struct tgsi_exec_machine *mach,
-   unsigned attrib,
-   unsigned chan )
-{
-   const float x = mach->QuadPos.xyzw[0].f[0];
-   const float y = mach->QuadPos.xyzw[1].f[0];
-   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
-   const float dady = mach->InterpCoefs[attrib].dady[chan];
-   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
-   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
-   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
-   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
-   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
-}
-
-/**
- * Evaluate a perspective-valued coefficient at the position of the
- * current quad.
- */
-static void
-eval_perspective_coef(
-   struct tgsi_exec_machine *mach,
-   unsigned attrib,
-   unsigned chan )
-{
-   const float x = mach->QuadPos.xyzw[0].f[0];
-   const float y = mach->QuadPos.xyzw[1].f[0];
-   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
-   const float dady = mach->InterpCoefs[attrib].dady[chan];
-   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
-   const float *w = mach->QuadPos.xyzw[3].f;
-   /* divide by W here */
-   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
-   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
-   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
-   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
-}
-
-
-typedef void (* eval_coef_func)(
-   struct tgsi_exec_machine *mach,
-   unsigned attrib,
-   unsigned chan );
-
-static void
-exec_declaration(
-   struct tgsi_exec_machine *mach,
-   const struct tgsi_full_declaration *decl )
-{
-   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
-      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
-         unsigned first, last, mask;
-         eval_coef_func eval;
-
-         first = decl->DeclarationRange.First;
-         last = decl->DeclarationRange.Last;
-         mask = decl->Declaration.UsageMask;
-
-         switch( decl->Declaration.Interpolate ) {
-         case TGSI_INTERPOLATE_CONSTANT:
-            eval = eval_constant_coef;
-            break;
-
-         case TGSI_INTERPOLATE_LINEAR:
-            eval = eval_linear_coef;
-            break;
-
-         case TGSI_INTERPOLATE_PERSPECTIVE:
-            eval = eval_perspective_coef;
-            break;
-
-         default:
-            assert( 0 );
-         }
-
-         if( mask == TGSI_WRITEMASK_XYZW ) {
-            unsigned i, j;
-
-            for( i = first; i <= last; i++ ) {
-               for( j = 0; j < NUM_CHANNELS; j++ ) {
-                  eval( mach, i, j );
-               }
-            }
-         }
-         else {
-            unsigned i, j;
-
-            for( j = 0; j < NUM_CHANNELS; j++ ) {
-               if( mask & (1 << j) ) {
-                  for( i = first; i <= last; i++ ) {
-                     eval( mach, i, j );
-                  }
-               }
-            }
-         }
-      }
-   }
-}
-
-static void
-exec_instruction(
-   struct tgsi_exec_machine *mach,
-   const struct tgsi_full_instruction *inst,
-   int *pc )
-{
-   uint chan_index;
-   union tgsi_exec_channel r[8];
-
-   (*pc)++;
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ARL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 FETCH( &r[0], 0, chan_index );
-	 micro_f2it( &r[0], &r[0] );
-	 STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LIT:
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
-	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
-      }
-
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-	 FETCH( &r[0], 0, CHAN_X );
-	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-	    STORE( &r[0], 0, CHAN_Y );
-	 }
-
-	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-	    FETCH( &r[1], 0, CHAN_Y );
-	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-
-	    FETCH( &r[2], 0, CHAN_W );
-	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
-	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
-	    micro_pow( &r[1], &r[1], &r[2] );
-	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-	    STORE( &r[0], 0, CHAN_Z );
-	 }
-      }
-
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
-	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
-      FETCH( &r[0], 0, CHAN_X );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
-      FETCH( &r[0], 0, CHAN_X );
-      micro_sqrt( &r[0], &r[0] );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EXP:
-      FETCH( &r[0], 0, CHAN_X );
-      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
-         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
-	 STORE( &r[2], 0, CHAN_X );        /* store r2 */
-      }
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
-	 STORE( &r[2], 0, CHAN_Y );        /* store r2 */
-      }
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
-	 STORE( &r[2], 0, CHAN_Z );        /* store r2 */
-      }
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
-	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_LOG:
-      FETCH( &r[0], 0, CHAN_X );
-      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
-      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
-      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
-	 STORE( &r[0], 0, CHAN_X );
-      }
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
-         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
-	 STORE( &r[0], 0, CHAN_Y );
-      }
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-	 STORE( &r[1], 0, CHAN_Z );
-      }
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
-	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MUL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
-      {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-
-         micro_mul( &r[0], &r[0], &r[1] );
-
-         STORE(&r[0], 0, chan_index);
-      }
-      break;
-
-   case TGSI_OPCODE_ADD:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_add( &r[0], &r[0], &r[1] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Z );
-      FETCH( &r[2], 1, CHAN_Z );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-    case TGSI_OPCODE_DP4:
-    /* TGSI_OPCODE_DOT4 */
-       FETCH(&r[0], 0, CHAN_X);
-       FETCH(&r[1], 1, CHAN_X);
-
-       micro_mul( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_Y);
-       FETCH(&r[2], 1, CHAN_Y);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_Z);
-       FETCH(&r[2], 1, CHAN_Z);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_W);
-       FETCH(&r[2], 1, CHAN_W);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DST:
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
-	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
-      }
-
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-	 FETCH( &r[0], 0, CHAN_Y );
-	 FETCH( &r[1], 1, CHAN_Y);
-	 micro_mul( &r[0], &r[0], &r[1] );
-	 STORE( &r[0], 0, CHAN_Y );
-      }
-
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-	 FETCH( &r[0], 0, CHAN_Z );
-	 STORE( &r[0], 0, CHAN_Z );
-      }
-
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
-	 FETCH( &r[0], 1, CHAN_W );
-	 STORE( &r[0], 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MIN:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-
-         /* XXX use micro_min()?? */
-         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
-
-         STORE(&r[0], 0, chan_index);
-      }
-      break;
-
-   case TGSI_OPCODE_MAX:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-
-         /* XXX use micro_max()?? */
-         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
-
-         STORE(&r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_mul( &r[0], &r[0], &r[1] );
-         FETCH( &r[1], 2, chan_index );
-         micro_add( &r[0], &r[0], &r[1] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SUB:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-
-         micro_sub( &r[0], &r[0], &r[1] );
-
-         STORE(&r[0], 0, chan_index);
-      }
-      break;
-
-   case TGSI_OPCODE_LERP:
-   /* TGSI_OPCODE_LRP */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         FETCH(&r[2], 2, chan_index);
-
-         micro_sub( &r[1], &r[1], &r[2] );
-         micro_mul( &r[0], &r[0], &r[1] );
-         micro_add( &r[0], &r[0], &r[2] );
-
-         STORE(&r[0], 0, chan_index);
-      }
-      break;
-
-   case TGSI_OPCODE_CND:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_CND0:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_DOT2ADD:
-      /* TGSI_OPCODE_DP2A */
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_INDEX:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_NEGATE:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_FRAC:
-   /* TGSI_OPCODE_FRC */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_frc( &r[0], &r[0] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CLAMP:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_FLOOR:
-   /* TGSI_OPCODE_FLR */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_flr( &r[0], &r[0] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ROUND:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_rnd( &r[0], &r[0] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EXPBASE2:
-    /* TGSI_OPCODE_EX2 */
-      FETCH(&r[0], 0, CHAN_X);
-
-      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LOGBASE2:
-   /* TGSI_OPCODE_LG2 */
-      FETCH( &r[0], 0, CHAN_X );
-      micro_lg2( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_POWER:
-      /* TGSI_OPCODE_POW */
-      FETCH(&r[0], 0, CHAN_X);
-      FETCH(&r[1], 1, CHAN_X);
-
-      micro_pow( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CROSSPRODUCT:
-      /* TGSI_OPCODE_XPD */
-      FETCH(&r[0], 0, CHAN_Y);
-      FETCH(&r[1], 1, CHAN_Z);
-
-      micro_mul( &r[2], &r[0], &r[1] );
-
-      FETCH(&r[3], 0, CHAN_Z);
-      FETCH(&r[4], 1, CHAN_Y);
-
-      micro_mul( &r[5], &r[3], &r[4] );
-      micro_sub( &r[2], &r[2], &r[5] );
-
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
-         STORE( &r[2], 0, CHAN_X );
-      }
-
-      FETCH(&r[2], 1, CHAN_X);
-
-      micro_mul( &r[3], &r[3], &r[2] );
-
-      FETCH(&r[5], 0, CHAN_X);
-
-      micro_mul( &r[1], &r[1], &r[5] );
-      micro_sub( &r[3], &r[3], &r[1] );
-
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-         STORE( &r[3], 0, CHAN_Y );
-      }
-
-      micro_mul( &r[5], &r[5], &r[4] );
-      micro_mul( &r[0], &r[0], &r[2] );
-      micro_sub( &r[5], &r[5], &r[0] );
-
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         STORE( &r[5], 0, CHAN_Z );
-      }
-
-      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
-      }
-      break;
-
-    case TGSI_OPCODE_MULTIPLYMATRIX:
-       assert (0);
-       break;
-
-    case TGSI_OPCODE_ABS:
-       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-          FETCH(&r[0], 0, chan_index);
-
-          micro_abs( &r[0], &r[0] );
-
-          STORE(&r[0], 0, chan_index);
-       }
-       break;
-
-   case TGSI_OPCODE_RCC:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_DPH:
-      FETCH(&r[0], 0, CHAN_X);
-      FETCH(&r[1], 1, CHAN_X);
-
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 0, CHAN_Y);
-      FETCH(&r[2], 1, CHAN_Y);
-
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 0, CHAN_Z);
-      FETCH(&r[2], 1, CHAN_Z);
-
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 1, CHAN_W);
-
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_COS:
-      FETCH(&r[0], 0, CHAN_X);
-
-      micro_cos( &r[0], &r[0] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-	 STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DDX:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ddx( &r[0], &r[0] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DDY:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ddy( &r[0], &r[0] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_KILP:
-      exec_kilp (mach, inst);
-      break;
-
-   case TGSI_OPCODE_KIL:
-      /* for enabled ExecMask bits, set the killed bit */
-      mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask;
-      break;
-
-   case TGSI_OPCODE_PK2H:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_PK2US:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_PK4B:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_PK4UB:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_RFL:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_SEQ:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_eq( &r[0], &r[0], &r[1],
-                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
-                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SFL:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_SGT:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SIN:
-      FETCH( &r[0], 0, CHAN_X );
-      micro_sin( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLE:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SNE:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_STR:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_TEX:
-      /* simple texture lookup */
-      /* src[0] = texcoord */
-      /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE, FALSE);
-      break;
-
-   case TGSI_OPCODE_TXB:
-      /* Texture lookup with lod bias */
-      /* src[0] = texcoord (src[0].w = LOD bias) */
-      /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE, FALSE);
-      break;
-
-   case TGSI_OPCODE_TXD:
-      /* Texture lookup with explict partial derivatives */
-      /* src[0] = texcoord */
-      /* src[1] = d[strq]/dx */
-      /* src[2] = d[strq]/dy */
-      /* src[3] = sampler unit */
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_TXL:
-      /* Texture lookup with explit LOD */
-      /* src[0] = texcoord (src[0].w = LOD) */
-      /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE, FALSE);
-      break;
-
-   case TGSI_OPCODE_TXP:
-      /* Texture lookup with projection */
-      /* src[0] = texcoord (src[0].w = projection) */
-      /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE, TRUE);
-      break;
-
-   case TGSI_OPCODE_UP2H:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_UP2US:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_UP4B:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_UP4UB:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_X2D:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_ARA:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_ARR:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_BRA:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_CAL:
-      /* skip the call if no execution channels are enabled */
-      if (mach->ExecMask) {
-         /* do the call */
-
-         /* push the Cond, Loop, Cont stacks */
-         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
-         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
-         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
-         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
-
-         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
-         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
-
-         /* note that PC was already incremented above */
-         mach->CallStack[mach->CallStackTop++] = *pc;
-         *pc = inst->InstructionExtLabel.Label;
-      }
-      break;
-
-   case TGSI_OPCODE_RET:
-      mach->FuncMask &= ~mach->ExecMask;
-      UPDATE_EXEC_MASK(mach);
-
-      if (mach->ExecMask == 0x0) {
-         /* really return now (otherwise, keep executing */
-
-         if (mach->CallStackTop == 0) {
-            /* returning from main() */
-            *pc = -1;
-            return;
-         }
-         *pc = mach->CallStack[--mach->CallStackTop];
-
-         /* pop the Cond, Loop, Cont stacks */
-         assert(mach->CondStackTop > 0);
-         mach->CondMask = mach->CondStack[--mach->CondStackTop];
-         assert(mach->LoopStackTop > 0);
-         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
-         assert(mach->ContStackTop > 0);
-         mach->ContMask = mach->ContStack[--mach->ContStackTop];
-         assert(mach->FuncStackTop > 0);
-         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
-
-         UPDATE_EXEC_MASK(mach);
-      }
-      break;
-
-   case TGSI_OPCODE_SSG:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_CMP:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         FETCH(&r[2], 2, chan_index);
-
-         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
-
-         STORE(&r[0], 0, chan_index);
-      }
-      break;
-
-   case TGSI_OPCODE_SCS:
-      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( &r[0], 0, CHAN_X );
-      }
-      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-         micro_cos( &r[1], &r[0] );
-         STORE( &r[1], 0, CHAN_X );
-      }
-      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         micro_sin( &r[1], &r[0] );
-         STORE( &r[1], 0, CHAN_Y );
-      }
-      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
-      }
-      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_NRM:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_DIV:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_DP2:
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_IF:
-      /* push CondMask */
-      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
-      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
-      FETCH( &r[0], 0, CHAN_X );
-      /* update CondMask */
-      if( ! r[0].u[0] ) {
-         mach->CondMask &= ~0x1;
-      }
-      if( ! r[0].u[1] ) {
-         mach->CondMask &= ~0x2;
-      }
-      if( ! r[0].u[2] ) {
-         mach->CondMask &= ~0x4;
-      }
-      if( ! r[0].u[3] ) {
-         mach->CondMask &= ~0x8;
-      }
-      UPDATE_EXEC_MASK(mach);
-      /* Todo: If CondMask==0, jump to ELSE */
-      break;
-
-   case TGSI_OPCODE_ELSE:
-      /* invert CondMask wrt previous mask */
-      {
-         uint prevMask;
-         assert(mach->CondStackTop > 0);
-         prevMask = mach->CondStack[mach->CondStackTop - 1];
-         mach->CondMask = ~mach->CondMask & prevMask;
-         UPDATE_EXEC_MASK(mach);
-         /* Todo: If CondMask==0, jump to ENDIF */
-      }
-      break;
-
-   case TGSI_OPCODE_ENDIF:
-      /* pop CondMask */
-      assert(mach->CondStackTop > 0);
-      mach->CondMask = mach->CondStack[--mach->CondStackTop];
-      UPDATE_EXEC_MASK(mach);
-      break;
-
-   case TGSI_OPCODE_END:
-      /* halt execution */
-      *pc = -1;
-      break;
-
-   case TGSI_OPCODE_REP:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_ENDREP:
-       assert (0);
-       break;
-
-   case TGSI_OPCODE_PUSHA:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_POPA:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_CEIL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ceil( &r[0], &r[0] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_I2F:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_i2f( &r[0], &r[0] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_NOT:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_not( &r[0], &r[0] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_TRUNC:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_trunc( &r[0], &r[0] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SHL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_shl( &r[0], &r[0], &r[1] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SHR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_ishr( &r[0], &r[0], &r[1] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_AND:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_and( &r[0], &r[0], &r[1] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_OR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_or( &r[0], &r[0], &r[1] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MOD:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_XOR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_xor( &r[0], &r[0], &r[1] );
-         STORE( &r[0], 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SAD:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_TXF:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_TXQ:
-      assert (0);
-      break;
-
-   case TGSI_OPCODE_EMIT:
-      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
-      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
-      break;
-
-   case TGSI_OPCODE_ENDPRIM:
-      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
-      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
-      break;
-
-   case TGSI_OPCODE_LOOP:
-      /* fall-through (for now) */
-   case TGSI_OPCODE_BGNLOOP2:
-      /* push LoopMask and ContMasks */
-      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
-      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
-      break;
-
-   case TGSI_OPCODE_ENDLOOP:
-      /* fall-through (for now at least) */
-   case TGSI_OPCODE_ENDLOOP2:
-      /* Restore ContMask, but don't pop */
-      assert(mach->ContStackTop > 0);
-      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
-      UPDATE_EXEC_MASK(mach);
-      if (mach->ExecMask) {
-         /* repeat loop: jump to instruction just past BGNLOOP */
-         *pc = inst->InstructionExtLabel.Label + 1;
-      }
-      else {
-         /* exit loop: pop LoopMask */
-         assert(mach->LoopStackTop > 0);
-         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
-         /* pop ContMask */
-         assert(mach->ContStackTop > 0);
-         mach->ContMask = mach->ContStack[--mach->ContStackTop];
-      }
-      UPDATE_EXEC_MASK(mach);
-      break;
-
-   case TGSI_OPCODE_BRK:
-      /* turn off loop channels for each enabled exec channel */
-      mach->LoopMask &= ~mach->ExecMask;
-      /* Todo: if mach->LoopMask == 0, jump to end of loop */
-      UPDATE_EXEC_MASK(mach);
-      break;
-
-   case TGSI_OPCODE_CONT:
-      /* turn off cont channels for each enabled exec channel */
-      mach->ContMask &= ~mach->ExecMask;
-      /* Todo: if mach->LoopMask == 0, jump to end of loop */
-      UPDATE_EXEC_MASK(mach);
-      break;
-
-   case TGSI_OPCODE_BGNSUB:
-      /* no-op */
-      break;
-
-   case TGSI_OPCODE_ENDSUB:
-      /* no-op */
-      break;
-
-   case TGSI_OPCODE_NOISE1:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE2:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE3:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE4:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOP:
-      break;
-
-   default:
-      assert( 0 );
-   }
-}
-
-
-/**
- * Run TGSI interpreter.
- * \return bitmask of "alive" quad components
- */
-uint
-tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
-{
-   uint i;
-   int pc = 0;
-
-   mach->CondMask = 0xf;
-   mach->LoopMask = 0xf;
-   mach->ContMask = 0xf;
-   mach->FuncMask = 0xf;
-   mach->ExecMask = 0xf;
-
-   mach->CondStackTop = 0; /* temporarily subvert this assertion */
-   assert(mach->CondStackTop == 0);
-   assert(mach->LoopStackTop == 0);
-   assert(mach->ContStackTop == 0);
-   assert(mach->CallStackTop == 0);
-
-   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
-   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
-
-   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
-      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
-      mach->Primitives[0] = 0;
-   }
-
-
-   /* execute declarations (interpolants) */
-   for (i = 0; i < mach->NumDeclarations; i++) {
-      exec_declaration( mach, mach->Declarations+i );
-   }
-
-   /* execute instructions, until pc is set to -1 */
-   while (pc != -1) {
-      assert(pc < (int) mach->NumInstructions);
-      exec_instruction( mach, mach->Instructions + pc, &pc );
-   }
-
-#if 0
-   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
-   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
-      /*
-       * Scale back depth component.
-       */
-      for (i = 0; i < 4; i++)
-         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
-   }
-#endif
-
-   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
-}
-
-
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
deleted file mode 100644
index 4f30650b07..0000000000
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
+++ /dev/null
@@ -1,253 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#if !defined TGSI_EXEC_H
-#define TGSI_EXEC_H
-
-#include "pipe/p_compiler.h"
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-#define MAX_LABELS 1024
-
-#define NUM_CHANNELS 4  /* R,G,B,A */
-#define QUAD_SIZE    4  /* 4 pixel/quad */
-
-/**
-  * Registers may be treated as float, signed int or unsigned int.
-  */
-union tgsi_exec_channel
-{
-   float    f[QUAD_SIZE];
-   int      i[QUAD_SIZE];
-   unsigned u[QUAD_SIZE];
-};
-
-/**
-  * A vector[RGBA] of channels[4 pixels]
-  */
-struct tgsi_exec_vector
-{
-   union tgsi_exec_channel xyzw[NUM_CHANNELS];
-};
-
-/**
- * For fragment programs, information for computing fragment input
- * values from plane equation of the triangle/line.
- */
-struct tgsi_interp_coef
-{
-   float a0[NUM_CHANNELS];	/* in an xyzw layout */
-   float dadx[NUM_CHANNELS];
-   float dady[NUM_CHANNELS];
-};
-
-
-struct softpipe_tile_cache;  /**< Opaque to TGSI */
-
-/**
- * Information for sampling textures, which must be implemented
- * by code outside the TGSI executor.
- */
-struct tgsi_sampler
-{
-   const struct pipe_sampler_state *state;
-   struct pipe_texture *texture;
-   /** Get samples for four fragments in a quad */
-   void (*get_samples)(struct tgsi_sampler *sampler,
-                       const float s[QUAD_SIZE],
-                       const float t[QUAD_SIZE],
-                       const float p[QUAD_SIZE],
-                       float lodbias,
-                       float rgba[NUM_CHANNELS][QUAD_SIZE]);
-   void *pipe; /*XXX temporary*/
-   struct softpipe_tile_cache *cache;
-};
-
-/**
- * For branching/calling subroutines.
- */
-struct tgsi_exec_labels
-{
-   unsigned labels[MAX_LABELS][2];
-   unsigned count;
-};
-
-
-#define TGSI_EXEC_NUM_TEMPS       128
-#define TGSI_EXEC_NUM_TEMP_EXTRAS   6
-#define TGSI_EXEC_NUM_IMMEDIATES  256
-
-/*
- * Locations of various utility registers (_I = Index, _C = Channel)
- */
-#define TGSI_EXEC_TEMP_00000000_I   (TGSI_EXEC_NUM_TEMPS + 0)
-#define TGSI_EXEC_TEMP_00000000_C   0
-
-#define TGSI_EXEC_TEMP_7FFFFFFF_I   (TGSI_EXEC_NUM_TEMPS + 0)
-#define TGSI_EXEC_TEMP_7FFFFFFF_C   1
-
-#define TGSI_EXEC_TEMP_80000000_I   (TGSI_EXEC_NUM_TEMPS + 0)
-#define TGSI_EXEC_TEMP_80000000_C   2
-
-#define TGSI_EXEC_TEMP_FFFFFFFF_I   (TGSI_EXEC_NUM_TEMPS + 0)
-#define TGSI_EXEC_TEMP_FFFFFFFF_C   3
-
-#define TGSI_EXEC_TEMP_ONE_I        (TGSI_EXEC_NUM_TEMPS + 1)
-#define TGSI_EXEC_TEMP_ONE_C        0
-
-#define TGSI_EXEC_TEMP_TWO_I        (TGSI_EXEC_NUM_TEMPS + 1)
-#define TGSI_EXEC_TEMP_TWO_C        1
-
-#define TGSI_EXEC_TEMP_128_I        (TGSI_EXEC_NUM_TEMPS + 1)
-#define TGSI_EXEC_TEMP_128_C        2
-
-#define TGSI_EXEC_TEMP_MINUS_128_I  (TGSI_EXEC_NUM_TEMPS + 1)
-#define TGSI_EXEC_TEMP_MINUS_128_C  3
-
-#define TGSI_EXEC_TEMP_KILMASK_I    (TGSI_EXEC_NUM_TEMPS + 2)
-#define TGSI_EXEC_TEMP_KILMASK_C    0
-
-#define TGSI_EXEC_TEMP_OUTPUT_I     (TGSI_EXEC_NUM_TEMPS + 2)
-#define TGSI_EXEC_TEMP_OUTPUT_C     1
-
-#define TGSI_EXEC_TEMP_PRIMITIVE_I  (TGSI_EXEC_NUM_TEMPS + 2)
-#define TGSI_EXEC_TEMP_PRIMITIVE_C  2
-
-#define TGSI_EXEC_TEMP_THREE_I      (TGSI_EXEC_NUM_TEMPS + 2)
-#define TGSI_EXEC_TEMP_THREE_C      3
-
-#define TGSI_EXEC_TEMP_HALF_I       (TGSI_EXEC_NUM_TEMPS + 3)
-#define TGSI_EXEC_TEMP_HALF_C       0
-
-#define TGSI_EXEC_TEMP_R0           (TGSI_EXEC_NUM_TEMPS + 4)
-
-#define TGSI_EXEC_TEMP_ADDR         (TGSI_EXEC_NUM_TEMPS + 5)
-
-
-#define TGSI_EXEC_MAX_COND_NESTING  20
-#define TGSI_EXEC_MAX_LOOP_NESTING  20
-#define TGSI_EXEC_MAX_CALL_NESTING  20
-
-/**
- * Run-time virtual machine state for executing TGSI shader.
- */
-struct tgsi_exec_machine
-{
-   /* Total = program temporaries + internal temporaries
-    *         + 1 padding to align to 16 bytes
-    */
-   struct tgsi_exec_vector       _Temps[TGSI_EXEC_NUM_TEMPS +
-                                        TGSI_EXEC_NUM_TEMP_EXTRAS + 1];
-
-   /*
-    * This will point to _Temps after aligning to 16B boundary.
-    */
-   struct tgsi_exec_vector       *Temps;
-   struct tgsi_exec_vector       *Addrs;
-
-   struct tgsi_sampler           *Samplers;
-
-   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
-   unsigned                      ImmLimit;
-   const float                   (*Consts)[4];
-   struct tgsi_exec_vector       *Inputs;
-   struct tgsi_exec_vector       *Outputs;
-   const struct tgsi_token       *Tokens;
-   unsigned                      Processor;
-
-   /* GEOMETRY processor only. */
-   unsigned                      *Primitives;
-
-   /* FRAGMENT processor only. */
-   const struct tgsi_interp_coef *InterpCoefs;
-   struct tgsi_exec_vector       QuadPos;
-
-   /* Conditional execution masks */
-   uint CondMask;  /**< For IF/ELSE/ENDIF */
-   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
-   uint ContMask;  /**< For loop CONT statements */
-   uint FuncMask;  /**< For function calls */
-   uint ExecMask;  /**< = CondMask & LoopMask */
-
-   /** Condition mask stack (for nested conditionals) */
-   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
-   int CondStackTop;
-
-   /** Loop mask stack (for nested loops) */
-   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
-   int LoopStackTop;
-
-   /** Loop continue mask stack (see comments in tgsi_exec.c) */
-   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
-   int ContStackTop;
-
-   /** Function execution mask stack (for executing subroutine code) */
-   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
-   int FuncStackTop;
-
-   /** Function call stack for saving/restoring the program counter */
-   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
-   int CallStackTop;
-
-   struct tgsi_full_instruction *Instructions;
-   uint NumInstructions;
-
-   struct tgsi_full_declaration *Declarations;
-   uint NumDeclarations;
-
-   struct tgsi_exec_labels Labels;
-};
-
-void
-tgsi_exec_machine_init(
-   struct tgsi_exec_machine *mach );
-
-
-void 
-tgsi_exec_machine_bind_shader(
-   struct tgsi_exec_machine *mach,
-   const struct tgsi_token *tokens,
-   uint numSamplers,
-   struct tgsi_sampler *samplers);
-
-uint
-tgsi_exec_machine_run(
-   struct tgsi_exec_machine *mach );
-
-
-void
-tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach);
-
-
-#if defined __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* TGSI_EXEC_H */
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
deleted file mode 100755
index cdbdf5c882..0000000000
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ /dev/null
@@ -1,2275 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_util.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_util.h"
-#include "tgsi_exec.h"
-#include "tgsi_sse2.h"
-
-#include "rtasm/rtasm_x86sse.h"
-
-#ifdef PIPE_ARCH_X86
-
-/* for 1/sqrt()
- *
- * This costs about 100fps (close to 10%) in gears:
- */
-#define HIGH_PRECISION 1
-
-
-#define FOR_EACH_CHANNEL( CHAN )\
-   for( CHAN = 0; CHAN < 4; CHAN++ )
-
-#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
-   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
-
-#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
-   if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
-
-#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
-   FOR_EACH_CHANNEL( CHAN )\
-      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
-
-#define CHAN_X 0
-#define CHAN_Y 1
-#define CHAN_Z 2
-#define CHAN_W 3
-
-#define TEMP_R0   TGSI_EXEC_TEMP_R0
-
-/**
- * X86 utility functions.
- */
-
-static struct x86_reg
-make_xmm(
-   unsigned xmm )
-{
-   return x86_make_reg(
-      file_XMM,
-      (enum x86_reg_name) xmm );
-}
-
-/**
- * X86 register mapping helpers.
- */
-
-static struct x86_reg
-get_const_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_CX );
-}
-
-static struct x86_reg
-get_input_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_AX );
-}
-
-static struct x86_reg
-get_output_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DX );
-}
-
-static struct x86_reg
-get_temp_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_BX );
-}
-
-static struct x86_reg
-get_coef_base( void )
-{
-   return get_output_base();
-}
-
-static struct x86_reg
-get_immediate_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DI );
-}
-
-
-/**
- * Data access helpers.
- */
-
-
-static struct x86_reg
-get_immediate(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_immediate_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_const(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_const_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_input(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_input_base(),
-      (vec * 4 + chan) * 16 );
-}
-
-static struct x86_reg
-get_output(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_output_base(),
-      (vec * 4 + chan) * 16 );
-}
-
-static struct x86_reg
-get_temp(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_temp_base(),
-      (vec * 4 + chan) * 16 );
-}
-
-static struct x86_reg
-get_coef(
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   return x86_make_disp(
-      get_coef_base(),
-      ((vec * 3 + member) * 4 + chan) * 4 );
-}
-
-
-static void
-emit_ret(
-   struct x86_function  *func )
-{
-   x86_ret( func );
-}
-
-
-/**
- * Data fetch helpers.
- */
-
-/**
- * Copy a shader constant to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src const buffer index
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_const(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_const( vec, chan ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
-}
-
-static void
-emit_immediate(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_immediate( vec, chan ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
-}
-
-
-/**
- * Copy a shader input to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src input attrib
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_inputf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm ),
-      get_input( vec, chan ) );
-}
-
-/**
- * Store an xmm register to a shader output
- * \param xmm  the source xmm register
- * \param vec  the dest output attrib
- * \param chan  src dest channel to store (X, Y, Z or W)
- */
-static void
-emit_output(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_output( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-/**
- * Copy a shader temporary to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src temp register
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_tempf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movaps(
-      func,
-      make_xmm( xmm ),
-      get_temp( vec, chan ) );
-}
-
-/**
- * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
- * \param xmm  the destination xmm register
- * \param vec  the src input/attribute coefficient index
- * \param chan  src channel to fetch (X, Y, Z or W)
- * \param member  0=a0, 1=dadx, 2=dady
- */
-static void
-emit_coef(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_coef( vec, chan, member ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
-}
-
-/**
- * Data store helpers.
- */
-
-static void
-emit_inputs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_input( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_temps(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movaps(
-      func,
-      get_temp( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_addrs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_temps(
-      func,
-      xmm,
-      vec + TGSI_EXEC_NUM_TEMPS,
-      chan );
-}
-
-/**
- * Coefficent fetch helpers.
- */
-
-static void
-emit_coef_a0(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      0 );
-}
-
-static void
-emit_coef_dadx(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      1 );
-}
-
-static void
-emit_coef_dady(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      2 );
-}
-
-/**
- * Function call helpers.
- */
-
-static void
-emit_push_gp(
-   struct x86_function *func )
-{
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-}
-
-static void
-x86_pop_gp(
-   struct x86_function *func )
-{
-   /* Restore GP registers in a reverse order.
-    */
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-}
-
-static void
-emit_func_call_dst(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   void (PIPE_CDECL *code)() )
-{
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
-
-   emit_push_gp(
-      func );
-
-   {
-      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-      x86_lea(
-         func,
-         ecx,
-         get_temp( TEMP_R0, 0 ) );
-
-      x86_push( func, ecx );
-      x86_mov_reg_imm( func, ecx, (unsigned long) code );
-      x86_call( func, ecx );
-      x86_pop(func, ecx ); 
-   }
-
-
-   x86_pop_gp(
-      func );
-
-   sse_movaps(
-      func,
-      make_xmm( xmm_dst ),
-      get_temp( TEMP_R0, 0 ) );
-}
-
-static void
-emit_func_call_dst_src(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src,
-   void (PIPE_CDECL *code)() )
-{
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 1 ),
-      make_xmm( xmm_src ) );
-
-   emit_func_call_dst(
-      func,
-      xmm_dst,
-      code );
-}
-
-/**
- * Low-level instruction translators.
- */
-
-static void
-emit_abs(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_andps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_7FFFFFFF_I,
-         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
-}
-
-static void
-emit_add(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_addps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void PIPE_CDECL
-cos4f(
-   float *store )
-{
-   const unsigned X = 0;
-
-   store[X + 0] = cosf( store[X + 0] );
-   store[X + 1] = cosf( store[X + 1] );
-   store[X + 2] = cosf( store[X + 2] );
-   store[X + 3] = cosf( store[X + 3] );
-}
-
-static void
-emit_cos(
-   struct x86_function *func,
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_dst,
-      cos4f );
-}
-
-static void PIPE_CDECL
-ex24f(
-   float *store )
-{
-   const unsigned X = 0;
-
-   store[X + 0] = powf( 2.0f, store[X + 0] );
-   store[X + 1] = powf( 2.0f, store[X + 1] );
-   store[X + 2] = powf( 2.0f, store[X + 2] );
-   store[X + 3] = powf( 2.0f, store[X + 3] );
-}
-
-static void
-emit_ex2(
-   struct x86_function *func,
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_dst,
-      ex24f );
-}
-
-static void
-emit_f2it(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse2_cvttps2dq(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ) );
-}
-
-static void PIPE_CDECL
-flr4f(
-   float *store )
-{
-   const unsigned X = 0;
-
-   store[X + 0] = floorf( store[X + 0] );
-   store[X + 1] = floorf( store[X + 1] );
-   store[X + 2] = floorf( store[X + 2] );
-   store[X + 3] = floorf( store[X + 3] );
-}
-
-static void
-emit_flr(
-   struct x86_function *func,
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_dst,
-      flr4f );
-}
-
-static void PIPE_CDECL
-frc4f(
-   float *store )
-{
-   const unsigned X = 0;
-
-   store[X + 0] -= floorf( store[X + 0] );
-   store[X + 1] -= floorf( store[X + 1] );
-   store[X + 2] -= floorf( store[X + 2] );
-   store[X + 3] -= floorf( store[X + 3] );
-}
-
-static void
-emit_frc(
-   struct x86_function *func,
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_dst,
-      frc4f );
-}
-
-static void PIPE_CDECL
-lg24f(
-   float *store )
-{
-   const unsigned X = 0;
-
-   store[X + 0] = LOG2( store[X + 0] );
-   store[X + 1] = LOG2( store[X + 1] );
-   store[X + 2] = LOG2( store[X + 2] );
-   store[X + 3] = LOG2( store[X + 3] );
-}
-
-static void
-emit_lg2(
-   struct x86_function *func,
-   unsigned xmm_dst )
-{
-   emit_func_call_dst(
-      func,
-      xmm_dst,
-      lg24f );
-}
-
-static void
-emit_MOV(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_mul (struct x86_function *func,
-          unsigned xmm_dst,
-          unsigned xmm_src)
-{
-   sse_mulps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_neg(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_xorps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-pow4f(
-   float *store )
-{
-   const unsigned X = 0;
-
-   store[X + 0] = powf( store[X + 0], store[X + 4] );
-   store[X + 1] = powf( store[X + 1], store[X + 5] );
-   store[X + 2] = powf( store[X + 2], store[X + 6] );
-   store[X + 3] = powf( store[X + 3], store[X + 7] );
-}
-
-static void
-emit_pow(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   emit_func_call_dst_src(
-      func,
-      xmm_dst,
-      xmm_src,
-      pow4f );
-}
-
-static void
-emit_rcp (
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.  Need to either emit a proper divide or use the
-    * iterative technique described below in emit_rsqrt().
-    */
-   sse2_rcpps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_rsqrt(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-#if HIGH_PRECISION
-   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
-    * implementations, it is possible to improve its precision at
-    * fairly low cost, using a newton/raphson step, as below:
-    * 
-    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
-    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
-    *
-    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
-    */
-   {
-      struct x86_reg dst = make_xmm( xmm_dst );
-      struct x86_reg src = make_xmm( xmm_src );
-      struct x86_reg tmp0 = make_xmm( 2 );
-      struct x86_reg tmp1 = make_xmm( 3 );
-
-      assert( xmm_dst != xmm_src );
-      assert( xmm_dst != 2 && xmm_dst != 3 );
-      assert( xmm_src != 2 && xmm_src != 3 );
-
-      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
-      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
-      sse_rsqrtps( func, tmp1, src  );
-      sse_mulps(   func, src,  tmp1 );
-      sse_mulps(   func, dst,  tmp1 );
-      sse_mulps(   func, src,  tmp1 );
-      sse_subps(   func, tmp0, src  );
-      sse_mulps(   func, dst,  tmp0 );
-   }
-#else
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.
-    */
-   sse_rsqrtps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-#endif
-}
-
-static void
-emit_setsign(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_orps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-sin4f(
-   float *store )
-{
-   const unsigned X = 0;
-
-   store[X + 0] = sinf( store[X + 0] );
-   store[X + 1] = sinf( store[X + 1] );
-   store[X + 2] = sinf( store[X + 2] );
-   store[X + 3] = sinf( store[X + 3] );
-}
-
-static void
-emit_sin (struct x86_function *func,
-          unsigned xmm_dst)
-{
-   emit_func_call_dst(
-      func,
-      xmm_dst,
-      sin4f );
-}
-
-static void
-emit_sub(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_subps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-/**
- * Register fetch.
- */
-
-static void
-emit_fetch(
-   struct x86_function *func,
-   unsigned xmm,
-   const struct tgsi_full_src_register *reg,
-   const unsigned chan_index )
-{
-   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
-
-   switch( swizzle ) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
-      switch( reg->SrcRegister.File ) {
-      case TGSI_FILE_CONSTANT:
-         emit_const(
-            func,
-            xmm,
-            reg->SrcRegister.Index,
-            swizzle );
-         break;
-
-      case TGSI_FILE_IMMEDIATE:
-         emit_immediate(
-            func,
-            xmm,
-            reg->SrcRegister.Index,
-            swizzle );
-         break;
-
-      case TGSI_FILE_INPUT:
-         emit_inputf(
-            func,
-            xmm,
-            reg->SrcRegister.Index,
-            swizzle );
-         break;
-
-      case TGSI_FILE_TEMPORARY:
-         emit_tempf(
-            func,
-            xmm,
-            reg->SrcRegister.Index,
-            swizzle );
-         break;
-
-      default:
-         assert( 0 );
-      }
-      break;
-
-   case TGSI_EXTSWIZZLE_ZERO:
-      emit_tempf(
-         func,
-         xmm,
-         TGSI_EXEC_TEMP_00000000_I,
-         TGSI_EXEC_TEMP_00000000_C );
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      emit_tempf(
-         func,
-         xmm,
-         TGSI_EXEC_TEMP_ONE_I,
-         TGSI_EXEC_TEMP_ONE_C );
-      break;
-
-   default:
-      assert( 0 );
-   }
-
-   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      emit_abs( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      emit_setsign( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      emit_neg( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
-   }
-}
-
-#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
-   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
-
-/**
- * Register store.
- */
-
-static void
-emit_store(
-   struct x86_function *func,
-   unsigned xmm,
-   const struct tgsi_full_dst_register *reg,
-   const struct tgsi_full_instruction *inst,
-   unsigned chan_index )
-{
-   switch( reg->DstRegister.File ) {
-   case TGSI_FILE_OUTPUT:
-      emit_output(
-         func,
-         xmm,
-         reg->DstRegister.Index,
-         chan_index );
-      break;
-
-   case TGSI_FILE_TEMPORARY:
-      emit_temps(
-         func,
-         xmm,
-         reg->DstRegister.Index,
-         chan_index );
-      break;
-
-   case TGSI_FILE_ADDRESS:
-      emit_addrs(
-         func,
-         xmm,
-         reg->DstRegister.Index,
-         chan_index );
-      break;
-
-   default:
-      assert( 0 );
-   }
-
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      /* assert( 0 ); */
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
-      break;
-   }
-}
-
-#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
-   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
-
-/**
- * High-level instruction translators.
- */
-
-static void
-emit_kil(
-   struct x86_function *func,
-   const struct tgsi_full_src_register *reg )
-{
-   unsigned uniquemask;
-   unsigned registers[4];
-   unsigned nextregister = 0;
-   unsigned firstchan = ~0;
-   unsigned chan_index;
-
-   /* This mask stores component bits that were already tested. Note that
-    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      unsigned swizzle;
-
-      /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle(
-         reg,
-         chan_index );
-
-      /* check if the component has not been already tested */
-      if( !(uniquemask & (1 << swizzle)) ) {
-         uniquemask |= 1 << swizzle;
-
-         /* allocate register */
-         registers[chan_index] = nextregister;
-         emit_fetch(
-            func,
-            nextregister,
-            reg,
-            chan_index );
-         nextregister++;
-
-         /* mark the first channel used */
-         if( firstchan == ~0 ) {
-            firstchan = chan_index;
-         }
-      }
-   }
-
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      if( uniquemask & (1 << chan_index) ) {
-         sse_cmpps(
-            func,
-            make_xmm( registers[chan_index] ),
-            get_temp(
-               TGSI_EXEC_TEMP_00000000_I,
-               TGSI_EXEC_TEMP_00000000_C ),
-            cc_LessThan );
-
-         if( chan_index == firstchan ) {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               make_xmm( registers[chan_index] ) );
-         }
-         else {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_DX ),
-               make_xmm( registers[chan_index] ) );
-            x86_or(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               x86_make_reg( file_REG32, reg_DX ) );
-         }
-      }
-   }
-
-   x86_or(
-      func,
-      get_temp(
-         TGSI_EXEC_TEMP_KILMASK_I,
-         TGSI_EXEC_TEMP_KILMASK_C ),
-      x86_make_reg( file_REG32, reg_AX ) );
-
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-}
-
-static void
-emit_setcc(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst,
-   enum sse_cc cc )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ),
-         cc );
-      sse_andps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TGSI_EXEC_TEMP_ONE_I,
-            TGSI_EXEC_TEMP_ONE_C ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-
-static void
-emit_cmp(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      FETCH( func, *inst, 2, 2, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TGSI_EXEC_TEMP_00000000_I,
-            TGSI_EXEC_TEMP_00000000_C ),
-         cc_LessThan );
-      sse_andps(
-         func,
-         make_xmm( 1 ),
-         make_xmm( 0 ) );
-      sse_andnps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 2 ) );
-      sse_orps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-
-static int
-emit_instruction(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst )
-{
-   unsigned chan_index;
-
-   switch( inst->Instruction.Opcode ) {
-   case TGSI_OPCODE_ARL:
-#if 0
-      /* XXX this isn't working properly (see glean vertProg1 test) */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_f2it( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-#else
-      return 0;
-#endif
-      break;
-
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LIT:
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-         emit_tempf(
-            func,
-            0,
-            TGSI_EXEC_TEMP_ONE_I,
-            TGSI_EXEC_TEMP_ONE_C);
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-            STORE( func, *inst, 0, 0, CHAN_X );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-            STORE( func, *inst, 0, 0, CHAN_W );
-         }
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_maxps(
-               func,
-               make_xmm( 0 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            STORE( func, *inst, 0, 0, CHAN_Y );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-            /* XMM[1] = SrcReg[0].yyyy */
-            FETCH( func, *inst, 1, 0, CHAN_Y );
-            /* XMM[1] = max(XMM[1], 0) */
-            sse_maxps(
-               func,
-               make_xmm( 1 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            /* XMM[2] = SrcReg[0].wwww */
-            FETCH( func, *inst, 2, 0, CHAN_W );
-            /* XMM[2] = min(XMM[2], 128.0) */
-            sse_minps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_128_I,
-                  TGSI_EXEC_TEMP_128_C ) );
-            /* XMM[2] = max(XMM[2], -128.0) */
-            sse_maxps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_MINUS_128_I,
-                  TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 1, 2 );
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_xorps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 2 ) );
-            sse_cmpps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 0 ),
-               cc_LessThanEqual );
-            sse_andps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 1 ) );
-            STORE( func, *inst, 2, 0, CHAN_Z );
-         }
-      }
-      break;
-
-   case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rcp( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rsqrt( func, 1, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 1, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EXP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_LOG:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_MUL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ADD:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_add( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP4:
-   /* TGSI_OPCODE_DOT4 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul(func, 1, 2 );
-      emit_add(func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_W );
-      FETCH( func, *inst, 2, 1, CHAN_W );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DST:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_tempf(
-            func,
-            0,
-            TGSI_EXEC_TEMP_ONE_I,
-            TGSI_EXEC_TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 1, 1, CHAN_Y );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         FETCH( func, *inst, 0, 0, CHAN_Z );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-         FETCH( func, *inst, 0, 1, CHAN_W );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MIN:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_minps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MAX:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_maxps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
-      emit_setcc( func, inst, cc_LessThan );
-      break;
-
-   case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
-      emit_setcc( func, inst, cc_NotLessThan );
-      break;
-
-   case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SUB:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_sub( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LERP:
-   /* TGSI_OPCODE_LRP */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_sub( func, 1, 2 );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CND0:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DOT2ADD:
-   /* TGSI_OPCODE_DP2A */
-      return 0;
-      break;
-
-   case TGSI_OPCODE_INDEX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NEGATE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_FRAC:
-   /* TGSI_OPCODE_FRC */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CLAMP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_FLOOR:
-   /* TGSI_OPCODE_FLR */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ROUND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_EXPBASE2:
-   /* TGSI_OPCODE_EX2 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LOGBASE2:
-   /* TGSI_OPCODE_LG2 */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_POWER:
-   /* TGSI_OPCODE_POW */
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CROSSPRODUCT:
-   /* TGSI_OPCODE_XPD */
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( func, *inst, 1, 1, CHAN_Z );
-         FETCH( func, *inst, 3, 0, CHAN_Z );
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 4, 1, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_MOV( func, 2, 0 );
-         emit_mul( func, 2, 1 );
-         emit_MOV( func, 5, 3 );
-         emit_mul( func, 5, 4 );
-         emit_sub( func, 2, 5 );
-         STORE( func, *inst, 2, 0, CHAN_X );
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 2, 1, CHAN_X );
-         FETCH( func, *inst, 5, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         emit_mul( func, 3, 2 );
-         emit_mul( func, 1, 5 );
-         emit_sub( func, 3, 1 );
-         STORE( func, *inst, 3, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_mul( func, 5, 4 );
-         emit_mul( func, 0, 2 );
-         emit_sub( func, 5, 0 );
-         STORE( func, *inst, 5, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TGSI_EXEC_TEMP_ONE_I,
-	    TGSI_EXEC_TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MULTIPLYMATRIX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ABS:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_abs( func, 0) ;
-
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RCC:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DPH:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 1, CHAN_W );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_COS:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DDX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DDY:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_KIL:
-      emit_kil( func, &inst->FullSrcRegisters[0] );
-      break;
-
-   case TGSI_OPCODE_PK2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SEQ:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SGT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SIN:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SNE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_STR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TEX:
-      if (0) {
-	 /* Disable dummy texture code: 
-	  */
-	 emit_tempf(
-	    func,
-	    0,
-	    TGSI_EXEC_TEMP_ONE_I,
-	    TGSI_EXEC_TEMP_ONE_C );
-	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-	    STORE( func, *inst, 0, 0, chan_index );
-	 }
-      }
-      else {
-	 return 0;
-      }
-      break;
-
-   case TGSI_OPCODE_TXD:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_X2D:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_BRA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CAL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RET:
-      emit_ret( func );
-      break;
-
-   case TGSI_OPCODE_END:
-      break;
-
-   case TGSI_OPCODE_SSG:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CMP:
-      emit_cmp (func, inst);
-      break;
-
-   case TGSI_OPCODE_SCS:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0 );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TGSI_EXEC_TEMP_00000000_I,
-	    TGSI_EXEC_TEMP_00000000_C );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TGSI_EXEC_TEMP_ONE_I,
-	    TGSI_EXEC_TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_TXB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NRM:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DIV:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DP2:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TXL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_BRK:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_IF:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_LOOP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_REP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ELSE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDIF:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDLOOP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDREP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PUSHA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_POPA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CEIL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_I2F:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NOT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TRUNC:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SHL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SHR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_AND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_OR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_MOD:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_XOR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SAD:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TXF:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TXQ:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CONT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_EMIT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDPRIM:
-      return 0;
-      break;
-
-   default:
-      return 0;
-   }
-   
-   return 1;
-}
-
-static void
-emit_declaration(
-   struct x86_function *func,
-   struct tgsi_full_declaration *decl )
-{
-   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
-      unsigned first, last, mask;
-      unsigned i, j;
-
-      first = decl->DeclarationRange.First;
-      last = decl->DeclarationRange.Last;
-      mask = decl->Declaration.UsageMask;
-
-      for( i = first; i <= last; i++ ) {
-         for( j = 0; j < NUM_CHANNELS; j++ ) {
-            if( mask & (1 << j) ) {
-               switch( decl->Declaration.Interpolate ) {
-               case TGSI_INTERPOLATE_CONSTANT:
-                  emit_coef_a0( func, 0, i, j );
-                  emit_inputs( func, 0, i, j );
-                  break;
-
-               case TGSI_INTERPOLATE_LINEAR:
-                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
-                  emit_coef_dadx( func, 1, i, j );
-                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
-                  emit_coef_dady( func, 3, i, j );
-                  emit_mul( func, 0, 1 );    /* x * dadx */
-                  emit_coef_a0( func, 4, i, j );
-                  emit_mul( func, 2, 3 );    /* y * dady */
-                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
-                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
-                  emit_inputs( func, 0, i, j );
-                  break;
-
-               case TGSI_INTERPOLATE_PERSPECTIVE:
-                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
-                  emit_coef_dadx( func, 1, i, j );
-                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
-                  emit_coef_dady( func, 3, i, j );
-                  emit_mul( func, 0, 1 );    /* x * dadx */
-                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
-                  emit_coef_a0( func, 5, i, j );
-                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
-                  emit_mul( func, 2, 3 );    /* y * dady */
-                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
-                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
-                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
-                  emit_inputs( func, 0, i, j );
-                  break;
-
-               default:
-                  assert( 0 );
-		  break;
-               }
-            }
-         }
-      }
-   }
-}
-
-static void aos_to_soa( struct x86_function *func, 
-                        uint arg_aos,
-                        uint arg_soa, 
-                        uint arg_num, 
-                        uint arg_stride )
-{
-   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
-   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
-   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
-   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
-   int inner_loop;
-
-
-   /* Save EBX */
-   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
-
-   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
-   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
-   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
-
-   /* do */
-   inner_loop = x86_get_label( func );
-   {
-      x86_push( func, aos_input );
-      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_pop( func, aos_input );
-
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
-      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
-      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
-      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
-
-      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
-      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
-      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
-      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
-
-      /* Advance to next input */
-      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
-      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
-   }
-   /* while --num_inputs */
-   x86_dec( func, num_inputs );
-   x86_jcc( func, cc_NE, inner_loop );
-
-   /* Restore EBX */
-   x86_pop( func, aos_input );
-}
-
-static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
-{
-   struct x86_reg soa_output;
-   struct x86_reg aos_output;
-   struct x86_reg num_outputs;
-   struct x86_reg temp;
-   int inner_loop;
-
-   soa_output = x86_make_reg( file_REG32, reg_AX );
-   aos_output = x86_make_reg( file_REG32, reg_BX );
-   num_outputs = x86_make_reg( file_REG32, reg_CX );
-   temp = x86_make_reg( file_REG32, reg_DX );
-
-   /* Save EBX */
-   x86_push( func, aos_output );
-
-   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
-   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
-   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
-
-   /* do */
-   inner_loop = x86_get_label( func );
-   {
-      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
-      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
-      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
-      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
-
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
-      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
-      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
-      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
-
-      x86_mov( func, temp, x86_fn_arg( func, stride ) );
-      x86_push( func, aos_output );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_pop( func, aos_output );
-
-      /* Advance to next output */
-      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
-      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
-   }
-   /* while --num_outputs */
-   x86_dec( func, num_outputs );
-   x86_jcc( func, cc_NE, inner_loop );
-
-   /* Restore EBX */
-   x86_pop( func, aos_output );
-}
-
-/**
- * Translate a TGSI vertex/fragment shader to SSE2 code.
- * Slightly different things are done for vertex vs. fragment shaders.
- *
- * Note that fragment shaders are responsible for interpolating shader
- * inputs. Because on x86 we have only 4 GP registers, and here we
- * have 5 shader arguments (input, output, const, temp and coef), the
- * code is split into two phases -- DECLARATION and INSTRUCTION phase.
- * GP register holding the output argument is aliased with the coeff
- * argument, as outputs are not needed in the DECLARATION phase.
- *
- * \param tokens  the TGSI input shader
- * \param func  the output SSE code/function
- * \param immediates  buffer to place immediates, later passed to SSE func
- * \param return  1 for success, 0 if translation failed
- */
-unsigned
-tgsi_emit_sse2(
-   const struct tgsi_token *tokens,
-   struct x86_function *func,
-   float (*immediates)[4],
-   boolean do_swizzles )
-{
-   struct tgsi_parse_context parse;
-   boolean instruction_phase = FALSE;
-   unsigned ok = 1;
-   uint num_immediates = 0;
-
-   func->csr = func->store;
-
-   tgsi_parse_init( &parse, tokens );
-
-   /* Can't just use EDI, EBX without save/restoring them:
-    */
-   x86_push(
-      func,
-      get_immediate_base() );
-
-   x86_push(
-      func,
-      get_temp_base() );
-
-
-   /*
-    * Different function args for vertex/fragment shaders:
-    */
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-      /* DECLARATION phase, do not load output argument. */
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      /* skipping outputs argument here */
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_coef_base(),
-         x86_fn_arg( func, 5 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 6 ) );
-   }
-   else {
-      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
-
-      if (do_swizzles)
-         aos_to_soa( func, 
-                     6,         /* aos_input */
-                     1,         /* machine->input */
-                     7,         /* num_inputs */
-                     8 );       /* input_stride */
-
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      x86_mov(
-         func,
-         get_output_base(),
-         x86_fn_arg( func, 2 ) );
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 5 ) );
-   }
-
-   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-            emit_declaration(
-               func,
-               &parse.FullToken.FullDeclaration );
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-            if( !instruction_phase ) {
-               /* INSTRUCTION phase, overwrite coeff with output. */
-               instruction_phase = TRUE;
-               x86_mov(
-                  func,
-                  get_output_base(),
-                  x86_fn_arg( func, 2 ) );
-            }
-         }
-
-         ok = emit_instruction(
-            func,
-            &parse.FullToken.FullInstruction );
-
-	 if (!ok) {
-	    debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n", 
-			 parse.FullToken.FullInstruction.Instruction.Opcode,
-                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
-                         "vertex shader" : "fragment shader");
-	 }
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* simply copy the immediate values into the next immediates[] slot */
-         {
-            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
-            uint i;
-            assert(size <= 4);
-            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
-            for( i = 0; i < size; i++ ) {
-               immediates[num_immediates][i] =
-		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
-            }
-#if 0
-            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
-                   num_immediates,
-                   immediates[num_immediates][0],
-                   immediates[num_immediates][1],
-                   immediates[num_immediates][2],
-                   immediates[num_immediates][3]);
-#endif
-            num_immediates++;
-         }
-         break;
-
-      default:
-	 ok = 0;
-         assert( 0 );
-      }
-   }
-
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
-      if (do_swizzles)
-         soa_to_aos( func, 9, 2, 10, 11 );
-   }
-
-   /* Can't just use EBX, EDI without save/restoring them:
-    */
-   x86_pop(
-      func,
-      get_temp_base() );
-
-   x86_pop(
-      func,
-      get_immediate_base() );
-
-   emit_ret( func );
-
-   tgsi_parse_free( &parse );
-
-   return ok;
-}
-
-#endif /* PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
deleted file mode 100755
index af838b2a25..0000000000
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_SSE2_H
-#define TGSI_SSE2_H
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-struct tgsi_token;
-struct x86_function;
-
-unsigned
-tgsi_emit_sse2(
-   const struct tgsi_token *tokens,
-   struct x86_function *function,
-   float (*immediates)[4],
-   boolean do_swizzles );
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_SSE2_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
new file mode 100644
index 0000000000..742ef14c35
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -0,0 +1,1324 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi_build.h"
+#include "tgsi_parse.h"
+
+/*
+ * version
+ */
+
+struct tgsi_version
+tgsi_build_version( void )
+{
+   struct tgsi_version  version;
+
+   version.MajorVersion = 1;
+   version.MinorVersion = 1;
+   version.Padding = 0;
+
+   return version;
+}
+
+/*
+ * header
+ */
+
+struct tgsi_header
+tgsi_build_header( void )
+{
+   struct tgsi_header header;
+
+   header.HeaderSize = 1;
+   header.BodySize = 0;
+
+   return header;
+}
+
+static void
+header_headersize_grow( struct tgsi_header *header )
+{
+   assert( header->HeaderSize < 0xFF );
+   assert( header->BodySize == 0 );
+
+   header->HeaderSize++;
+}
+
+static void
+header_bodysize_grow( struct tgsi_header *header )
+{
+   assert( header->BodySize < 0xFFFFFF );
+
+   header->BodySize++;
+}
+
+struct tgsi_processor
+tgsi_default_processor( void )
+{
+   struct tgsi_processor processor;
+
+   processor.Processor = TGSI_PROCESSOR_FRAGMENT;
+   processor.Padding = 0;
+
+   return processor;
+}
+
+struct tgsi_processor
+tgsi_build_processor(
+   unsigned type,
+   struct tgsi_header *header )
+{
+   struct tgsi_processor processor;
+
+   processor = tgsi_default_processor();
+   processor.Processor = type;
+
+   header_headersize_grow( header );
+
+   return processor;
+}
+
+/*
+ * declaration
+ */
+
+struct tgsi_declaration
+tgsi_default_declaration( void )
+{
+   struct tgsi_declaration declaration;
+
+   declaration.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   declaration.Size = 1;
+   declaration.File = TGSI_FILE_NULL;
+   declaration.UsageMask = TGSI_WRITEMASK_XYZW;
+   declaration.Interpolate = TGSI_INTERPOLATE_CONSTANT;
+   declaration.Semantic = 0;
+   declaration.Padding = 0;
+   declaration.Extended = 0;
+
+   return declaration;
+}
+
+struct tgsi_declaration
+tgsi_build_declaration(
+   unsigned file,
+   unsigned usage_mask,
+   unsigned interpolate,
+   unsigned semantic,
+   struct tgsi_header *header )
+{
+   struct tgsi_declaration declaration;
+
+   assert( file <= TGSI_FILE_IMMEDIATE );
+   assert( interpolate <= TGSI_INTERPOLATE_PERSPECTIVE );
+
+   declaration = tgsi_default_declaration();
+   declaration.File = file;
+   declaration.UsageMask = usage_mask;
+   declaration.Interpolate = interpolate;
+   declaration.Semantic = semantic;
+
+   header_bodysize_grow( header );
+
+   return declaration;
+}
+
+static void
+declaration_grow(
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header )
+{
+   assert( declaration->Size < 0xFF );
+
+   declaration->Size++;
+
+   header_bodysize_grow( header );
+}
+
+struct tgsi_full_declaration
+tgsi_default_full_declaration( void )
+{
+   struct tgsi_full_declaration  full_declaration;
+
+   full_declaration.Declaration  = tgsi_default_declaration();
+   full_declaration.DeclarationRange = tgsi_default_declaration_range();
+   full_declaration.Semantic = tgsi_default_declaration_semantic();
+
+   return full_declaration;
+}
+
+unsigned
+tgsi_build_full_declaration(
+   const struct tgsi_full_declaration *full_decl,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize )
+{
+   unsigned size = 0;
+   struct tgsi_declaration *declaration;
+   struct tgsi_declaration_range *dr;
+
+   if( maxsize <= size )
+     return 0;
+   declaration = (struct tgsi_declaration *) &tokens[size];
+   size++;
+
+   *declaration = tgsi_build_declaration(
+      full_decl->Declaration.File,
+      full_decl->Declaration.UsageMask,
+      full_decl->Declaration.Interpolate,
+      full_decl->Declaration.Semantic,
+      header );
+
+   if (maxsize <= size)
+      return 0;
+   dr = (struct tgsi_declaration_range *) &tokens[size];
+   size++;
+
+   *dr = tgsi_build_declaration_range(
+      full_decl->DeclarationRange.First,
+      full_decl->DeclarationRange.Last,
+      declaration,
+      header );
+
+   if( full_decl->Declaration.Semantic ) {
+      struct tgsi_declaration_semantic *ds;
+
+      if( maxsize <= size )
+         return  0;
+      ds = (struct tgsi_declaration_semantic *) &tokens[size];
+      size++;
+
+      *ds = tgsi_build_declaration_semantic(
+         full_decl->Semantic.SemanticName,
+         full_decl->Semantic.SemanticIndex,
+         declaration,
+         header );
+   }
+
+   return size;
+}
+
+struct tgsi_declaration_range
+tgsi_default_declaration_range( void )
+{
+   struct tgsi_declaration_range dr;
+
+   dr.First = 0;
+   dr.Last = 0;
+
+   return dr;
+}
+
+struct tgsi_declaration_range
+tgsi_build_declaration_range(
+   unsigned first,
+   unsigned last,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header )
+{
+   struct tgsi_declaration_range declaration_range;
+
+   assert( last >= first );
+   assert( last <= 0xFFFF );
+
+   declaration_range = tgsi_default_declaration_range();
+   declaration_range.First = first;
+   declaration_range.Last = last;
+
+   declaration_grow( declaration, header );
+
+   return declaration_range;
+}
+
+struct tgsi_declaration_semantic
+tgsi_default_declaration_semantic( void )
+{
+   struct tgsi_declaration_semantic ds;
+
+   ds.SemanticName = TGSI_SEMANTIC_POSITION;
+   ds.SemanticIndex = 0;
+   ds.Padding = 0;
+
+   return ds;
+}
+
+struct tgsi_declaration_semantic
+tgsi_build_declaration_semantic(
+   unsigned semantic_name,
+   unsigned semantic_index,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header )
+{
+   struct tgsi_declaration_semantic ds;
+
+   assert( semantic_name <= TGSI_SEMANTIC_COUNT );
+   assert( semantic_index <= 0xFFFF );
+
+   ds = tgsi_default_declaration_semantic();
+   ds.SemanticName = semantic_name;
+   ds.SemanticIndex = semantic_index;
+
+   declaration_grow( declaration, header );
+
+   return ds;
+}
+
+/*
+ * immediate
+ */
+
+struct tgsi_immediate
+tgsi_default_immediate( void )
+{
+   struct tgsi_immediate immediate;
+
+   immediate.Type = TGSI_TOKEN_TYPE_IMMEDIATE;
+   immediate.Size = 1;
+   immediate.DataType = TGSI_IMM_FLOAT32;
+   immediate.Padding = 0;
+   immediate.Extended = 0;
+
+   return immediate;
+}
+
+struct tgsi_immediate
+tgsi_build_immediate(
+   struct tgsi_header *header )
+{
+   struct tgsi_immediate immediate;
+
+   immediate = tgsi_default_immediate();
+
+   header_bodysize_grow( header );
+
+   return immediate;
+}
+
+struct tgsi_full_immediate
+tgsi_default_full_immediate( void )
+{
+   struct tgsi_full_immediate fullimm;
+
+   fullimm.Immediate = tgsi_default_immediate();
+   fullimm.u.Pointer = (void *) 0;
+
+   return fullimm;
+}
+
+static void
+immediate_grow(
+   struct tgsi_immediate *immediate,
+   struct tgsi_header *header )
+{
+   assert( immediate->Size < 0xFF );
+
+   immediate->Size++;
+
+   header_bodysize_grow( header );
+}
+
+struct tgsi_immediate_float32
+tgsi_build_immediate_float32(
+   float value,
+   struct tgsi_immediate *immediate,
+   struct tgsi_header *header )
+{
+   struct tgsi_immediate_float32 immediate_float32;
+
+   immediate_float32.Float = value;
+
+   immediate_grow( immediate, header );
+
+   return immediate_float32;
+}
+
+unsigned
+tgsi_build_full_immediate(
+   const struct tgsi_full_immediate *full_imm,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize )
+{
+   unsigned size = 0, i;
+   struct tgsi_immediate *immediate;
+
+   if( maxsize <= size )
+      return 0;
+   immediate = (struct tgsi_immediate *) &tokens[size];
+   size++;
+
+   *immediate = tgsi_build_immediate( header );
+
+   for( i = 0; i < full_imm->Immediate.Size - 1; i++ ) {
+      struct tgsi_immediate_float32 *if32;
+
+      if( maxsize <= size )
+         return  0;
+      if32 = (struct tgsi_immediate_float32 *) &tokens[size];
+      size++;
+
+      *if32 = tgsi_build_immediate_float32(
+         full_imm->u.ImmediateFloat32[i].Float,
+         immediate,
+         header );
+   }
+
+   return size;
+}
+
+/*
+ * instruction
+ */
+
+struct tgsi_instruction
+tgsi_default_instruction( void )
+{
+   struct tgsi_instruction instruction;
+
+   instruction.Type = TGSI_TOKEN_TYPE_INSTRUCTION;
+   instruction.Size = 1;
+   instruction.Opcode = TGSI_OPCODE_MOV;
+   instruction.Saturate = TGSI_SAT_NONE;
+   instruction.NumDstRegs = 1;
+   instruction.NumSrcRegs = 1;
+   instruction.Padding  = 0;
+   instruction.Extended = 0;
+
+   return instruction;
+}
+
+struct tgsi_instruction
+tgsi_build_instruction(
+   unsigned opcode,
+   unsigned saturate,
+   unsigned num_dst_regs,
+   unsigned num_src_regs,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction instruction;
+
+   assert (opcode <= TGSI_OPCODE_LAST);
+   assert (saturate <= TGSI_SAT_MINUS_PLUS_ONE);
+   assert (num_dst_regs <= 3);
+   assert (num_src_regs <= 15);
+
+   instruction = tgsi_default_instruction();
+   instruction.Opcode = opcode;
+   instruction.Saturate = saturate;
+   instruction.NumDstRegs = num_dst_regs;
+   instruction.NumSrcRegs = num_src_regs;
+
+   header_bodysize_grow( header );
+
+   return instruction;
+}
+
+static void
+instruction_grow(
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   assert (instruction->Size <   0xFF);
+
+   instruction->Size++;
+
+   header_bodysize_grow( header );
+}
+
+struct tgsi_full_instruction
+tgsi_default_full_instruction( void )
+{
+   struct tgsi_full_instruction full_instruction;
+   unsigned i;
+
+   full_instruction.Instruction = tgsi_default_instruction();
+   full_instruction.InstructionExtNv = tgsi_default_instruction_ext_nv();
+   full_instruction.InstructionExtLabel = tgsi_default_instruction_ext_label();
+   full_instruction.InstructionExtTexture = tgsi_default_instruction_ext_texture();
+   for( i = 0;  i < TGSI_FULL_MAX_DST_REGISTERS; i++ ) {
+      full_instruction.FullDstRegisters[i] = tgsi_default_full_dst_register();
+   }
+   for( i = 0;  i < TGSI_FULL_MAX_SRC_REGISTERS; i++ ) {
+      full_instruction.FullSrcRegisters[i] = tgsi_default_full_src_register();
+   }
+
+   return full_instruction;
+}
+
+unsigned
+tgsi_build_full_instruction(
+   const struct tgsi_full_instruction *full_inst,
+   struct  tgsi_token *tokens,
+   struct  tgsi_header *header,
+   unsigned  maxsize )
+{
+   unsigned size = 0;
+   unsigned i;
+   struct tgsi_instruction *instruction;
+   struct tgsi_token *prev_token;
+
+   if( maxsize <= size )
+      return 0;
+   instruction = (struct tgsi_instruction *) &tokens[size];
+   size++;
+
+   *instruction = tgsi_build_instruction(
+      full_inst->Instruction.Opcode,
+      full_inst->Instruction.Saturate,
+      full_inst->Instruction.NumDstRegs,
+      full_inst->Instruction.NumSrcRegs,
+      header );
+   prev_token = (struct tgsi_token  *) instruction;
+
+   if( tgsi_compare_instruction_ext_nv(
+         full_inst->InstructionExtNv,
+         tgsi_default_instruction_ext_nv() ) ) {
+      struct tgsi_instruction_ext_nv *instruction_ext_nv;
+
+      if( maxsize <= size )
+         return 0;
+      instruction_ext_nv =
+         (struct  tgsi_instruction_ext_nv *) &tokens[size];
+      size++;
+
+      *instruction_ext_nv  = tgsi_build_instruction_ext_nv(
+         full_inst->InstructionExtNv.Precision,
+         full_inst->InstructionExtNv.CondDstIndex,
+         full_inst->InstructionExtNv.CondFlowIndex,
+         full_inst->InstructionExtNv.CondMask,
+         full_inst->InstructionExtNv.CondSwizzleX,
+         full_inst->InstructionExtNv.CondSwizzleY,
+         full_inst->InstructionExtNv.CondSwizzleZ,
+         full_inst->InstructionExtNv.CondSwizzleW,
+         full_inst->InstructionExtNv.CondDstUpdate,
+         full_inst->InstructionExtNv.CondFlowEnable,
+         prev_token,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) instruction_ext_nv;
+   }
+
+   if( tgsi_compare_instruction_ext_label(
+         full_inst->InstructionExtLabel,
+         tgsi_default_instruction_ext_label() ) ) {
+      struct tgsi_instruction_ext_label *instruction_ext_label;
+
+      if( maxsize <= size )
+         return 0;
+      instruction_ext_label =
+         (struct  tgsi_instruction_ext_label *) &tokens[size];
+      size++;
+
+      *instruction_ext_label = tgsi_build_instruction_ext_label(
+         full_inst->InstructionExtLabel.Label,
+         prev_token,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) instruction_ext_label;
+   }
+
+   if( tgsi_compare_instruction_ext_texture(
+         full_inst->InstructionExtTexture,
+         tgsi_default_instruction_ext_texture() ) ) {
+      struct tgsi_instruction_ext_texture *instruction_ext_texture;
+
+      if( maxsize <= size )
+         return 0;
+      instruction_ext_texture =
+         (struct  tgsi_instruction_ext_texture *) &tokens[size];
+      size++;
+
+      *instruction_ext_texture = tgsi_build_instruction_ext_texture(
+         full_inst->InstructionExtTexture.Texture,
+         prev_token,
+         instruction,
+         header   );
+      prev_token = (struct tgsi_token  *) instruction_ext_texture;
+   }
+
+   for( i = 0;  i <   full_inst->Instruction.NumDstRegs; i++ ) {
+      const struct tgsi_full_dst_register *reg = &full_inst->FullDstRegisters[i];
+      struct tgsi_dst_register *dst_register;
+      struct tgsi_token *prev_token;
+
+      if( maxsize <= size )
+         return 0;
+      dst_register = (struct tgsi_dst_register *) &tokens[size];
+      size++;
+
+      *dst_register = tgsi_build_dst_register(
+         reg->DstRegister.File,
+         reg->DstRegister.WriteMask,
+         reg->DstRegister.Index,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) dst_register;
+
+      if( tgsi_compare_dst_register_ext_concode(
+            reg->DstRegisterExtConcode,
+            tgsi_default_dst_register_ext_concode() ) ) {
+         struct tgsi_dst_register_ext_concode *dst_register_ext_concode;
+
+         if( maxsize <= size )
+            return 0;
+         dst_register_ext_concode =
+            (struct  tgsi_dst_register_ext_concode *) &tokens[size];
+         size++;
+
+         *dst_register_ext_concode =   tgsi_build_dst_register_ext_concode(
+            reg->DstRegisterExtConcode.CondMask,
+            reg->DstRegisterExtConcode.CondSwizzleX,
+            reg->DstRegisterExtConcode.CondSwizzleY,
+            reg->DstRegisterExtConcode.CondSwizzleZ,
+            reg->DstRegisterExtConcode.CondSwizzleW,
+            reg->DstRegisterExtConcode.CondSrcIndex,
+            prev_token,
+            instruction,
+            header );
+         prev_token = (struct tgsi_token  *) dst_register_ext_concode;
+      }
+
+      if( tgsi_compare_dst_register_ext_modulate(
+            reg->DstRegisterExtModulate,
+            tgsi_default_dst_register_ext_modulate() ) ) {
+         struct tgsi_dst_register_ext_modulate *dst_register_ext_modulate;
+
+         if( maxsize <= size )
+            return 0;
+         dst_register_ext_modulate =
+            (struct  tgsi_dst_register_ext_modulate *) &tokens[size];
+         size++;
+
+         *dst_register_ext_modulate = tgsi_build_dst_register_ext_modulate(
+            reg->DstRegisterExtModulate.Modulate,
+            prev_token,
+            instruction,
+            header );
+         prev_token = (struct tgsi_token  *) dst_register_ext_modulate;
+      }
+   }
+
+   for( i = 0;  i < full_inst->Instruction.NumSrcRegs; i++ ) {
+      const struct tgsi_full_src_register *reg = &full_inst->FullSrcRegisters[i];
+      struct tgsi_src_register *src_register;
+      struct tgsi_token *prev_token;
+
+      if( maxsize <= size )
+         return 0;
+      src_register = (struct tgsi_src_register *)  &tokens[size];
+      size++;
+
+      *src_register = tgsi_build_src_register(
+         reg->SrcRegister.File,
+         reg->SrcRegister.SwizzleX,
+         reg->SrcRegister.SwizzleY,
+         reg->SrcRegister.SwizzleZ,
+         reg->SrcRegister.SwizzleW,
+         reg->SrcRegister.Negate,
+         reg->SrcRegister.Indirect,
+         reg->SrcRegister.Dimension,
+         reg->SrcRegister.Index,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) src_register;
+
+      if( tgsi_compare_src_register_ext_swz(
+            reg->SrcRegisterExtSwz,
+            tgsi_default_src_register_ext_swz() ) ) {
+         struct tgsi_src_register_ext_swz *src_register_ext_swz;
+
+         /* Use of the extended swizzle requires the simple swizzle to be identity.
+          */
+         assert( reg->SrcRegister.SwizzleX == TGSI_SWIZZLE_X );
+         assert( reg->SrcRegister.SwizzleY == TGSI_SWIZZLE_Y );
+         assert( reg->SrcRegister.SwizzleZ == TGSI_SWIZZLE_Z );
+         assert( reg->SrcRegister.SwizzleW == TGSI_SWIZZLE_W );
+         assert( reg->SrcRegister.Negate == FALSE );
+
+         if( maxsize <= size )
+            return 0;
+         src_register_ext_swz =
+            (struct  tgsi_src_register_ext_swz *) &tokens[size];
+         size++;
+
+         *src_register_ext_swz = tgsi_build_src_register_ext_swz(
+            reg->SrcRegisterExtSwz.ExtSwizzleX,
+            reg->SrcRegisterExtSwz.ExtSwizzleY,
+            reg->SrcRegisterExtSwz.ExtSwizzleZ,
+            reg->SrcRegisterExtSwz.ExtSwizzleW,
+            reg->SrcRegisterExtSwz.NegateX,
+            reg->SrcRegisterExtSwz.NegateY,
+            reg->SrcRegisterExtSwz.NegateZ,
+            reg->SrcRegisterExtSwz.NegateW,
+            prev_token,
+            instruction,
+            header );
+         prev_token = (struct tgsi_token  *) src_register_ext_swz;
+      }
+
+      if( tgsi_compare_src_register_ext_mod(
+            reg->SrcRegisterExtMod,
+            tgsi_default_src_register_ext_mod() ) ) {
+         struct tgsi_src_register_ext_mod *src_register_ext_mod;
+
+         if( maxsize <= size )
+            return 0;
+         src_register_ext_mod =
+            (struct  tgsi_src_register_ext_mod *) &tokens[size];
+         size++;
+
+         *src_register_ext_mod = tgsi_build_src_register_ext_mod(
+            reg->SrcRegisterExtMod.Complement,
+            reg->SrcRegisterExtMod.Bias,
+            reg->SrcRegisterExtMod.Scale2X,
+            reg->SrcRegisterExtMod.Absolute,
+            reg->SrcRegisterExtMod.Negate,
+            prev_token,
+            instruction,
+            header );
+         prev_token = (struct tgsi_token  *) src_register_ext_mod;
+      }
+
+      if( reg->SrcRegister.Indirect ) {
+         struct  tgsi_src_register *ind;
+
+         if( maxsize <= size )
+            return 0;
+         ind = (struct tgsi_src_register *) &tokens[size];
+         size++;
+
+         *ind = tgsi_build_src_register(
+            reg->SrcRegisterInd.File,
+            reg->SrcRegisterInd.SwizzleX,
+            reg->SrcRegisterInd.SwizzleY,
+            reg->SrcRegisterInd.SwizzleZ,
+            reg->SrcRegisterInd.SwizzleW,
+            reg->SrcRegisterInd.Negate,
+            reg->SrcRegisterInd.Indirect,
+            reg->SrcRegisterInd.Dimension,
+            reg->SrcRegisterInd.Index,
+            instruction,
+            header );
+      }
+
+      if( reg->SrcRegister.Dimension ) {
+         struct  tgsi_dimension *dim;
+
+         assert( !reg->SrcRegisterDim.Dimension );
+
+         if( maxsize <= size )
+            return 0;
+         dim = (struct tgsi_dimension *) &tokens[size];
+         size++;
+
+         *dim = tgsi_build_dimension(
+            reg->SrcRegisterDim.Indirect,
+            reg->SrcRegisterDim.Index,
+            instruction,
+            header );
+
+         if( reg->SrcRegisterDim.Indirect ) {
+            struct tgsi_src_register *ind;
+
+            if( maxsize <= size )
+               return 0;
+            ind = (struct tgsi_src_register *) &tokens[size];
+            size++;
+
+            *ind = tgsi_build_src_register(
+               reg->SrcRegisterDimInd.File,
+               reg->SrcRegisterDimInd.SwizzleX,
+               reg->SrcRegisterDimInd.SwizzleY,
+               reg->SrcRegisterDimInd.SwizzleZ,
+               reg->SrcRegisterDimInd.SwizzleW,
+               reg->SrcRegisterDimInd.Negate,
+               reg->SrcRegisterDimInd.Indirect,
+               reg->SrcRegisterDimInd.Dimension,
+               reg->SrcRegisterDimInd.Index,
+               instruction,
+               header );
+         }
+      }
+   }
+
+   return size;
+}
+
+struct tgsi_instruction_ext_nv
+tgsi_default_instruction_ext_nv( void )
+{
+   struct tgsi_instruction_ext_nv instruction_ext_nv;
+
+   instruction_ext_nv.Type = TGSI_INSTRUCTION_EXT_TYPE_NV;
+   instruction_ext_nv.Precision = TGSI_PRECISION_DEFAULT;
+   instruction_ext_nv.CondDstIndex = 0;
+   instruction_ext_nv.CondFlowIndex = 0;
+   instruction_ext_nv.CondMask = TGSI_CC_TR;
+   instruction_ext_nv.CondSwizzleX = TGSI_SWIZZLE_X;
+   instruction_ext_nv.CondSwizzleY = TGSI_SWIZZLE_Y;
+   instruction_ext_nv.CondSwizzleZ = TGSI_SWIZZLE_Z;
+   instruction_ext_nv.CondSwizzleW = TGSI_SWIZZLE_W;
+   instruction_ext_nv.CondDstUpdate = 0;
+   instruction_ext_nv.CondFlowEnable = 0;
+   instruction_ext_nv.Padding = 0;
+   instruction_ext_nv.Extended = 0;
+
+   return instruction_ext_nv;
+}
+
+union token_u32
+{
+   unsigned u32;
+};
+
+unsigned
+tgsi_compare_instruction_ext_nv(
+   struct tgsi_instruction_ext_nv a,
+   struct tgsi_instruction_ext_nv b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_instruction_ext_nv
+tgsi_build_instruction_ext_nv(
+   unsigned precision,
+   unsigned cond_dst_index,
+   unsigned cond_flow_index,
+   unsigned cond_mask,
+   unsigned cond_swizzle_x,
+   unsigned cond_swizzle_y,
+   unsigned cond_swizzle_z,
+   unsigned cond_swizzle_w,
+   unsigned cond_dst_update,
+   unsigned cond_flow_update,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction_ext_nv instruction_ext_nv;
+
+   instruction_ext_nv = tgsi_default_instruction_ext_nv();
+   instruction_ext_nv.Precision = precision;
+   instruction_ext_nv.CondDstIndex = cond_dst_index;
+   instruction_ext_nv.CondFlowIndex = cond_flow_index;
+   instruction_ext_nv.CondMask = cond_mask;
+   instruction_ext_nv.CondSwizzleX = cond_swizzle_x;
+   instruction_ext_nv.CondSwizzleY = cond_swizzle_y;
+   instruction_ext_nv.CondSwizzleZ = cond_swizzle_z;
+   instruction_ext_nv.CondSwizzleW = cond_swizzle_w;
+   instruction_ext_nv.CondDstUpdate = cond_dst_update;
+   instruction_ext_nv.CondFlowEnable = cond_flow_update;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return instruction_ext_nv;
+}
+
+struct tgsi_instruction_ext_label
+tgsi_default_instruction_ext_label( void )
+{
+   struct tgsi_instruction_ext_label instruction_ext_label;
+
+   instruction_ext_label.Type = TGSI_INSTRUCTION_EXT_TYPE_LABEL;
+   instruction_ext_label.Label = 0;
+   instruction_ext_label.Padding = 0;
+   instruction_ext_label.Extended = 0;
+
+   return instruction_ext_label;
+}
+
+unsigned
+tgsi_compare_instruction_ext_label(
+   struct tgsi_instruction_ext_label a,
+   struct tgsi_instruction_ext_label b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_instruction_ext_label
+tgsi_build_instruction_ext_label(
+   unsigned label,
+   struct tgsi_token  *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction_ext_label instruction_ext_label;
+
+   instruction_ext_label = tgsi_default_instruction_ext_label();
+   instruction_ext_label.Label = label;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return instruction_ext_label;
+}
+
+struct tgsi_instruction_ext_texture
+tgsi_default_instruction_ext_texture( void )
+{
+   struct tgsi_instruction_ext_texture instruction_ext_texture;
+
+   instruction_ext_texture.Type = TGSI_INSTRUCTION_EXT_TYPE_TEXTURE;
+   instruction_ext_texture.Texture = TGSI_TEXTURE_UNKNOWN;
+   instruction_ext_texture.Padding = 0;
+   instruction_ext_texture.Extended = 0;
+
+   return instruction_ext_texture;
+}
+
+unsigned
+tgsi_compare_instruction_ext_texture(
+   struct tgsi_instruction_ext_texture a,
+   struct tgsi_instruction_ext_texture b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_instruction_ext_texture
+tgsi_build_instruction_ext_texture(
+   unsigned texture,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction_ext_texture instruction_ext_texture;
+
+   instruction_ext_texture = tgsi_default_instruction_ext_texture();
+   instruction_ext_texture.Texture = texture;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return instruction_ext_texture;
+}
+
+struct tgsi_src_register
+tgsi_default_src_register( void )
+{
+   struct tgsi_src_register src_register;
+
+   src_register.File = TGSI_FILE_NULL;
+   src_register.SwizzleX = TGSI_SWIZZLE_X;
+   src_register.SwizzleY = TGSI_SWIZZLE_Y;
+   src_register.SwizzleZ = TGSI_SWIZZLE_Z;
+   src_register.SwizzleW = TGSI_SWIZZLE_W;
+   src_register.Negate = 0;
+   src_register.Indirect = 0;
+   src_register.Dimension = 0;
+   src_register.Index = 0;
+   src_register.Extended = 0;
+
+   return src_register;
+}
+
+struct tgsi_src_register
+tgsi_build_src_register(
+   unsigned file,
+   unsigned swizzle_x,
+   unsigned swizzle_y,
+   unsigned swizzle_z,
+   unsigned swizzle_w,
+   unsigned negate,
+   unsigned indirect,
+   unsigned dimension,
+   int index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_src_register   src_register;
+
+   assert( file <= TGSI_FILE_IMMEDIATE );
+   assert( swizzle_x <= TGSI_SWIZZLE_W );
+   assert( swizzle_y <= TGSI_SWIZZLE_W );
+   assert( swizzle_z <= TGSI_SWIZZLE_W );
+   assert( swizzle_w <= TGSI_SWIZZLE_W );
+   assert( negate <= 1 );
+   assert( index >= -0x8000 && index <= 0x7FFF );
+
+   src_register = tgsi_default_src_register();
+   src_register.File = file;
+   src_register.SwizzleX = swizzle_x;
+   src_register.SwizzleY = swizzle_y;
+   src_register.SwizzleZ = swizzle_z;
+   src_register.SwizzleW = swizzle_w;
+   src_register.Negate = negate;
+   src_register.Indirect = indirect;
+   src_register.Dimension = dimension;
+   src_register.Index = index;
+
+   instruction_grow( instruction, header );
+
+   return src_register;
+}
+
+struct tgsi_full_src_register
+tgsi_default_full_src_register( void )
+{
+   struct tgsi_full_src_register full_src_register;
+
+   full_src_register.SrcRegister = tgsi_default_src_register();
+   full_src_register.SrcRegisterExtSwz = tgsi_default_src_register_ext_swz();
+   full_src_register.SrcRegisterExtMod = tgsi_default_src_register_ext_mod();
+   full_src_register.SrcRegisterInd = tgsi_default_src_register();
+   full_src_register.SrcRegisterDim = tgsi_default_dimension();
+   full_src_register.SrcRegisterDimInd = tgsi_default_src_register();
+
+   return full_src_register;
+}
+
+struct tgsi_src_register_ext_swz
+tgsi_default_src_register_ext_swz( void )
+{
+   struct tgsi_src_register_ext_swz src_register_ext_swz;
+
+   src_register_ext_swz.Type = TGSI_SRC_REGISTER_EXT_TYPE_SWZ;
+   src_register_ext_swz.ExtSwizzleX = TGSI_EXTSWIZZLE_X;
+   src_register_ext_swz.ExtSwizzleY = TGSI_EXTSWIZZLE_Y;
+   src_register_ext_swz.ExtSwizzleZ = TGSI_EXTSWIZZLE_Z;
+   src_register_ext_swz.ExtSwizzleW = TGSI_EXTSWIZZLE_W;
+   src_register_ext_swz.NegateX = 0;
+   src_register_ext_swz.NegateY = 0;
+   src_register_ext_swz.NegateZ = 0;
+   src_register_ext_swz.NegateW = 0;
+   src_register_ext_swz.Padding = 0;
+   src_register_ext_swz.Extended = 0;
+
+   return src_register_ext_swz;
+}
+
+unsigned
+tgsi_compare_src_register_ext_swz(
+   struct tgsi_src_register_ext_swz a,
+   struct tgsi_src_register_ext_swz b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_src_register_ext_swz
+tgsi_build_src_register_ext_swz(
+   unsigned ext_swizzle_x,
+   unsigned ext_swizzle_y,
+   unsigned ext_swizzle_z,
+   unsigned ext_swizzle_w,
+   unsigned negate_x,
+   unsigned negate_y,
+   unsigned negate_z,
+   unsigned negate_w,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_src_register_ext_swz src_register_ext_swz;
+
+   assert( ext_swizzle_x <= TGSI_EXTSWIZZLE_ONE );
+   assert( ext_swizzle_y <= TGSI_EXTSWIZZLE_ONE );
+   assert( ext_swizzle_z <= TGSI_EXTSWIZZLE_ONE );
+   assert( ext_swizzle_w <= TGSI_EXTSWIZZLE_ONE );
+   assert( negate_x <= 1 );
+   assert( negate_y <= 1 );
+   assert( negate_z <= 1 );
+   assert( negate_w <= 1 );
+
+   src_register_ext_swz = tgsi_default_src_register_ext_swz();
+   src_register_ext_swz.ExtSwizzleX = ext_swizzle_x;
+   src_register_ext_swz.ExtSwizzleY = ext_swizzle_y;
+   src_register_ext_swz.ExtSwizzleZ = ext_swizzle_z;
+   src_register_ext_swz.ExtSwizzleW = ext_swizzle_w;
+   src_register_ext_swz.NegateX = negate_x;
+   src_register_ext_swz.NegateY = negate_y;
+   src_register_ext_swz.NegateZ = negate_z;
+   src_register_ext_swz.NegateW = negate_w;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return src_register_ext_swz;
+}
+
+struct tgsi_src_register_ext_mod
+tgsi_default_src_register_ext_mod( void )
+{
+   struct tgsi_src_register_ext_mod src_register_ext_mod;
+
+   src_register_ext_mod.Type = TGSI_SRC_REGISTER_EXT_TYPE_MOD;
+   src_register_ext_mod.Complement = 0;
+   src_register_ext_mod.Bias = 0;
+   src_register_ext_mod.Scale2X = 0;
+   src_register_ext_mod.Absolute = 0;
+   src_register_ext_mod.Negate = 0;
+   src_register_ext_mod.Padding = 0;
+   src_register_ext_mod.Extended = 0;
+
+   return src_register_ext_mod;
+}
+
+unsigned
+tgsi_compare_src_register_ext_mod(
+   struct tgsi_src_register_ext_mod a,
+   struct tgsi_src_register_ext_mod b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_src_register_ext_mod
+tgsi_build_src_register_ext_mod(
+   unsigned complement,
+   unsigned bias,
+   unsigned scale_2x,
+   unsigned absolute,
+   unsigned negate,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_src_register_ext_mod src_register_ext_mod;
+
+   assert( complement <= 1 );
+   assert( bias <= 1 );
+   assert( scale_2x <= 1 );
+   assert( absolute <= 1 );
+   assert( negate <= 1 );
+
+   src_register_ext_mod = tgsi_default_src_register_ext_mod();
+   src_register_ext_mod.Complement = complement;
+   src_register_ext_mod.Bias = bias;
+   src_register_ext_mod.Scale2X = scale_2x;
+   src_register_ext_mod.Absolute = absolute;
+   src_register_ext_mod.Negate = negate;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return src_register_ext_mod;
+}
+
+struct tgsi_dimension
+tgsi_default_dimension( void )
+{
+   struct tgsi_dimension dimension;
+
+   dimension.Indirect = 0;
+   dimension.Dimension = 0;
+   dimension.Padding = 0;
+   dimension.Index = 0;
+   dimension.Extended = 0;
+
+   return dimension;
+}
+
+struct tgsi_dimension
+tgsi_build_dimension(
+   unsigned indirect,
+   unsigned index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_dimension dimension;
+
+   dimension = tgsi_default_dimension();
+   dimension.Indirect = indirect;
+   dimension.Index = index;
+
+   instruction_grow( instruction, header );
+
+   return dimension;
+}
+
+struct tgsi_dst_register
+tgsi_default_dst_register( void )
+{
+   struct tgsi_dst_register dst_register;
+
+   dst_register.File = TGSI_FILE_NULL;
+   dst_register.WriteMask = TGSI_WRITEMASK_XYZW;
+   dst_register.Indirect = 0;
+   dst_register.Dimension = 0;
+   dst_register.Index = 0;
+   dst_register.Padding = 0;
+   dst_register.Extended = 0;
+
+   return dst_register;
+}
+
+struct tgsi_dst_register
+tgsi_build_dst_register(
+   unsigned file,
+   unsigned mask,
+   int index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_dst_register dst_register;
+
+   assert( file <= TGSI_FILE_IMMEDIATE );
+   assert( mask <= TGSI_WRITEMASK_XYZW );
+   assert( index >= -32768 && index <= 32767 );
+
+   dst_register = tgsi_default_dst_register();
+   dst_register.File = file;
+   dst_register.WriteMask = mask;
+   dst_register.Index = index;
+
+   instruction_grow( instruction, header );
+
+   return dst_register;
+}
+
+struct tgsi_full_dst_register
+tgsi_default_full_dst_register( void )
+{
+   struct tgsi_full_dst_register full_dst_register;
+
+   full_dst_register.DstRegister = tgsi_default_dst_register();
+   full_dst_register.DstRegisterExtConcode =
+      tgsi_default_dst_register_ext_concode();
+   full_dst_register.DstRegisterExtModulate =
+      tgsi_default_dst_register_ext_modulate();
+
+   return full_dst_register;
+}
+
+struct tgsi_dst_register_ext_concode
+tgsi_default_dst_register_ext_concode( void )
+{
+   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
+
+   dst_register_ext_concode.Type = TGSI_DST_REGISTER_EXT_TYPE_CONDCODE;
+   dst_register_ext_concode.CondMask = TGSI_CC_TR;
+   dst_register_ext_concode.CondSwizzleX = TGSI_SWIZZLE_X;
+   dst_register_ext_concode.CondSwizzleY = TGSI_SWIZZLE_Y;
+   dst_register_ext_concode.CondSwizzleZ = TGSI_SWIZZLE_Z;
+   dst_register_ext_concode.CondSwizzleW = TGSI_SWIZZLE_W;
+   dst_register_ext_concode.CondSrcIndex = 0;
+   dst_register_ext_concode.Padding = 0;
+   dst_register_ext_concode.Extended = 0;
+
+   return dst_register_ext_concode;
+}
+
+unsigned
+tgsi_compare_dst_register_ext_concode(
+   struct tgsi_dst_register_ext_concode a,
+   struct tgsi_dst_register_ext_concode b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_dst_register_ext_concode
+tgsi_build_dst_register_ext_concode(
+   unsigned cc,
+   unsigned swizzle_x,
+   unsigned swizzle_y,
+   unsigned swizzle_z,
+   unsigned swizzle_w,
+   int index,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
+
+   assert( cc <= TGSI_CC_FL );
+   assert( swizzle_x <= TGSI_SWIZZLE_W );
+   assert( swizzle_y <= TGSI_SWIZZLE_W );
+   assert( swizzle_z <= TGSI_SWIZZLE_W );
+   assert( swizzle_w <= TGSI_SWIZZLE_W );
+   assert( index >= -32768 && index <= 32767 );
+
+   dst_register_ext_concode = tgsi_default_dst_register_ext_concode();
+   dst_register_ext_concode.CondMask = cc;
+   dst_register_ext_concode.CondSwizzleX = swizzle_x;
+   dst_register_ext_concode.CondSwizzleY = swizzle_y;
+   dst_register_ext_concode.CondSwizzleZ = swizzle_z;
+   dst_register_ext_concode.CondSwizzleW = swizzle_w;
+   dst_register_ext_concode.CondSrcIndex = index;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return dst_register_ext_concode;
+}
+
+struct tgsi_dst_register_ext_modulate
+tgsi_default_dst_register_ext_modulate( void )
+{
+   struct tgsi_dst_register_ext_modulate dst_register_ext_modulate;
+
+   dst_register_ext_modulate.Type = TGSI_DST_REGISTER_EXT_TYPE_MODULATE;
+   dst_register_ext_modulate.Modulate = TGSI_MODULATE_1X;
+   dst_register_ext_modulate.Padding = 0;
+   dst_register_ext_modulate.Extended = 0;
+
+   return dst_register_ext_modulate;
+}
+
+unsigned
+tgsi_compare_dst_register_ext_modulate(
+   struct tgsi_dst_register_ext_modulate a,
+   struct tgsi_dst_register_ext_modulate b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_dst_register_ext_modulate
+tgsi_build_dst_register_ext_modulate(
+   unsigned modulate,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_dst_register_ext_modulate dst_register_ext_modulate;
+
+   assert( modulate <= TGSI_MODULATE_EIGHTH );
+
+   dst_register_ext_modulate = tgsi_default_dst_register_ext_modulate();
+   dst_register_ext_modulate.Modulate = modulate;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return dst_register_ext_modulate;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h
new file mode 100644
index 0000000000..ed25830248
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.h
@@ -0,0 +1,332 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_BUILD_H
+#define TGSI_BUILD_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/*
+ * version
+ */
+
+struct tgsi_version
+tgsi_build_version( void );
+
+/*
+ * header
+ */
+
+struct tgsi_header
+tgsi_build_header( void );
+
+struct tgsi_processor
+tgsi_default_processor( void );
+
+struct tgsi_processor
+tgsi_build_processor(
+   unsigned processor,
+   struct tgsi_header *header );
+
+/*
+ * declaration
+ */
+
+struct tgsi_declaration
+tgsi_default_declaration( void );
+
+struct tgsi_declaration
+tgsi_build_declaration(
+   unsigned file,
+   unsigned usage_mask,
+   unsigned interpolate,
+   unsigned semantic,
+   struct tgsi_header *header );
+
+struct tgsi_full_declaration
+tgsi_default_full_declaration( void );
+
+unsigned
+tgsi_build_full_declaration(
+   const struct tgsi_full_declaration *full_decl,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize );
+
+struct tgsi_declaration_range
+tgsi_default_declaration_range( void );
+
+struct tgsi_declaration_range
+tgsi_build_declaration_range(
+   unsigned first,
+   unsigned last,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header );
+
+struct tgsi_declaration_semantic
+tgsi_default_declaration_semantic( void );
+
+struct tgsi_declaration_semantic
+tgsi_build_declaration_semantic(
+   unsigned semantic_name,
+   unsigned semantic_index,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header );
+
+/*
+ * immediate
+ */
+
+struct tgsi_immediate
+tgsi_default_immediate( void );
+
+struct tgsi_immediate
+tgsi_build_immediate(
+   struct tgsi_header *header );
+
+struct tgsi_full_immediate
+tgsi_default_full_immediate( void );
+
+struct tgsi_immediate_float32
+tgsi_build_immediate_float32(
+   float value,
+   struct tgsi_immediate *immediate,
+   struct tgsi_header *header );
+
+unsigned
+tgsi_build_full_immediate(
+   const struct tgsi_full_immediate *full_imm,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize );
+
+/*
+ * instruction
+ */
+
+struct tgsi_instruction
+tgsi_default_instruction( void );
+
+struct tgsi_instruction
+tgsi_build_instruction(
+   unsigned opcode,
+   unsigned saturate,
+   unsigned num_dst_regs,
+   unsigned num_src_regs,
+   struct tgsi_header *header );
+
+struct tgsi_full_instruction
+tgsi_default_full_instruction( void );
+
+unsigned
+tgsi_build_full_instruction(
+   const struct tgsi_full_instruction *full_inst,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize );
+
+struct tgsi_instruction_ext_nv
+tgsi_default_instruction_ext_nv( void );
+
+unsigned
+tgsi_compare_instruction_ext_nv(
+   struct tgsi_instruction_ext_nv a,
+   struct tgsi_instruction_ext_nv b );
+
+struct tgsi_instruction_ext_nv
+tgsi_build_instruction_ext_nv(
+   unsigned precision,
+   unsigned cond_dst_index,
+   unsigned cond_flow_index,
+   unsigned cond_mask,
+   unsigned cond_swizzle_x,
+   unsigned cond_swizzle_y,
+   unsigned cond_swizzle_z,
+   unsigned cond_swizzle_w,
+   unsigned cond_dst_update,
+   unsigned cond_flow_update,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_instruction_ext_label
+tgsi_default_instruction_ext_label( void );
+
+unsigned
+tgsi_compare_instruction_ext_label(
+   struct tgsi_instruction_ext_label a,
+   struct tgsi_instruction_ext_label b );
+
+struct tgsi_instruction_ext_label
+tgsi_build_instruction_ext_label(
+   unsigned label,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_instruction_ext_texture
+tgsi_default_instruction_ext_texture( void );
+
+unsigned
+tgsi_compare_instruction_ext_texture(
+   struct tgsi_instruction_ext_texture a,
+   struct tgsi_instruction_ext_texture b );
+
+struct tgsi_instruction_ext_texture
+tgsi_build_instruction_ext_texture(
+   unsigned texture,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_src_register
+tgsi_default_src_register( void );
+
+struct tgsi_src_register
+tgsi_build_src_register(
+   unsigned file,
+   unsigned swizzle_x,
+   unsigned swizzle_y,
+   unsigned swizzle_z,
+   unsigned swizzle_w,
+   unsigned negate,
+   unsigned indirect,
+   unsigned dimension,
+   int index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_full_src_register
+tgsi_default_full_src_register( void );
+
+struct tgsi_src_register_ext_swz
+tgsi_default_src_register_ext_swz( void );
+
+unsigned
+tgsi_compare_src_register_ext_swz(
+   struct tgsi_src_register_ext_swz a,
+   struct tgsi_src_register_ext_swz b );
+
+struct tgsi_src_register_ext_swz
+tgsi_build_src_register_ext_swz(
+   unsigned ext_swizzle_x,
+   unsigned ext_swizzle_y,
+   unsigned ext_swizzle_z,
+   unsigned ext_swizzle_w,
+   unsigned negate_x,
+   unsigned negate_y,
+   unsigned negate_z,
+   unsigned negate_w,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_src_register_ext_mod
+tgsi_default_src_register_ext_mod( void );
+
+unsigned
+tgsi_compare_src_register_ext_mod(
+   struct tgsi_src_register_ext_mod a,
+   struct tgsi_src_register_ext_mod b );
+
+struct tgsi_src_register_ext_mod
+tgsi_build_src_register_ext_mod(
+   unsigned complement,
+   unsigned bias,
+   unsigned scale_2x,
+   unsigned absolute,
+   unsigned negate,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_dimension
+tgsi_default_dimension( void );
+
+struct tgsi_dimension
+tgsi_build_dimension(
+   unsigned indirect,
+   unsigned index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_dst_register
+tgsi_default_dst_register( void );
+
+struct tgsi_dst_register
+tgsi_build_dst_register(
+   unsigned file,
+   unsigned mask,
+   int index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_full_dst_register
+tgsi_default_full_dst_register( void );
+
+struct tgsi_dst_register_ext_concode
+tgsi_default_dst_register_ext_concode( void );
+
+unsigned
+tgsi_compare_dst_register_ext_concode(
+   struct tgsi_dst_register_ext_concode a,
+   struct tgsi_dst_register_ext_concode b );
+
+struct tgsi_dst_register_ext_concode
+tgsi_build_dst_register_ext_concode(
+   unsigned cc,
+   unsigned swizzle_x,
+   unsigned swizzle_y,
+   unsigned swizzle_z,
+   unsigned swizzle_w,
+   int index,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_dst_register_ext_modulate
+tgsi_default_dst_register_ext_modulate( void );
+
+unsigned
+tgsi_compare_dst_register_ext_modulate(
+   struct tgsi_dst_register_ext_modulate a,
+   struct tgsi_dst_register_ext_modulate b );
+
+struct tgsi_dst_register_ext_modulate
+tgsi_build_dst_register_ext_modulate(
+   unsigned modulate,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_BUILD_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
new file mode 100644
index 0000000000..d2e6375212
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -0,0 +1,582 @@
+/**************************************************************************
+ * 
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_debug.h"
+#include "tgsi_dump.h"
+#include "tgsi_iterate.h"
+
+struct dump_ctx
+{
+   struct tgsi_iterate_context iter;
+
+   uint instno;
+};
+
+static void
+dump_enum(
+   uint e,
+   const char **enums,
+   uint enum_count )
+{
+   if (e >= enum_count)
+      debug_printf( "%u", e );
+   else
+      debug_printf( "%s", enums[e] );
+}
+
+#define EOL()           debug_printf( "\n" )
+#define TXT(S)          debug_printf( "%s", S )
+#define CHR(C)          debug_printf( "%c", C )
+#define UIX(I)          debug_printf( "0x%x", I )
+#define UID(I)          debug_printf( "%u", I )
+#define SID(I)          debug_printf( "%d", I )
+#define FLT(F)          debug_printf( "%10.4f", F )
+#define ENM(E,ENUMS)    dump_enum( E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
+
+static const char *processor_type_names[] =
+{
+   "FRAG",
+   "VERT",
+   "GEOM"
+};
+
+static const char *file_names[] =
+{
+   "NULL",
+   "CONST",
+   "IN",
+   "OUT",
+   "TEMP",
+   "SAMP",
+   "ADDR",
+   "IMM"
+};
+
+static const char *interpolate_names[] =
+{
+   "CONSTANT",
+   "LINEAR",
+   "PERSPECTIVE"
+};
+
+static const char *semantic_names[] =
+{
+   "POSITION",
+   "COLOR",
+   "BCOLOR",
+   "FOG",
+   "PSIZE",
+   "GENERIC",
+   "NORMAL"
+};
+
+static const char *immediate_type_names[] =
+{
+   "FLT32"
+};
+
+static const char *opcode_names[TGSI_OPCODE_LAST] =
+{
+   "ARL",
+   "MOV",
+   "LIT",
+   "RCP",
+   "RSQ",
+   "EXP",
+   "LOG",
+   "MUL",
+   "ADD",
+   "DP3",
+   "DP4",
+   "DST",
+   "MIN",
+   "MAX",
+   "SLT",
+   "SGE",
+   "MAD",
+   "SUB",
+   "LERP",
+   "CND",
+   "CND0",
+   "DOT2ADD",
+   "INDEX",
+   "NEGATE",
+   "FRAC",
+   "CLAMP",
+   "FLOOR",
+   "ROUND",
+   "EXPBASE2",
+   "LOGBASE2",
+   "POWER",
+   "CROSSPRODUCT",
+   "MULTIPLYMATRIX",
+   "ABS",
+   "RCC",
+   "DPH",
+   "COS",
+   "DDX",
+   "DDY",
+   "KILP",
+   "PK2H",
+   "PK2US",
+   "PK4B",
+   "PK4UB",
+   "RFL",
+   "SEQ",
+   "SFL",
+   "SGT",
+   "SIN",
+   "SLE",
+   "SNE",
+   "STR",
+   "TEX",
+   "TXD",
+   "TXP",
+   "UP2H",
+   "UP2US",
+   "UP4B",
+   "UP4UB",
+   "X2D",
+   "ARA",
+   "ARR",
+   "BRA",
+   "CAL",
+   "RET",
+   "SSG",
+   "CMP",
+   "SCS",
+   "TXB",
+   "NRM",
+   "DIV",
+   "DP2",
+   "TXL",
+   "BRK",
+   "IF",
+   "LOOP",
+   "REP",
+   "ELSE",
+   "ENDIF",
+   "ENDLOOP",
+   "ENDREP",
+   "PUSHA",
+   "POPA",
+   "CEIL",
+   "I2F",
+   "NOT",
+   "TRUNC",
+   "SHL",
+   "SHR",
+   "AND",
+   "OR",
+   "MOD",
+   "XOR",
+   "SAD",
+   "TXF",
+   "TXQ",
+   "CONT",
+   "EMIT",
+   "ENDPRIM",
+   "BGNLOOP2",
+   "BGNSUB",
+   "ENDLOOP2",
+   "ENDSUB",
+   "NOISE1",
+   "NOISE2",
+   "NOISE3",
+   "NOISE4",
+   "NOP",
+   "M4X3",
+   "M3X4",
+   "M3X3",
+   "M3X2",
+   "NRM4",
+   "CALLNZ",
+   "IFC",
+   "BREAKC",
+   "KIL",
+   "END",
+   "SWZ"
+};
+
+static const char *swizzle_names[] =
+{
+   "x",
+   "y",
+   "z",
+   "w"
+};
+
+static const char *texture_names[] =
+{
+   "UNKNOWN",
+   "1D",
+   "2D",
+   "3D",
+   "CUBE",
+   "RECT",
+   "SHADOW1D",
+   "SHADOW2D",
+   "SHADOWRECT"
+};
+
+static const char *extswizzle_names[] =
+{
+   "x",
+   "y",
+   "z",
+   "w",
+   "0",
+   "1"
+};
+
+static const char *modulate_names[TGSI_MODULATE_COUNT] =
+{
+   "",
+   "_2X",
+   "_4X",
+   "_8X",
+   "_D2",
+   "_D4",
+   "_D8"
+};
+
+static void
+_dump_register_prefix(
+   uint file,
+   uint first,
+   uint last )
+{
+   
+   
+}
+
+static void
+_dump_register(
+   uint file,
+   int first,
+   int last )
+{
+   ENM( file, file_names );
+   CHR( '[' );
+   SID( first );
+   if (first != last) {
+      TXT( ".." );
+      SID( last );
+   }
+   CHR( ']' );
+}
+
+static void
+_dump_register_ind(
+   uint file,
+   int index,
+   uint ind_file,
+   int ind_index )
+{
+   ENM( file, file_names );
+   CHR( '[' );
+   ENM( ind_file, file_names );
+   CHR( '[' );
+   SID( ind_index );
+   CHR( ']' );
+   if (index != 0) {
+      if (index > 0)
+         CHR( '+' );
+      SID( index );
+   }
+   CHR( ']' );
+}
+
+static void
+_dump_writemask(
+   uint writemask )
+{
+   if (writemask != TGSI_WRITEMASK_XYZW) {
+      CHR( '.' );
+      if (writemask & TGSI_WRITEMASK_X)
+         CHR( 'x' );
+      if (writemask & TGSI_WRITEMASK_Y)
+         CHR( 'y' );
+      if (writemask & TGSI_WRITEMASK_Z)
+         CHR( 'z' );
+      if (writemask & TGSI_WRITEMASK_W)
+         CHR( 'w' );
+   }
+}
+
+void
+tgsi_dump_declaration(
+   const struct tgsi_full_declaration *decl )
+{
+   TXT( "\nDCL " );
+
+   _dump_register(
+      decl->Declaration.File,
+      decl->DeclarationRange.First,
+      decl->DeclarationRange.Last );
+   _dump_writemask(
+      decl->Declaration.UsageMask );
+
+   if (decl->Declaration.Semantic) {
+      TXT( ", " );
+      ENM( decl->Semantic.SemanticName, semantic_names );
+      if (decl->Semantic.SemanticIndex != 0 ||
+          decl->Semantic.SemanticName == TGSI_SEMANTIC_GENERIC) {
+         CHR( '[' );
+         UID( decl->Semantic.SemanticIndex );
+         CHR( ']' );
+      }
+   }
+
+   TXT( ", " );
+   ENM( decl->Declaration.Interpolate, interpolate_names );
+}
+
+static boolean
+iter_declaration(
+   struct tgsi_iterate_context *iter,
+   struct tgsi_full_declaration *decl )
+{
+   tgsi_dump_declaration( decl );
+   return TRUE;
+}
+
+void
+tgsi_dump_immediate(
+   const struct tgsi_full_immediate *imm )
+{
+   uint i;
+
+   TXT( "\nIMM " );
+   ENM( imm->Immediate.DataType, immediate_type_names );
+
+   TXT( " { " );
+   for (i = 0; i < imm->Immediate.Size - 1; i++) {
+      switch (imm->Immediate.DataType) {
+      case TGSI_IMM_FLOAT32:
+         FLT( imm->u.ImmediateFloat32[i].Float );
+         break;
+      default:
+         assert( 0 );
+      }
+
+      if (i < imm->Immediate.Size - 2)
+         TXT( ", " );
+   }
+   TXT( " }" );
+}
+
+static boolean
+iter_immediate(
+   struct tgsi_iterate_context *iter,
+   struct tgsi_full_immediate *imm )
+{
+   tgsi_dump_immediate( imm );
+   return TRUE;
+}
+
+void
+tgsi_dump_instruction(
+   const struct tgsi_full_instruction *inst,
+   uint instno )
+{
+   uint i;
+   boolean first_reg = TRUE;
+
+   EOL();
+   UID( instno );
+   CHR( ':' );
+   ENM( inst->Instruction.Opcode, opcode_names );
+
+   switch (inst->Instruction.Saturate) {
+   case TGSI_SAT_NONE:
+      break;
+   case TGSI_SAT_ZERO_ONE:
+      TXT( "_SAT" );
+      break;
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      TXT( "_SATNV" );
+      break;
+   default:
+      assert( 0 );
+   }
+
+   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
+      const struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
+
+      if (!first_reg)
+         CHR( ',' );
+      CHR( ' ' );
+
+      _dump_register(
+         dst->DstRegister.File,
+         dst->DstRegister.Index,
+         dst->DstRegister.Index );
+      ENM( dst->DstRegisterExtModulate.Modulate, modulate_names );
+      _dump_writemask( dst->DstRegister.WriteMask );
+
+      first_reg = FALSE;
+   }
+
+   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+      const struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
+
+      if (!first_reg)
+         CHR( ',' );
+      CHR( ' ' );
+
+      if (src->SrcRegisterExtMod.Negate)
+         TXT( "-(" );
+      if (src->SrcRegisterExtMod.Absolute)
+         CHR( '|' );
+      if (src->SrcRegisterExtMod.Scale2X)
+         TXT( "2*(" );
+      if (src->SrcRegisterExtMod.Bias)
+         CHR( '(' );
+      if (src->SrcRegisterExtMod.Complement)
+         TXT( "1-(" );
+      if (src->SrcRegister.Negate)
+         CHR( '-' );
+
+      if (src->SrcRegister.Indirect) {
+         _dump_register_ind(
+            src->SrcRegister.File,
+            src->SrcRegister.Index,
+            src->SrcRegisterInd.File,
+            src->SrcRegisterInd.Index );
+      }
+      else {
+         _dump_register(
+            src->SrcRegister.File,
+            src->SrcRegister.Index,
+            src->SrcRegister.Index );
+      }
+
+      if (src->SrcRegister.SwizzleX != TGSI_SWIZZLE_X ||
+          src->SrcRegister.SwizzleY != TGSI_SWIZZLE_Y ||
+          src->SrcRegister.SwizzleZ != TGSI_SWIZZLE_Z ||
+          src->SrcRegister.SwizzleW != TGSI_SWIZZLE_W) {
+         CHR( '.' );
+         ENM( src->SrcRegister.SwizzleX, swizzle_names );
+         ENM( src->SrcRegister.SwizzleY, swizzle_names );
+         ENM( src->SrcRegister.SwizzleZ, swizzle_names );
+         ENM( src->SrcRegister.SwizzleW, swizzle_names );
+      }
+      if (src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X ||
+          src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y ||
+          src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z ||
+          src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W) {
+         CHR( '.' );
+         if (src->SrcRegisterExtSwz.NegateX)
+            TXT("-");
+         ENM( src->SrcRegisterExtSwz.ExtSwizzleX, extswizzle_names );
+         if (src->SrcRegisterExtSwz.NegateY)
+            TXT("-");
+         ENM( src->SrcRegisterExtSwz.ExtSwizzleY, extswizzle_names );
+         if (src->SrcRegisterExtSwz.NegateZ)
+            TXT("-");
+         ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, extswizzle_names );
+         if (src->SrcRegisterExtSwz.NegateW)
+            TXT("-");
+         ENM( src->SrcRegisterExtSwz.ExtSwizzleW, extswizzle_names );
+      }
+
+      if (src->SrcRegisterExtMod.Complement)
+         CHR( ')' );
+      if (src->SrcRegisterExtMod.Bias)
+         TXT( ")-.5" );
+      if (src->SrcRegisterExtMod.Scale2X)
+         CHR( ')' );
+      if (src->SrcRegisterExtMod.Absolute)
+         CHR( '|' );
+      if (src->SrcRegisterExtMod.Negate)
+         CHR( ')' );
+
+      first_reg = FALSE;
+   }
+
+   if (inst->InstructionExtTexture.Texture != TGSI_TEXTURE_UNKNOWN) {
+      TXT( ", " );
+      ENM( inst->InstructionExtTexture.Texture, texture_names );
+   }
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_ELSE:
+   case TGSI_OPCODE_BGNLOOP2:
+   case TGSI_OPCODE_ENDLOOP2:
+   case TGSI_OPCODE_CAL:
+      TXT( " :" );
+      UID( inst->InstructionExtLabel.Label );
+      break;
+   }
+}
+
+static boolean
+iter_instruction(
+   struct tgsi_iterate_context *iter,
+   struct tgsi_full_instruction *inst )
+{
+   struct dump_ctx *ctx = (struct dump_ctx *) iter;
+
+   tgsi_dump_instruction( inst, ctx->instno++ );
+   return TRUE;
+}
+
+static boolean
+prolog(
+   struct tgsi_iterate_context *ctx )
+{
+   EOL();
+   ENM( ctx->processor.Processor, processor_type_names );
+   UID( ctx->version.MajorVersion );
+   CHR( '.' );
+   UID( ctx->version.MinorVersion );
+   return TRUE;
+}
+
+void
+tgsi_dump(
+   const struct tgsi_token *tokens,
+   uint flags )
+{
+   struct dump_ctx ctx;
+
+   /* sanity checks */
+   assert( strcmp( opcode_names[TGSI_OPCODE_CONT], "CONT" ) == 0 );
+   assert( strcmp( opcode_names[TGSI_OPCODE_END], "END" ) == 0 );
+
+   ctx.iter.prolog = prolog;
+   ctx.iter.iterate_instruction = iter_instruction;
+   ctx.iter.iterate_declaration = iter_declaration;
+   ctx.iter.iterate_immediate = iter_immediate;
+   ctx.iter.epilog = NULL;
+
+   ctx.instno = 0;
+
+   tgsi_iterate_shader( tokens, &ctx.iter );
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h
new file mode 100644
index 0000000000..51c230b5db
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h
@@ -0,0 +1,63 @@
+/**************************************************************************
+ * 
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_DUMP_H
+#define TGSI_DUMP_H
+
+#include "pipe/p_shader_tokens.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+void
+tgsi_dump(
+   const struct tgsi_token *tokens,
+   uint flags );
+
+struct tgsi_full_immediate;
+struct tgsi_full_instruction;
+struct tgsi_full_declaration;
+
+void
+tgsi_dump_immediate(
+   const struct tgsi_full_immediate *imm );
+
+void
+tgsi_dump_instruction(
+   const struct tgsi_full_instruction *inst,
+   uint instno );
+
+void
+tgsi_dump_declaration(
+   const struct tgsi_full_declaration *decl );
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_DUMP_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
new file mode 100644
index 0000000000..eabd74bd6d
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
@@ -0,0 +1,845 @@
+/**************************************************************************
+ * 
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+#include "util/u_string.h"
+#include "tgsi_dump_c.h"
+#include "tgsi_parse.h"
+#include "tgsi_build.h"
+
+static void
+dump_enum(
+   const unsigned    e,
+   const char        **enums,
+   const unsigned    enums_count )
+{
+   if (e >= enums_count) {
+      debug_printf( "%u", e );
+   }
+   else {
+      debug_printf( "%s", enums[e] );
+   }
+}
+
+#define EOL()           debug_printf( "\n" )
+#define TXT(S)          debug_printf( "%s", S )
+#define CHR(C)          debug_printf( "%c", C )
+#define UIX(I)          debug_printf( "0x%x", I )
+#define UID(I)          debug_printf( "%u", I )
+#define SID(I)          debug_printf( "%d", I )
+#define FLT(F)          debug_printf( "%10.4f", F )
+#define ENM(E,ENUMS)    dump_enum( E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
+
+static const char *TGSI_PROCESSOR_TYPES[] =
+{
+   "PROCESSOR_FRAGMENT",
+   "PROCESSOR_VERTEX",
+   "PROCESSOR_GEOMETRY"
+};
+
+static const char *TGSI_TOKEN_TYPES[] =
+{
+   "TOKEN_TYPE_DECLARATION",
+   "TOKEN_TYPE_IMMEDIATE",
+   "TOKEN_TYPE_INSTRUCTION"
+};
+
+static const char *TGSI_FILES[] =
+{
+   "FILE_NULL",
+   "FILE_CONSTANT",
+   "FILE_INPUT",
+   "FILE_OUTPUT",
+   "FILE_TEMPORARY",
+   "FILE_SAMPLER",
+   "FILE_ADDRESS",
+   "FILE_IMMEDIATE"
+};
+
+static const char *TGSI_INTERPOLATES[] =
+{
+   "INTERPOLATE_CONSTANT",
+   "INTERPOLATE_LINEAR",
+   "INTERPOLATE_PERSPECTIVE"
+};
+
+static const char *TGSI_SEMANTICS[] =
+{
+   "SEMANTIC_POSITION",
+   "SEMANTIC_COLOR",
+   "SEMANTIC_BCOLOR",
+   "SEMANTIC_FOG",
+   "SEMANTIC_PSIZE",
+   "SEMANTIC_GENERIC",
+   "SEMANTIC_NORMAL"
+};
+
+static const char *TGSI_IMMS[] =
+{
+   "IMM_FLOAT32"
+};
+
+static const char *TGSI_OPCODES[TGSI_OPCODE_LAST] =
+{
+   "OPCODE_ARL",
+   "OPCODE_MOV",
+   "OPCODE_LIT",
+   "OPCODE_RCP",
+   "OPCODE_RSQ",
+   "OPCODE_EXP",
+   "OPCODE_LOG",
+   "OPCODE_MUL",
+   "OPCODE_ADD",
+   "OPCODE_DP3",
+   "OPCODE_DP4",
+   "OPCODE_DST",
+   "OPCODE_MIN",
+   "OPCODE_MAX",
+   "OPCODE_SLT",
+   "OPCODE_SGE",
+   "OPCODE_MAD",
+   "OPCODE_SUB",
+   "OPCODE_LERP",
+   "OPCODE_CND",
+   "OPCODE_CND0",
+   "OPCODE_DOT2ADD",
+   "OPCODE_INDEX",
+   "OPCODE_NEGATE",
+   "OPCODE_FRAC",
+   "OPCODE_CLAMP",
+   "OPCODE_FLOOR",
+   "OPCODE_ROUND",
+   "OPCODE_EXPBASE2",
+   "OPCODE_LOGBASE2",
+   "OPCODE_POWER",
+   "OPCODE_CROSSPRODUCT",
+   "OPCODE_MULTIPLYMATRIX",
+   "OPCODE_ABS",
+   "OPCODE_RCC",
+   "OPCODE_DPH",
+   "OPCODE_COS",
+   "OPCODE_DDX",
+   "OPCODE_DDY",
+   "OPCODE_KILP",
+   "OPCODE_PK2H",
+   "OPCODE_PK2US",
+   "OPCODE_PK4B",
+   "OPCODE_PK4UB",
+   "OPCODE_RFL",
+   "OPCODE_SEQ",
+   "OPCODE_SFL",
+   "OPCODE_SGT",
+   "OPCODE_SIN",
+   "OPCODE_SLE",
+   "OPCODE_SNE",
+   "OPCODE_STR",
+   "OPCODE_TEX",
+   "OPCODE_TXD",
+   "OPCODE_TXP",
+   "OPCODE_UP2H",
+   "OPCODE_UP2US",
+   "OPCODE_UP4B",
+   "OPCODE_UP4UB",
+   "OPCODE_X2D",
+   "OPCODE_ARA",
+   "OPCODE_ARR",
+   "OPCODE_BRA",
+   "OPCODE_CAL",
+   "OPCODE_RET",
+   "OPCODE_SSG",
+   "OPCODE_CMP",
+   "OPCODE_SCS",
+   "OPCODE_TXB",
+   "OPCODE_NRM",
+   "OPCODE_DIV",
+   "OPCODE_DP2",
+   "OPCODE_TXL",
+   "OPCODE_BRK",
+   "OPCODE_IF",
+   "OPCODE_LOOP",
+   "OPCODE_REP",
+   "OPCODE_ELSE",
+   "OPCODE_ENDIF",
+   "OPCODE_ENDLOOP",
+   "OPCODE_ENDREP",
+   "OPCODE_PUSHA",
+   "OPCODE_POPA",
+   "OPCODE_CEIL",
+   "OPCODE_I2F",
+   "OPCODE_NOT",
+   "OPCODE_TRUNC",
+   "OPCODE_SHL",
+   "OPCODE_SHR",
+   "OPCODE_AND",
+   "OPCODE_OR",
+   "OPCODE_MOD",
+   "OPCODE_XOR",
+   "OPCODE_SAD",
+   "OPCODE_TXF",
+   "OPCODE_TXQ",
+   "OPCODE_CONT",
+   "OPCODE_EMIT",
+   "OPCODE_ENDPRIM",
+   "OPCODE_BGNLOOP2",
+   "OPCODE_BGNSUB",
+   "OPCODE_ENDLOOP2",
+   "OPCODE_ENDSUB",
+   "OPCODE_NOISE1",
+   "OPCODE_NOISE2",
+   "OPCODE_NOISE3",
+   "OPCODE_NOISE4",
+   "OPCODE_NOP",
+   "OPCODE_M4X3",
+   "OPCODE_M3X4",
+   "OPCODE_M3X3",
+   "OPCODE_M3X2",
+   "OPCODE_NRM4",
+   "OPCODE_CALLNZ",
+   "OPCODE_IFC",
+   "OPCODE_BREAKC",
+   "OPCODE_KIL",
+   "OPCODE_END"
+};
+
+static const char *TGSI_SATS[] =
+{
+   "SAT_NONE",
+   "SAT_ZERO_ONE",
+   "SAT_MINUS_PLUS_ONE"
+};
+
+static const char *TGSI_INSTRUCTION_EXTS[] =
+{
+   "INSTRUCTION_EXT_TYPE_NV",
+   "INSTRUCTION_EXT_TYPE_LABEL",
+   "INSTRUCTION_EXT_TYPE_TEXTURE"
+};
+
+static const char *TGSI_PRECISIONS[] =
+{
+   "PRECISION_DEFAULT",
+   "PRECISION_FLOAT32",
+   "PRECISION_FLOAT16",
+   "PRECISION_FIXED12"
+};
+
+static const char *TGSI_CCS[] =
+{
+   "CC_GT",
+   "CC_EQ",
+   "CC_LT",
+   "CC_UN",
+   "CC_GE",
+   "CC_LE",
+   "CC_NE",
+   "CC_TR",
+   "CC_FL"
+};
+
+static const char *TGSI_SWIZZLES[] =
+{
+   "SWIZZLE_X",
+   "SWIZZLE_Y",
+   "SWIZZLE_Z",
+   "SWIZZLE_W"
+};
+
+static const char *TGSI_TEXTURES[] =
+{
+   "TEXTURE_UNKNOWN",
+   "TEXTURE_1D",
+   "TEXTURE_2D",
+   "TEXTURE_3D",
+   "TEXTURE_CUBE",
+   "TEXTURE_RECT",
+   "TEXTURE_SHADOW1D",
+   "TEXTURE_SHADOW2D",
+   "TEXTURE_SHADOWRECT"
+};
+
+static const char *TGSI_SRC_REGISTER_EXTS[] =
+{
+   "SRC_REGISTER_EXT_TYPE_SWZ",
+   "SRC_REGISTER_EXT_TYPE_MOD"
+};
+
+static const char *TGSI_EXTSWIZZLES[] =
+{
+   "EXTSWIZZLE_X",
+   "EXTSWIZZLE_Y",
+   "EXTSWIZZLE_Z",
+   "EXTSWIZZLE_W",
+   "EXTSWIZZLE_ZERO",
+   "EXTSWIZZLE_ONE"
+};
+
+static const char *TGSI_WRITEMASKS[] =
+{
+   "0",
+   "WRITEMASK_X",
+   "WRITEMASK_Y",
+   "WRITEMASK_XY",
+   "WRITEMASK_Z",
+   "WRITEMASK_XZ",
+   "WRITEMASK_YZ",
+   "WRITEMASK_XYZ",
+   "WRITEMASK_W",
+   "WRITEMASK_XW",
+   "WRITEMASK_YW",
+   "WRITEMASK_XYW",
+   "WRITEMASK_ZW",
+   "WRITEMASK_XZW",
+   "WRITEMASK_YZW",
+   "WRITEMASK_XYZW"
+};
+
+static const char *TGSI_DST_REGISTER_EXTS[] =
+{
+   "DST_REGISTER_EXT_TYPE_CONDCODE",
+   "DST_REGISTER_EXT_TYPE_MODULATE"
+};
+
+static const char *TGSI_MODULATES[] =
+{
+   "MODULATE_1X",
+   "MODULATE_2X",
+   "MODULATE_4X",
+   "MODULATE_8X",
+   "MODULATE_HALF",
+   "MODULATE_QUARTER",
+   "MODULATE_EIGHTH"
+};
+
+static void
+dump_declaration_verbose(
+   struct tgsi_full_declaration  *decl,
+   unsigned                      ignored,
+   unsigned                      deflt,
+   struct tgsi_full_declaration  *fd )
+{
+   TXT( "\nFile       : " );
+   ENM( decl->Declaration.File, TGSI_FILES );
+   if( deflt || fd->Declaration.UsageMask != decl->Declaration.UsageMask ) {
+      TXT( "\nUsageMask  : " );
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_X ) {
+         CHR( 'X' );
+      }
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Y ) {
+         CHR( 'Y' );
+      }
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Z ) {
+         CHR( 'Z' );
+      }
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_W ) {
+         CHR( 'W' );
+      }
+   }
+   if( deflt || fd->Declaration.Interpolate != decl->Declaration.Interpolate ) {
+      TXT( "\nInterpolate: " );
+      ENM( decl->Declaration.Interpolate, TGSI_INTERPOLATES );
+   }
+   if( deflt || fd->Declaration.Semantic != decl->Declaration.Semantic ) {
+      TXT( "\nSemantic   : " );
+      UID( decl->Declaration.Semantic );
+   }
+   if( ignored ) {
+      TXT( "\nPadding    : " );
+      UIX( decl->Declaration.Padding );
+   }
+
+   EOL();
+   TXT( "\nFirst: " );
+   UID( decl->DeclarationRange.First );
+   TXT( "\nLast : " );
+   UID( decl->DeclarationRange.Last );
+
+   if( decl->Declaration.Semantic ) {
+      EOL();
+      TXT( "\nSemanticName : " );
+      ENM( decl->Semantic.SemanticName, TGSI_SEMANTICS );
+      TXT( "\nSemanticIndex: " );
+      UID( decl->Semantic.SemanticIndex );
+      if( ignored ) {
+         TXT( "\nPadding      : " );
+         UIX( decl->Semantic.Padding );
+      }
+   }
+}
+
+static void
+dump_immediate_verbose(
+   struct tgsi_full_immediate *imm,
+   unsigned                   ignored )
+{
+   unsigned i;
+
+   TXT( "\nDataType   : " );
+   ENM( imm->Immediate.DataType, TGSI_IMMS );
+   if( ignored ) {
+      TXT( "\nPadding    : " );
+      UIX( imm->Immediate.Padding );
+   }
+
+   for( i = 0; i < imm->Immediate.Size - 1; i++ ) {
+      EOL();
+      switch( imm->Immediate.DataType ) {
+      case TGSI_IMM_FLOAT32:
+         TXT( "\nFloat: " );
+         FLT( imm->u.ImmediateFloat32[i].Float );
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+}
+
+static void
+dump_instruction_verbose(
+   struct tgsi_full_instruction  *inst,
+   unsigned                      ignored,
+   unsigned                      deflt,
+   struct tgsi_full_instruction  *fi )
+{
+   unsigned i;
+
+   TXT( "\nOpcode     : " );
+   ENM( inst->Instruction.Opcode, TGSI_OPCODES );
+   if( deflt || fi->Instruction.Saturate != inst->Instruction.Saturate ) {
+      TXT( "\nSaturate   : " );
+      ENM( inst->Instruction.Saturate, TGSI_SATS );
+   }
+   if( deflt || fi->Instruction.NumDstRegs != inst->Instruction.NumDstRegs ) {
+      TXT( "\nNumDstRegs : " );
+      UID( inst->Instruction.NumDstRegs );
+   }
+   if( deflt || fi->Instruction.NumSrcRegs != inst->Instruction.NumSrcRegs ) {
+      TXT( "\nNumSrcRegs : " );
+      UID( inst->Instruction.NumSrcRegs );
+   }
+   if( ignored ) {
+      TXT( "\nPadding    : " );
+      UIX( inst->Instruction.Padding );
+   }
+
+   if( deflt || tgsi_compare_instruction_ext_nv( inst->InstructionExtNv, fi->InstructionExtNv ) ) {
+      EOL();
+      TXT( "\nType          : " );
+      ENM( inst->InstructionExtNv.Type, TGSI_INSTRUCTION_EXTS );
+      if( deflt || fi->InstructionExtNv.Precision != inst->InstructionExtNv.Precision ) {
+         TXT( "\nPrecision     : " );
+         ENM( inst->InstructionExtNv.Precision, TGSI_PRECISIONS );
+      }
+      if( deflt || fi->InstructionExtNv.CondDstIndex != inst->InstructionExtNv.CondDstIndex ) {
+         TXT( "\nCondDstIndex  : " );
+         UID( inst->InstructionExtNv.CondDstIndex );
+      }
+      if( deflt || fi->InstructionExtNv.CondFlowIndex != inst->InstructionExtNv.CondFlowIndex ) {
+         TXT( "\nCondFlowIndex : " );
+         UID( inst->InstructionExtNv.CondFlowIndex );
+      }
+      if( deflt || fi->InstructionExtNv.CondMask != inst->InstructionExtNv.CondMask ) {
+         TXT( "\nCondMask      : " );
+         ENM( inst->InstructionExtNv.CondMask, TGSI_CCS );
+      }
+      if( deflt || fi->InstructionExtNv.CondSwizzleX != inst->InstructionExtNv.CondSwizzleX ) {
+         TXT( "\nCondSwizzleX  : " );
+         ENM( inst->InstructionExtNv.CondSwizzleX, TGSI_SWIZZLES );
+      }
+      if( deflt || fi->InstructionExtNv.CondSwizzleY != inst->InstructionExtNv.CondSwizzleY ) {
+         TXT( "\nCondSwizzleY  : " );
+         ENM( inst->InstructionExtNv.CondSwizzleY, TGSI_SWIZZLES );
+      }
+      if( deflt || fi->InstructionExtNv.CondSwizzleZ != inst->InstructionExtNv.CondSwizzleZ ) {
+         TXT( "\nCondSwizzleZ  : " );
+         ENM( inst->InstructionExtNv.CondSwizzleZ, TGSI_SWIZZLES );
+      }
+      if( deflt || fi->InstructionExtNv.CondSwizzleW != inst->InstructionExtNv.CondSwizzleW ) {
+         TXT( "\nCondSwizzleW  : " );
+         ENM( inst->InstructionExtNv.CondSwizzleW, TGSI_SWIZZLES );
+      }
+      if( deflt || fi->InstructionExtNv.CondDstUpdate != inst->InstructionExtNv.CondDstUpdate ) {
+         TXT( "\nCondDstUpdate : " );
+         UID( inst->InstructionExtNv.CondDstUpdate );
+      }
+      if( deflt || fi->InstructionExtNv.CondFlowEnable != inst->InstructionExtNv.CondFlowEnable ) {
+         TXT( "\nCondFlowEnable: " );
+         UID( inst->InstructionExtNv.CondFlowEnable );
+      }
+      if( ignored ) {
+         TXT( "\nPadding       : " );
+         UIX( inst->InstructionExtNv.Padding );
+         if( deflt || fi->InstructionExtNv.Extended != inst->InstructionExtNv.Extended ) {
+            TXT( "\nExtended      : " );
+            UID( inst->InstructionExtNv.Extended );
+         }
+      }
+   }
+
+   if( deflt || tgsi_compare_instruction_ext_label( inst->InstructionExtLabel, fi->InstructionExtLabel ) ) {
+      EOL();
+      TXT( "\nType    : " );
+      ENM( inst->InstructionExtLabel.Type, TGSI_INSTRUCTION_EXTS );
+      if( deflt || fi->InstructionExtLabel.Label != inst->InstructionExtLabel.Label ) {
+         TXT( "\nLabel   : " );
+         UID( inst->InstructionExtLabel.Label );
+      }
+      if( ignored ) {
+         TXT( "\nPadding : " );
+         UIX( inst->InstructionExtLabel.Padding );
+         if( deflt || fi->InstructionExtLabel.Extended != inst->InstructionExtLabel.Extended ) {
+            TXT( "\nExtended: " );
+            UID( inst->InstructionExtLabel.Extended );
+         }
+      }
+   }
+
+   if( deflt || tgsi_compare_instruction_ext_texture( inst->InstructionExtTexture, fi->InstructionExtTexture ) ) {
+      EOL();
+      TXT( "\nType    : " );
+      ENM( inst->InstructionExtTexture.Type, TGSI_INSTRUCTION_EXTS );
+      if( deflt || fi->InstructionExtTexture.Texture != inst->InstructionExtTexture.Texture ) {
+         TXT( "\nTexture : " );
+         ENM( inst->InstructionExtTexture.Texture, TGSI_TEXTURES );
+      }
+      if( ignored ) {
+         TXT( "\nPadding : " );
+         UIX( inst->InstructionExtTexture.Padding );
+         if( deflt || fi->InstructionExtTexture.Extended != inst->InstructionExtTexture.Extended ) {
+            TXT( "\nExtended: " );
+            UID( inst->InstructionExtTexture.Extended );
+         }
+      }
+   }
+
+   for( i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
+      struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
+      struct tgsi_full_dst_register *fd = &fi->FullDstRegisters[i];
+
+      EOL();
+      TXT( "\nFile     : " );
+      ENM( dst->DstRegister.File, TGSI_FILES );
+      if( deflt || fd->DstRegister.WriteMask != dst->DstRegister.WriteMask ) {
+         TXT( "\nWriteMask: " );
+         ENM( dst->DstRegister.WriteMask, TGSI_WRITEMASKS );
+      }
+      if( ignored ) {
+         if( deflt || fd->DstRegister.Indirect != dst->DstRegister.Indirect ) {
+            TXT( "\nIndirect : " );
+            UID( dst->DstRegister.Indirect );
+         }
+         if( deflt || fd->DstRegister.Dimension != dst->DstRegister.Dimension ) {
+            TXT( "\nDimension: " );
+            UID( dst->DstRegister.Dimension );
+         }
+      }
+      if( deflt || fd->DstRegister.Index != dst->DstRegister.Index ) {
+         TXT( "\nIndex    : " );
+         SID( dst->DstRegister.Index );
+      }
+      if( ignored ) {
+         TXT( "\nPadding  : " );
+         UIX( dst->DstRegister.Padding );
+         if( deflt || fd->DstRegister.Extended != dst->DstRegister.Extended ) {
+            TXT( "\nExtended : " );
+            UID( dst->DstRegister.Extended );
+         }
+      }
+
+      if( deflt || tgsi_compare_dst_register_ext_concode( dst->DstRegisterExtConcode, fd->DstRegisterExtConcode ) ) {
+         EOL();
+         TXT( "\nType        : " );
+         ENM( dst->DstRegisterExtConcode.Type, TGSI_DST_REGISTER_EXTS );
+         if( deflt || fd->DstRegisterExtConcode.CondMask != dst->DstRegisterExtConcode.CondMask ) {
+            TXT( "\nCondMask    : " );
+            ENM( dst->DstRegisterExtConcode.CondMask, TGSI_CCS );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSwizzleX != dst->DstRegisterExtConcode.CondSwizzleX ) {
+            TXT( "\nCondSwizzleX: " );
+            ENM( dst->DstRegisterExtConcode.CondSwizzleX, TGSI_SWIZZLES );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSwizzleY != dst->DstRegisterExtConcode.CondSwizzleY ) {
+            TXT( "\nCondSwizzleY: " );
+            ENM( dst->DstRegisterExtConcode.CondSwizzleY, TGSI_SWIZZLES );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSwizzleZ != dst->DstRegisterExtConcode.CondSwizzleZ ) {
+            TXT( "\nCondSwizzleZ: " );
+            ENM( dst->DstRegisterExtConcode.CondSwizzleZ, TGSI_SWIZZLES );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSwizzleW != dst->DstRegisterExtConcode.CondSwizzleW ) {
+            TXT( "\nCondSwizzleW: " );
+            ENM( dst->DstRegisterExtConcode.CondSwizzleW, TGSI_SWIZZLES );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSrcIndex != dst->DstRegisterExtConcode.CondSrcIndex ) {
+            TXT( "\nCondSrcIndex: " );
+            UID( dst->DstRegisterExtConcode.CondSrcIndex );
+         }
+         if( ignored ) {
+            TXT( "\nPadding     : " );
+            UIX( dst->DstRegisterExtConcode.Padding );
+            if( deflt || fd->DstRegisterExtConcode.Extended != dst->DstRegisterExtConcode.Extended ) {
+               TXT( "\nExtended    : " );
+               UID( dst->DstRegisterExtConcode.Extended );
+            }
+         }
+      }
+
+      if( deflt || tgsi_compare_dst_register_ext_modulate( dst->DstRegisterExtModulate, fd->DstRegisterExtModulate ) ) {
+         EOL();
+         TXT( "\nType    : " );
+         ENM( dst->DstRegisterExtModulate.Type, TGSI_DST_REGISTER_EXTS );
+         if( deflt || fd->DstRegisterExtModulate.Modulate != dst->DstRegisterExtModulate.Modulate ) {
+            TXT( "\nModulate: " );
+            ENM( dst->DstRegisterExtModulate.Modulate, TGSI_MODULATES );
+         }
+         if( ignored ) {
+            TXT( "\nPadding : " );
+            UIX( dst->DstRegisterExtModulate.Padding );
+            if( deflt || fd->DstRegisterExtModulate.Extended != dst->DstRegisterExtModulate.Extended ) {
+               TXT( "\nExtended: " );
+               UID( dst->DstRegisterExtModulate.Extended );
+            }
+         }
+      }
+   }
+
+   for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
+      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
+      struct tgsi_full_src_register *fs = &fi->FullSrcRegisters[i];
+
+      EOL();
+      TXT( "\nFile     : ");
+      ENM( src->SrcRegister.File, TGSI_FILES );
+      if( deflt || fs->SrcRegister.SwizzleX != src->SrcRegister.SwizzleX ) {
+         TXT( "\nSwizzleX : " );
+         ENM( src->SrcRegister.SwizzleX, TGSI_SWIZZLES );
+      }
+      if( deflt || fs->SrcRegister.SwizzleY != src->SrcRegister.SwizzleY ) {
+         TXT( "\nSwizzleY : " );
+         ENM( src->SrcRegister.SwizzleY, TGSI_SWIZZLES );
+      }
+      if( deflt || fs->SrcRegister.SwizzleZ != src->SrcRegister.SwizzleZ ) {
+         TXT( "\nSwizzleZ : " );
+         ENM( src->SrcRegister.SwizzleZ, TGSI_SWIZZLES );
+      }
+      if( deflt || fs->SrcRegister.SwizzleW != src->SrcRegister.SwizzleW ) {
+         TXT( "\nSwizzleW : " );
+         ENM( src->SrcRegister.SwizzleW, TGSI_SWIZZLES );
+      }
+      if( deflt || fs->SrcRegister.Negate != src->SrcRegister.Negate ) {
+         TXT( "\nNegate   : " );
+         UID( src->SrcRegister.Negate );
+      }
+      if( ignored ) {
+         if( deflt || fs->SrcRegister.Indirect != src->SrcRegister.Indirect ) {
+            TXT( "\nIndirect : " );
+            UID( src->SrcRegister.Indirect );
+         }
+         if( deflt || fs->SrcRegister.Dimension != src->SrcRegister.Dimension ) {
+            TXT( "\nDimension: " );
+            UID( src->SrcRegister.Dimension );
+         }
+      }
+      if( deflt || fs->SrcRegister.Index != src->SrcRegister.Index ) {
+         TXT( "\nIndex    : " );
+         SID( src->SrcRegister.Index );
+      }
+      if( ignored ) {
+         if( deflt || fs->SrcRegister.Extended != src->SrcRegister.Extended ) {
+            TXT( "\nExtended : " );
+            UID( src->SrcRegister.Extended );
+         }
+      }
+
+      if( deflt || tgsi_compare_src_register_ext_swz( src->SrcRegisterExtSwz, fs->SrcRegisterExtSwz ) ) {
+         EOL();
+         TXT( "\nType       : " );
+         ENM( src->SrcRegisterExtSwz.Type, TGSI_SRC_REGISTER_EXTS );
+         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleX != src->SrcRegisterExtSwz.ExtSwizzleX ) {
+            TXT( "\nExtSwizzleX: " );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleX, TGSI_EXTSWIZZLES );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleY != src->SrcRegisterExtSwz.ExtSwizzleY ) {
+            TXT( "\nExtSwizzleY: " );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleY, TGSI_EXTSWIZZLES );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleZ != src->SrcRegisterExtSwz.ExtSwizzleZ ) {
+            TXT( "\nExtSwizzleZ: " );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, TGSI_EXTSWIZZLES );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleW != src->SrcRegisterExtSwz.ExtSwizzleW ) {
+            TXT( "\nExtSwizzleW: " );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleW, TGSI_EXTSWIZZLES );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.NegateX != src->SrcRegisterExtSwz.NegateX ) {
+            TXT( "\nNegateX   : " );
+            UID( src->SrcRegisterExtSwz.NegateX );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.NegateY != src->SrcRegisterExtSwz.NegateY ) {
+            TXT( "\nNegateY   : " );
+            UID( src->SrcRegisterExtSwz.NegateY );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.NegateZ != src->SrcRegisterExtSwz.NegateZ ) {
+            TXT( "\nNegateZ   : " );
+            UID( src->SrcRegisterExtSwz.NegateZ );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.NegateW != src->SrcRegisterExtSwz.NegateW ) {
+            TXT( "\nNegateW   : " );
+            UID( src->SrcRegisterExtSwz.NegateW );
+         }
+         if( ignored ) {
+            TXT( "\nPadding   : " );
+            UIX( src->SrcRegisterExtSwz.Padding );
+            if( deflt || fs->SrcRegisterExtSwz.Extended != src->SrcRegisterExtSwz.Extended ) {
+               TXT( "\nExtended   : " );
+               UID( src->SrcRegisterExtSwz.Extended );
+            }
+         }
+      }
+
+      if( deflt || tgsi_compare_src_register_ext_mod( src->SrcRegisterExtMod, fs->SrcRegisterExtMod ) ) {
+         EOL();
+         TXT( "\nType     : " );
+         ENM( src->SrcRegisterExtMod.Type, TGSI_SRC_REGISTER_EXTS );
+         if( deflt || fs->SrcRegisterExtMod.Complement != src->SrcRegisterExtMod.Complement ) {
+            TXT( "\nComplement: " );
+            UID( src->SrcRegisterExtMod.Complement );
+         }
+         if( deflt || fs->SrcRegisterExtMod.Bias != src->SrcRegisterExtMod.Bias ) {
+            TXT( "\nBias     : " );
+            UID( src->SrcRegisterExtMod.Bias );
+         }
+         if( deflt || fs->SrcRegisterExtMod.Scale2X != src->SrcRegisterExtMod.Scale2X ) {
+            TXT( "\nScale2X   : " );
+            UID( src->SrcRegisterExtMod.Scale2X );
+         }
+         if( deflt || fs->SrcRegisterExtMod.Absolute != src->SrcRegisterExtMod.Absolute ) {
+            TXT( "\nAbsolute  : " );
+            UID( src->SrcRegisterExtMod.Absolute );
+         }
+         if( deflt || fs->SrcRegisterExtMod.Negate != src->SrcRegisterExtMod.Negate ) {
+            TXT( "\nNegate   : " );
+            UID( src->SrcRegisterExtMod.Negate );
+         }
+         if( ignored ) {
+            TXT( "\nPadding   : " );
+            UIX( src->SrcRegisterExtMod.Padding );
+            if( deflt || fs->SrcRegisterExtMod.Extended != src->SrcRegisterExtMod.Extended ) {
+               TXT( "\nExtended  : " );
+               UID( src->SrcRegisterExtMod.Extended );
+            }
+         }
+      }
+   }
+}
+
+void
+tgsi_dump_c(
+   const struct tgsi_token *tokens,
+   uint flags )
+{
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction fi;
+   struct tgsi_full_declaration fd;
+   uint ignored = flags & TGSI_DUMP_C_IGNORED;
+   uint deflt = flags & TGSI_DUMP_C_DEFAULT;
+   uint instno = 0;
+
+   /* sanity checks */
+   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_CONT], "OPCODE_CONT") == 0);
+   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_END], "OPCODE_END") == 0);
+
+   tgsi_parse_init( &parse, tokens );
+
+   TXT( "tgsi-dump begin -----------------" );
+
+   TXT( "\nMajorVersion: " );
+   UID( parse.FullVersion.Version.MajorVersion );
+   TXT( "\nMinorVersion: " );
+   UID( parse.FullVersion.Version.MinorVersion );
+   EOL();
+
+   TXT( "\nHeaderSize: " );
+   UID( parse.FullHeader.Header.HeaderSize );
+   TXT( "\nBodySize  : " );
+   UID( parse.FullHeader.Header.BodySize );
+   TXT( "\nProcessor : " );
+   ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES );
+   EOL();
+
+   fi = tgsi_default_full_instruction();
+   fd = tgsi_default_full_declaration();
+
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      TXT( "\nType       : " );
+      ENM( parse.FullToken.Token.Type, TGSI_TOKEN_TYPES );
+      if( ignored ) {
+         TXT( "\nSize       : " );
+         UID( parse.FullToken.Token.Size );
+         if( deflt || parse.FullToken.Token.Extended ) {
+            TXT( "\nExtended   : " );
+            UID( parse.FullToken.Token.Extended );
+         }
+      }
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         dump_declaration_verbose(
+            &parse.FullToken.FullDeclaration,
+            ignored,
+            deflt,
+            &fd );
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         dump_immediate_verbose(
+            &parse.FullToken.FullImmediate,
+            ignored );
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         dump_instruction_verbose(
+            &parse.FullToken.FullInstruction,
+            ignored,
+            deflt,
+            &fi );
+         break;
+
+      default:
+         assert( 0 );
+      }
+
+      EOL();
+   }
+
+   TXT( "\ntgsi-dump end -------------------\n" );
+
+   tgsi_parse_free( &parse );
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.h b/src/gallium/auxiliary/tgsi/tgsi_dump_c.h
new file mode 100644
index 0000000000..d91cd35b3b
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump_c.h
@@ -0,0 +1,49 @@
+/**************************************************************************
+ * 
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_DUMP_C_H
+#define TGSI_DUMP_C_H
+
+#include "pipe/p_shader_tokens.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+#define TGSI_DUMP_C_IGNORED 1
+#define TGSI_DUMP_C_DEFAULT 2
+
+void
+tgsi_dump_c(
+   const struct tgsi_token *tokens,
+   uint flags );
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_DUMP_C_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
new file mode 100644
index 0000000000..8b430548bc
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -0,0 +1,2522 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI interpretor/executor.
+ *
+ * Flow control information:
+ *
+ * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+ * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+ * care since a condition may be true for some quad components but false
+ * for other components.
+ *
+ * We basically execute all statements (even if they're in the part of
+ * an IF/ELSE clause that's "not taken") and use a special mask to
+ * control writing to destination registers.  This is the ExecMask.
+ * See store_dest().
+ *
+ * The ExecMask is computed from three other masks (CondMask, LoopMask and
+ * ContMask) which are controlled by the flow control instructions (namely:
+ * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
+ *
+ *
+ * Authors:
+ *   Michal Krol
+ *   Brian Paul
+ */
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi_exec.h"
+
+#define TILE_TOP_LEFT     0
+#define TILE_TOP_RIGHT    1
+#define TILE_BOTTOM_LEFT  2
+#define TILE_BOTTOM_RIGHT 3
+
+/*
+ * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
+#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
+#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
+#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
+#define TEMP_R0            TGSI_EXEC_TEMP_R0
+
+#define FOR_EACH_CHANNEL(CHAN)\
+   for (CHAN = 0; CHAN < 4; CHAN++)
+
+#define IS_CHANNEL_ENABLED(INST, CHAN)\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IS_CHANNEL_ENABLED2(INST, CHAN)\
+   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+
+
+/** The execution mask depends on the conditional mask and the loop mask */
+#define UPDATE_EXEC_MASK(MACH) \
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+
+
+/**
+ * Initialize machine state by expanding tokens to full instructions,
+ * allocating temporary storage, setting up constants, etc.
+ * After this, we can call tgsi_exec_machine_run() many times.
+ */
+void 
+tgsi_exec_machine_bind_shader(
+   struct tgsi_exec_machine *mach,
+   const struct tgsi_token *tokens,
+   uint numSamplers,
+   struct tgsi_sampler *samplers)
+{
+   uint k;
+   struct tgsi_parse_context parse;
+   struct tgsi_exec_labels *labels = &mach->Labels;
+   struct tgsi_full_instruction *instructions;
+   struct tgsi_full_declaration *declarations;
+   uint maxInstructions = 10, numInstructions = 0;
+   uint maxDeclarations = 10, numDeclarations = 0;
+   uint instno = 0;
+
+#if 0
+   tgsi_dump(tokens, 0);
+#endif
+
+   mach->Tokens = tokens;
+   mach->Samplers = samplers;
+
+   k = tgsi_parse_init (&parse, mach->Tokens);
+   if (k != TGSI_PARSE_OK) {
+      debug_printf( "Problem parsing!\n" );
+      return;
+   }
+
+   mach->Processor = parse.FullHeader.Processor.Processor;
+   mach->ImmLimit = 0;
+   labels->count = 0;
+
+   declarations = (struct tgsi_full_declaration *)
+      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
+
+   if (!declarations) {
+      return;
+   }
+
+   instructions = (struct tgsi_full_instruction *)
+      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
+
+   if (!instructions) {
+      FREE( declarations );
+      return;
+   }
+
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      uint pointer = parse.Position;
+      uint i;
+
+      tgsi_parse_token( &parse );
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         /* save expanded declaration */
+         if (numDeclarations == maxDeclarations) {
+            declarations = REALLOC(declarations,
+                                   maxDeclarations
+                                   * sizeof(struct tgsi_full_declaration),
+                                   (maxDeclarations + 10)
+                                   * sizeof(struct tgsi_full_declaration));
+            maxDeclarations += 10;
+         }
+         memcpy(declarations + numDeclarations,
+                &parse.FullToken.FullDeclaration,
+                sizeof(declarations[0]));
+         numDeclarations++;
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         {
+            uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            assert( size % 4 == 0 );
+            assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
+
+            for( i = 0; i < size; i++ ) {
+               mach->Imms[mach->ImmLimit + i / 4][i % 4] = 
+		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+            }
+            mach->ImmLimit += size / 4;
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         assert( labels->count < MAX_LABELS );
+
+         labels->labels[labels->count][0] = instno;
+         labels->labels[labels->count][1] = pointer;
+         labels->count++;
+
+         /* save expanded instruction */
+         if (numInstructions == maxInstructions) {
+            instructions = REALLOC(instructions,
+                                   maxInstructions
+                                   * sizeof(struct tgsi_full_instruction),
+                                   (maxInstructions + 10)
+                                   * sizeof(struct tgsi_full_instruction));
+            maxInstructions += 10;
+         }
+         memcpy(instructions + numInstructions,
+                &parse.FullToken.FullInstruction,
+                sizeof(instructions[0]));
+         numInstructions++;
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+   tgsi_parse_free (&parse);
+
+   if (mach->Declarations) {
+      FREE( mach->Declarations );
+   }
+   mach->Declarations = declarations;
+   mach->NumDeclarations = numDeclarations;
+
+   if (mach->Instructions) {
+      FREE( mach->Instructions );
+   }
+   mach->Instructions = instructions;
+   mach->NumInstructions = numInstructions;
+}
+
+
+void
+tgsi_exec_machine_init(
+   struct tgsi_exec_machine *mach )
+{
+   uint i;
+
+   mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps);
+   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
+
+   /* Setup constants. */
+   for( i = 0; i < 4; i++ ) {
+      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
+      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
+      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
+      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
+      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
+      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
+      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
+      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
+      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
+      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
+   }
+}
+
+
+void
+tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach)
+{
+   if (mach->Instructions) {
+      FREE(mach->Instructions);
+      mach->Instructions = NULL;
+      mach->NumInstructions = 0;
+   }
+   if (mach->Declarations) {
+      FREE(mach->Declarations);
+      mach->Declarations = NULL;
+      mach->NumDeclarations = 0;
+   }
+}
+
+
+static void
+micro_abs(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = fabsf( src->f[0] );
+   dst->f[1] = fabsf( src->f[1] );
+   dst->f[2] = fabsf( src->f[2] );
+   dst->f[3] = fabsf( src->f[3] );
+}
+
+static void
+micro_add(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] + src1->f[0];
+   dst->f[1] = src0->f[1] + src1->f[1];
+   dst->f[2] = src0->f[2] + src1->f[2];
+   dst->f[3] = src0->f[3] + src1->f[3];
+}
+
+static void
+micro_iadd(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] + src1->i[0];
+   dst->i[1] = src0->i[1] + src1->i[1];
+   dst->i[2] = src0->i[2] + src1->i[2];
+   dst->i[3] = src0->i[3] + src1->i[3];
+}
+
+static void
+micro_and(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] & src1->u[0];
+   dst->u[1] = src0->u[1] & src1->u[1];
+   dst->u[2] = src0->u[2] & src1->u[2];
+   dst->u[3] = src0->u[3] & src1->u[3];
+}
+
+static void
+micro_ceil(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = ceilf( src->f[0] );
+   dst->f[1] = ceilf( src->f[1] );
+   dst->f[2] = ceilf( src->f[2] );
+   dst->f[3] = ceilf( src->f[3] );
+}
+
+static void
+micro_cos(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = cosf( src->f[0] );
+   dst->f[1] = cosf( src->f[1] );
+   dst->f[2] = cosf( src->f[2] );
+   dst->f[3] = cosf( src->f[3] );
+}
+
+static void
+micro_ddx(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_ddy(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_div(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] / src1->f[0];
+   dst->f[1] = src0->f[1] / src1->f[1];
+   dst->f[2] = src0->f[2] / src1->f[2];
+   dst->f[3] = src0->f[3] / src1->f[3];
+}
+
+static void
+micro_udiv(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] / src1->u[0];
+   dst->u[1] = src0->u[1] / src1->u[1];
+   dst->u[2] = src0->u[2] / src1->u[2];
+   dst->u[3] = src0->u[3] / src1->u[3];
+}
+
+static void
+micro_eq(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_ieq(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
+   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
+   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
+   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
+}
+
+static void
+micro_exp2(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src)
+{
+   dst->f[0] = powf( 2.0f, src->f[0] );
+   dst->f[1] = powf( 2.0f, src->f[1] );
+   dst->f[2] = powf( 2.0f, src->f[2] );
+   dst->f[3] = powf( 2.0f, src->f[3] );
+}
+
+static void
+micro_f2it(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->i[0] = (int) src->f[0];
+   dst->i[1] = (int) src->f[1];
+   dst->i[2] = (int) src->f[2];
+   dst->i[3] = (int) src->f[3];
+}
+
+static void
+micro_f2ut(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->u[0] = (uint) src->f[0];
+   dst->u[1] = (uint) src->f[1];
+   dst->u[2] = (uint) src->f[2];
+   dst->u[3] = (uint) src->f[3];
+}
+
+static void
+micro_flr(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = floorf( src->f[0] );
+   dst->f[1] = floorf( src->f[1] );
+   dst->f[2] = floorf( src->f[2] );
+   dst->f[3] = floorf( src->f[3] );
+}
+
+static void
+micro_frc(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = src->f[0] - floorf( src->f[0] );
+   dst->f[1] = src->f[1] - floorf( src->f[1] );
+   dst->f[2] = src->f[2] - floorf( src->f[2] );
+   dst->f[3] = src->f[3] - floorf( src->f[3] );
+}
+
+static void
+micro_ge(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_i2f(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) src->i[0];
+   dst->f[1] = (float) src->i[1];
+   dst->f[2] = (float) src->i[2];
+   dst->f[3] = (float) src->i[3];
+}
+
+static void
+micro_lg2(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = logf( src->f[0] ) * 1.442695f;
+   dst->f[1] = logf( src->f[1] ) * 1.442695f;
+   dst->f[2] = logf( src->f[2] ) * 1.442695f;
+   dst->f[3] = logf( src->f[3] ) * 1.442695f;
+}
+
+static void
+micro_le(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_lt(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_ilt(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
+   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
+   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
+   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
+}
+
+static void
+micro_ult(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
+   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
+   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
+   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
+}
+
+static void
+micro_max(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
+   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
+   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
+   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
+}
+
+static void
+micro_imax(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
+   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
+   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
+   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
+}
+
+static void
+micro_umax(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
+   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
+   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
+   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
+}
+
+static void
+micro_min(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
+   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
+   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
+   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
+}
+
+static void
+micro_imin(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
+   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
+   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
+   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
+}
+
+static void
+micro_umin(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
+   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
+   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
+   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
+}
+
+static void
+micro_umod(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] % src1->u[0];
+   dst->u[1] = src0->u[1] % src1->u[1];
+   dst->u[2] = src0->u[2] % src1->u[2];
+   dst->u[3] = src0->u[3] % src1->u[3];
+}
+
+static void
+micro_mul(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] * src1->f[0];
+   dst->f[1] = src0->f[1] * src1->f[1];
+   dst->f[2] = src0->f[2] * src1->f[2];
+   dst->f[3] = src0->f[3] * src1->f[3];
+}
+
+static void
+micro_imul(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] * src1->i[0];
+   dst->i[1] = src0->i[1] * src1->i[1];
+   dst->i[2] = src0->i[2] * src1->i[2];
+   dst->i[3] = src0->i[3] * src1->i[3];
+}
+
+static void
+micro_imul64(
+   union tgsi_exec_channel *dst0,
+   union tgsi_exec_channel *dst1,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst1->i[0] = src0->i[0] * src1->i[0];
+   dst1->i[1] = src0->i[1] * src1->i[1];
+   dst1->i[2] = src0->i[2] * src1->i[2];
+   dst1->i[3] = src0->i[3] * src1->i[3];
+   dst0->i[0] = 0;
+   dst0->i[1] = 0;
+   dst0->i[2] = 0;
+   dst0->i[3] = 0;
+}
+
+static void
+micro_umul64(
+   union tgsi_exec_channel *dst0,
+   union tgsi_exec_channel *dst1,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst1->u[0] = src0->u[0] * src1->u[0];
+   dst1->u[1] = src0->u[1] * src1->u[1];
+   dst1->u[2] = src0->u[2] * src1->u[2];
+   dst1->u[3] = src0->u[3] * src1->u[3];
+   dst0->u[0] = 0;
+   dst0->u[1] = 0;
+   dst0->u[2] = 0;
+   dst0->u[3] = 0;
+}
+
+static void
+micro_movc(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2 )
+{
+   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
+   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
+   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
+   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
+}
+
+static void
+micro_neg(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = -src->f[0];
+   dst->f[1] = -src->f[1];
+   dst->f[2] = -src->f[2];
+   dst->f[3] = -src->f[3];
+}
+
+static void
+micro_ineg(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->i[0] = -src->i[0];
+   dst->i[1] = -src->i[1];
+   dst->i[2] = -src->i[2];
+   dst->i[3] = -src->i[3];
+}
+
+static void
+micro_not(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->u[0] = ~src->u[0];
+   dst->u[1] = ~src->u[1];
+   dst->u[2] = ~src->u[2];
+   dst->u[3] = ~src->u[3];
+}
+
+static void
+micro_or(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] | src1->u[0];
+   dst->u[1] = src0->u[1] | src1->u[1];
+   dst->u[2] = src0->u[2] | src1->u[2];
+   dst->u[3] = src0->u[3] | src1->u[3];
+}
+
+static void
+micro_pow(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = powf( src0->f[0], src1->f[0] );
+   dst->f[1] = powf( src0->f[1], src1->f[1] );
+   dst->f[2] = powf( src0->f[2], src1->f[2] );
+   dst->f[3] = powf( src0->f[3], src1->f[3] );
+}
+
+static void
+micro_rnd(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = floorf( src->f[0] + 0.5f );
+   dst->f[1] = floorf( src->f[1] + 0.5f );
+   dst->f[2] = floorf( src->f[2] + 0.5f );
+   dst->f[3] = floorf( src->f[3] + 0.5f );
+}
+
+static void
+micro_shl(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] << src1->i[0];
+   dst->i[1] = src0->i[1] << src1->i[1];
+   dst->i[2] = src0->i[2] << src1->i[2];
+   dst->i[3] = src0->i[3] << src1->i[3];
+}
+
+static void
+micro_ishr(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] >> src1->i[0];
+   dst->i[1] = src0->i[1] >> src1->i[1];
+   dst->i[2] = src0->i[2] >> src1->i[2];
+   dst->i[3] = src0->i[3] >> src1->i[3];
+}
+
+static void
+micro_trunc(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0 )
+{
+   dst->f[0] = (float) (int) src0->f[0];
+   dst->f[1] = (float) (int) src0->f[1];
+   dst->f[2] = (float) (int) src0->f[2];
+   dst->f[3] = (float) (int) src0->f[3];
+}
+
+static void
+micro_ushr(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] >> src1->u[0];
+   dst->u[1] = src0->u[1] >> src1->u[1];
+   dst->u[2] = src0->u[2] >> src1->u[2];
+   dst->u[3] = src0->u[3] >> src1->u[3];
+}
+
+static void
+micro_sin(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = sinf( src->f[0] );
+   dst->f[1] = sinf( src->f[1] );
+   dst->f[2] = sinf( src->f[2] );
+   dst->f[3] = sinf( src->f[3] );
+}
+
+static void
+micro_sqrt( union tgsi_exec_channel *dst,
+            const union tgsi_exec_channel *src )
+{
+   dst->f[0] = sqrtf( src->f[0] );
+   dst->f[1] = sqrtf( src->f[1] );
+   dst->f[2] = sqrtf( src->f[2] );
+   dst->f[3] = sqrtf( src->f[3] );
+}
+
+static void
+micro_sub(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] - src1->f[0];
+   dst->f[1] = src0->f[1] - src1->f[1];
+   dst->f[2] = src0->f[2] - src1->f[2];
+   dst->f[3] = src0->f[3] - src1->f[3];
+}
+
+static void
+micro_u2f(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) src->u[0];
+   dst->f[1] = (float) src->u[1];
+   dst->f[2] = (float) src->u[2];
+   dst->f[3] = (float) src->u[3];
+}
+
+static void
+micro_xor(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] ^ src1->u[0];
+   dst->u[1] = src0->u[1] ^ src1->u[1];
+   dst->u[2] = src0->u[2] ^ src1->u[2];
+   dst->u[3] = src0->u[3] ^ src1->u[3];
+}
+
+static void
+fetch_src_file_channel(
+   const struct tgsi_exec_machine *mach,
+   const uint file,
+   const uint swizzle,
+   const union tgsi_exec_channel *index,
+   union tgsi_exec_channel *chan )
+{
+   switch( swizzle ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch( file ) {
+      case TGSI_FILE_CONSTANT:
+         chan->f[0] = mach->Consts[index->i[0]][swizzle];
+         chan->f[1] = mach->Consts[index->i[1]][swizzle];
+         chan->f[2] = mach->Consts[index->i[2]][swizzle];
+         chan->f[3] = mach->Consts[index->i[3]][swizzle];
+         break;
+
+      case TGSI_FILE_INPUT:
+         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_TEMPORARY:
+         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
+         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_IMMEDIATE:
+         assert( index->i[0] < (int) mach->ImmLimit );
+         chan->f[0] = mach->Imms[index->i[0]][swizzle];
+         assert( index->i[1] < (int) mach->ImmLimit );
+         chan->f[1] = mach->Imms[index->i[1]][swizzle];
+         assert( index->i[2] < (int) mach->ImmLimit );
+         chan->f[2] = mach->Imms[index->i[2]][swizzle];
+         assert( index->i[3] < (int) mach->ImmLimit );
+         chan->f[3] = mach->Imms[index->i[3]][swizzle];
+         break;
+
+      case TGSI_FILE_ADDRESS:
+         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         /* vertex/fragment output vars can be read too */
+         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+static void
+fetch_source(
+   const struct tgsi_exec_machine *mach,
+   union tgsi_exec_channel *chan,
+   const struct tgsi_full_src_register *reg,
+   const uint chan_index )
+{
+   union tgsi_exec_channel index;
+   uint swizzle;
+
+   index.i[0] =
+   index.i[1] =
+   index.i[2] =
+   index.i[3] = reg->SrcRegister.Index;
+
+   if (reg->SrcRegister.Indirect) {
+      union tgsi_exec_channel index2;
+      union tgsi_exec_channel indir_index;
+
+      index2.i[0] =
+      index2.i[1] =
+      index2.i[2] =
+      index2.i[3] = reg->SrcRegisterInd.Index;
+
+      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
+      fetch_src_file_channel(
+         mach,
+         reg->SrcRegisterInd.File,
+         swizzle,
+         &index2,
+         &indir_index );
+
+      index.i[0] += indir_index.i[0];
+      index.i[1] += indir_index.i[1];
+      index.i[2] += indir_index.i[2];
+      index.i[3] += indir_index.i[3];
+   }
+
+   if( reg->SrcRegister.Dimension ) {
+      switch( reg->SrcRegister.File ) {
+      case TGSI_FILE_INPUT:
+         index.i[0] *= 17;
+         index.i[1] *= 17;
+         index.i[2] *= 17;
+         index.i[3] *= 17;
+         break;
+      case TGSI_FILE_CONSTANT:
+         index.i[0] *= 4096;
+         index.i[1] *= 4096;
+         index.i[2] *= 4096;
+         index.i[3] *= 4096;
+         break;
+      default:
+         assert( 0 );
+      }
+
+      index.i[0] += reg->SrcRegisterDim.Index;
+      index.i[1] += reg->SrcRegisterDim.Index;
+      index.i[2] += reg->SrcRegisterDim.Index;
+      index.i[3] += reg->SrcRegisterDim.Index;
+
+      if (reg->SrcRegisterDim.Indirect) {
+         union tgsi_exec_channel index2;
+         union tgsi_exec_channel indir_index;
+
+         index2.i[0] =
+         index2.i[1] =
+         index2.i[2] =
+         index2.i[3] = reg->SrcRegisterDimInd.Index;
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
+         fetch_src_file_channel(
+            mach,
+            reg->SrcRegisterDimInd.File,
+            swizzle,
+            &index2,
+            &indir_index );
+
+         index.i[0] += indir_index.i[0];
+         index.i[1] += indir_index.i[1];
+         index.i[2] += indir_index.i[2];
+         index.i[3] += indir_index.i[3];
+      }
+   }
+
+   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   fetch_src_file_channel(
+      mach,
+      reg->SrcRegister.File,
+      swizzle,
+      &index,
+      chan );
+
+   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      micro_abs( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      micro_abs( chan, chan );
+      micro_neg( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      micro_neg( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+
+   if (reg->SrcRegisterExtMod.Complement) {
+      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
+   }
+}
+
+static void
+store_dest(
+   struct tgsi_exec_machine *mach,
+   const union tgsi_exec_channel *chan,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   uint chan_index )
+{
+   union tgsi_exec_channel *dst;
+
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_NULL:
+      return;
+
+   case TGSI_FILE_OUTPUT:
+      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+                           + reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      assert(reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS);
+      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   switch (inst->Instruction.Saturate)
+   {
+   case TGSI_SAT_NONE:
+      if (mach->ExecMask & 0x1)
+         dst->i[0] = chan->i[0];
+      if (mach->ExecMask & 0x2)
+         dst->i[1] = chan->i[1];
+      if (mach->ExecMask & 0x4)
+         dst->i[2] = chan->i[2];
+      if (mach->ExecMask & 0x8)
+         dst->i[3] = chan->i[3];
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* XXX need to obey ExecMask here */
+      micro_max(dst, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
+      micro_min(dst, dst, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+#define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+
+#define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+
+
+/**
+ * Execute ARB-style KIL which is predicated by a src register.
+ * Kill fragment if any of the four values is less than zero.
+ */
+static void
+exec_kilp(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint uniquemask;
+   uint chan_index;
+   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+   union tgsi_exec_channel r[1];
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   for (chan_index = 0; chan_index < 4; chan_index++)
+   {
+      uint swizzle;
+      uint i;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle (
+                        &inst->FullSrcRegisters[0],
+                        chan_index);
+
+      /* check if the component has not been already tested */
+      if (uniquemask & (1 << swizzle))
+         continue;
+      uniquemask |= 1 << swizzle;
+
+      FETCH(&r[0], 0, chan_index);
+      for (i = 0; i < 4; i++)
+         if (r[0].f[i] < 0.0f)
+            kilmask |= 1 << i;
+   }
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
+
+
+/*
+ * Fetch a texel using STR texture coordinates.
+ */
+static void
+fetch_texel( struct tgsi_sampler *sampler,
+             const union tgsi_exec_channel *s,
+             const union tgsi_exec_channel *t,
+             const union tgsi_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union tgsi_exec_channel *r,
+             union tgsi_exec_channel *g,
+             union tgsi_exec_channel *b,
+             union tgsi_exec_channel *a )
+{
+   uint j;
+   float rgba[NUM_CHANNELS][QUAD_SIZE];
+
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
+
+   for (j = 0; j < 4; j++) {
+      r->f[j] = rgba[0][j];
+      g->f[j] = rgba[1][j];
+      b->f[j] = rgba[2][j];
+      a->f[j] = rgba[3][j];
+   }
+}
+
+
+static void
+exec_tex(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst,
+         boolean biasLod,
+         boolean projected)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   union tgsi_exec_channel r[8];
+   uint chan_index;
+   float lodBias;
+
+   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      if (projected) {
+         FETCH(&r[1], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[1] );
+      }
+
+      if (biasLod) {
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[2].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      if (projected) {
+         FETCH(&r[3], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[3] );
+         micro_div( &r[1], &r[1], &r[3] );
+         micro_div( &r[2], &r[2], &r[3] );
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      if (projected) {
+         FETCH(&r[3], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[3] );
+         micro_div( &r[1], &r[1], &r[3] );
+         micro_div( &r[2], &r[2], &r[3] );
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,
+                  &r[0], &r[1], &r[2], &r[3]);
+      break;
+
+   default:
+      assert (0);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+/**
+ * Evaluate a constant-valued coefficient at the position of the
+ * current quad.
+ */
+static void
+eval_constant_coef(
+   struct tgsi_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   unsigned i;
+
+   for( i = 0; i < QUAD_SIZE; i++ ) {
+      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
+   }
+}
+
+/**
+ * Evaluate a linear-valued coefficient at the position of the
+ * current quad.
+ */
+static void
+eval_linear_coef(
+   struct tgsi_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
+   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
+   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
+   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
+}
+
+/**
+ * Evaluate a perspective-valued coefficient at the position of the
+ * current quad.
+ */
+static void
+eval_perspective_coef(
+   struct tgsi_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   const float *w = mach->QuadPos.xyzw[3].f;
+   /* divide by W here */
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
+   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
+   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
+   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
+}
+
+
+typedef void (* eval_coef_func)(
+   struct tgsi_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan );
+
+static void
+exec_declaration(
+   struct tgsi_exec_machine *mach,
+   const struct tgsi_full_declaration *decl )
+{
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+         unsigned first, last, mask;
+         eval_coef_func eval;
+
+         first = decl->DeclarationRange.First;
+         last = decl->DeclarationRange.Last;
+         mask = decl->Declaration.UsageMask;
+
+         switch( decl->Declaration.Interpolate ) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            eval = eval_constant_coef;
+            break;
+
+         case TGSI_INTERPOLATE_LINEAR:
+            eval = eval_linear_coef;
+            break;
+
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            eval = eval_perspective_coef;
+            break;
+
+         default:
+            assert( 0 );
+         }
+
+         if( mask == TGSI_WRITEMASK_XYZW ) {
+            unsigned i, j;
+
+            for( i = first; i <= last; i++ ) {
+               for( j = 0; j < NUM_CHANNELS; j++ ) {
+                  eval( mach, i, j );
+               }
+            }
+         }
+         else {
+            unsigned i, j;
+
+            for( j = 0; j < NUM_CHANNELS; j++ ) {
+               if( mask & (1 << j) ) {
+                  for( i = first; i <= last; i++ ) {
+                     eval( mach, i, j );
+                  }
+               }
+            }
+         }
+      }
+   }
+}
+
+static void
+exec_instruction(
+   struct tgsi_exec_machine *mach,
+   const struct tgsi_full_instruction *inst,
+   int *pc )
+{
+   uint chan_index;
+   union tgsi_exec_channel r[8];
+
+   (*pc)++;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 FETCH( &r[0], 0, chan_index );
+	 micro_f2it( &r[0], &r[0] );
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_X );
+	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+	    STORE( &r[0], 0, CHAN_Y );
+	 }
+
+	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	    FETCH( &r[1], 0, CHAN_Y );
+	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+
+	    FETCH( &r[2], 0, CHAN_W );
+	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
+	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
+	    micro_pow( &r[1], &r[1], &r[2] );
+	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+	    STORE( &r[0], 0, CHAN_Z );
+	 }
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_sqrt( &r[0], &r[0] );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      FETCH( &r[0], 0, CHAN_X );
+      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
+	 STORE( &r[2], 0, CHAN_X );        /* store r2 */
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
+	 STORE( &r[2], 0, CHAN_Y );        /* store r2 */
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
+	 STORE( &r[2], 0, CHAN_Z );        /* store r2 */
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_LOG:
+      FETCH( &r[0], 0, CHAN_X );
+      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
+      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
+      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &r[0], 0, CHAN_X );
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
+         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 STORE( &r[1], 0, CHAN_Z );
+      }
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
+      {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_mul( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Z );
+      FETCH( &r[2], 1, CHAN_Z );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+    case TGSI_OPCODE_DP4:
+    /* TGSI_OPCODE_DOT4 */
+       FETCH(&r[0], 0, CHAN_X);
+       FETCH(&r[1], 1, CHAN_X);
+
+       micro_mul( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_Y);
+       FETCH(&r[2], 1, CHAN_Y);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_Z);
+       FETCH(&r[2], 1, CHAN_Z);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_W);
+       FETCH(&r[2], 1, CHAN_W);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	 FETCH( &r[0], 0, CHAN_Y );
+	 FETCH( &r[1], 1, CHAN_Y);
+	 micro_mul( &r[0], &r[0], &r[1] );
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_Z );
+	 STORE( &r[0], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 FETCH( &r[0], 1, CHAN_W );
+	 STORE( &r[0], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_min()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_max()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
+
+         STORE(&r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_mul( &r[0], &r[0], &r[1] );
+         FETCH( &r[1], 2, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_sub( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_sub( &r[1], &r[1], &r[2] );
+         micro_mul( &r[0], &r[0], &r[1] );
+         micro_add( &r[0], &r[0], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CND0:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+      /* TGSI_OPCODE_DP2A */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_frc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_flr( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_rnd( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+    /* TGSI_OPCODE_EX2 */
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_lg2( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+      /* TGSI_OPCODE_POW */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_pow( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+
+      micro_mul( &r[2], &r[0], &r[1] );
+
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      micro_mul( &r[5], &r[3], &r[4] );
+      micro_sub( &r[2], &r[2], &r[5] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+
+      micro_mul( &r[3], &r[3], &r[2] );
+
+      FETCH(&r[5], 0, CHAN_X);
+
+      micro_mul( &r[1], &r[1], &r[5] );
+      micro_sub( &r[3], &r[3], &r[1] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      micro_mul( &r[5], &r[5], &r[4] );
+      micro_mul( &r[0], &r[0], &r[2] );
+      micro_sub( &r[5], &r[5], &r[0] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+    case TGSI_OPCODE_MULTIPLYMATRIX:
+       assert (0);
+       break;
+
+    case TGSI_OPCODE_ABS:
+       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+          FETCH(&r[0], 0, chan_index);
+
+          micro_abs( &r[0], &r[0] );
+
+          STORE(&r[0], 0, chan_index);
+       }
+       break;
+
+   case TGSI_OPCODE_RCC:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_cos( &r[0], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddx( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddy( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      exec_kilp (mach, inst);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* for enabled ExecMask bits, set the killed bit */
+      mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask;
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_RFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1],
+                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
+                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( &r[0], 0, CHAN_X );
+      micro_sin( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* simple texture lookup */
+      /* src[0] = texcoord */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* Texture lookup with lod bias */
+      /* src[0] = texcoord (src[0].w = LOD bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* Texture lookup with explict partial derivatives */
+      /* src[0] = texcoord */
+      /* src[1] = d[strq]/dx */
+      /* src[2] = d[strq]/dy */
+      /* src[3] = sampler unit */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      /* Texture lookup with explit LOD */
+      /* src[0] = texcoord (src[0].w = LOD) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXP:
+      /* Texture lookup with projection */
+      /* src[0] = texcoord (src[0].w = projection) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE, TRUE);
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_X2D:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_BRA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* skip the call if no execution channels are enabled */
+      if (mach->ExecMask) {
+         /* do the call */
+
+         /* push the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+         /* note that PC was already incremented above */
+         mach->CallStack[mach->CallStackTop++] = *pc;
+         *pc = inst->InstructionExtLabel.Label;
+      }
+      break;
+
+   case TGSI_OPCODE_RET:
+      mach->FuncMask &= ~mach->ExecMask;
+      UPDATE_EXEC_MASK(mach);
+
+      if (mach->ExecMask == 0x0) {
+         /* really return now (otherwise, keep executing */
+
+         if (mach->CallStackTop == 0) {
+            /* returning from main() */
+            *pc = -1;
+            return;
+         }
+         *pc = mach->CallStack[--mach->CallStackTop];
+
+         /* pop the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop > 0);
+         mach->CondMask = mach->CondStack[--mach->CondStackTop];
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+         assert(mach->FuncStackTop > 0);
+         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+         UPDATE_EXEC_MASK(mach);
+      }
+      break;
+
+   case TGSI_OPCODE_SSG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( &r[0], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+         micro_cos( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         micro_sin( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_Y );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_NRM:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DIV:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_DP2:
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* push CondMask */
+      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+      FETCH( &r[0], 0, CHAN_X );
+      /* update CondMask */
+      if( ! r[0].u[0] ) {
+         mach->CondMask &= ~0x1;
+      }
+      if( ! r[0].u[1] ) {
+         mach->CondMask &= ~0x2;
+      }
+      if( ! r[0].u[2] ) {
+         mach->CondMask &= ~0x4;
+      }
+      if( ! r[0].u[3] ) {
+         mach->CondMask &= ~0x8;
+      }
+      UPDATE_EXEC_MASK(mach);
+      /* Todo: If CondMask==0, jump to ELSE */
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* invert CondMask wrt previous mask */
+      {
+         uint prevMask;
+         assert(mach->CondStackTop > 0);
+         prevMask = mach->CondStack[mach->CondStackTop - 1];
+         mach->CondMask = ~mach->CondMask & prevMask;
+         UPDATE_EXEC_MASK(mach);
+         /* Todo: If CondMask==0, jump to ENDIF */
+      }
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* pop CondMask */
+      assert(mach->CondStackTop > 0);
+      mach->CondMask = mach->CondStack[--mach->CondStackTop];
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* halt execution */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_REP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+       assert (0);
+       break;
+
+   case TGSI_OPCODE_PUSHA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_POPA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ceil( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_i2f( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_NOT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_not( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_trunc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_shl( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ishr( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_AND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_and( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_OR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_or( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_XOR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_xor( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SAD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXF:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      /* fall-through (for now) */
+   case TGSI_OPCODE_BGNLOOP2:
+      /* push LoopMask and ContMasks */
+      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      /* fall-through (for now at least) */
+   case TGSI_OPCODE_ENDLOOP2:
+      /* Restore ContMask, but don't pop */
+      assert(mach->ContStackTop > 0);
+      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
+      UPDATE_EXEC_MASK(mach);
+      if (mach->ExecMask) {
+         /* repeat loop: jump to instruction just past BGNLOOP */
+         *pc = inst->InstructionExtLabel.Label + 1;
+      }
+      else {
+         /* exit loop: pop LoopMask */
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         /* pop ContMask */
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+      }
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BRK:
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* turn off cont channels for each enabled exec channel */
+      mach->ContMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BGNSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_NOISE1:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE2:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE3:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE4:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+
+/**
+ * Run TGSI interpreter.
+ * \return bitmask of "alive" quad components
+ */
+uint
+tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
+{
+   uint i;
+   int pc = 0;
+
+   mach->CondMask = 0xf;
+   mach->LoopMask = 0xf;
+   mach->ContMask = 0xf;
+   mach->FuncMask = 0xf;
+   mach->ExecMask = 0xf;
+
+   mach->CondStackTop = 0; /* temporarily subvert this assertion */
+   assert(mach->CondStackTop == 0);
+   assert(mach->LoopStackTop == 0);
+   assert(mach->ContStackTop == 0);
+   assert(mach->CallStackTop == 0);
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
+   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
+
+   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
+      mach->Primitives[0] = 0;
+   }
+
+
+   /* execute declarations (interpolants) */
+   for (i = 0; i < mach->NumDeclarations; i++) {
+      exec_declaration( mach, mach->Declarations+i );
+   }
+
+   /* execute instructions, until pc is set to -1 */
+   while (pc != -1) {
+      assert(pc < (int) mach->NumInstructions);
+      exec_instruction( mach, mach->Instructions + pc, &pc );
+   }
+
+#if 0
+   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
+   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /*
+       * Scale back depth component.
+       */
+      for (i = 0; i < 4; i++)
+         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
+   }
+#endif
+
+   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+}
+
+
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
new file mode 100644
index 0000000000..4f30650b07
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -0,0 +1,253 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#if !defined TGSI_EXEC_H
+#define TGSI_EXEC_H
+
+#include "pipe/p_compiler.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LABELS 1024
+
+#define NUM_CHANNELS 4  /* R,G,B,A */
+#define QUAD_SIZE    4  /* 4 pixel/quad */
+
+/**
+  * Registers may be treated as float, signed int or unsigned int.
+  */
+union tgsi_exec_channel
+{
+   float    f[QUAD_SIZE];
+   int      i[QUAD_SIZE];
+   unsigned u[QUAD_SIZE];
+};
+
+/**
+  * A vector[RGBA] of channels[4 pixels]
+  */
+struct tgsi_exec_vector
+{
+   union tgsi_exec_channel xyzw[NUM_CHANNELS];
+};
+
+/**
+ * For fragment programs, information for computing fragment input
+ * values from plane equation of the triangle/line.
+ */
+struct tgsi_interp_coef
+{
+   float a0[NUM_CHANNELS];	/* in an xyzw layout */
+   float dadx[NUM_CHANNELS];
+   float dady[NUM_CHANNELS];
+};
+
+
+struct softpipe_tile_cache;  /**< Opaque to TGSI */
+
+/**
+ * Information for sampling textures, which must be implemented
+ * by code outside the TGSI executor.
+ */
+struct tgsi_sampler
+{
+   const struct pipe_sampler_state *state;
+   struct pipe_texture *texture;
+   /** Get samples for four fragments in a quad */
+   void (*get_samples)(struct tgsi_sampler *sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE]);
+   void *pipe; /*XXX temporary*/
+   struct softpipe_tile_cache *cache;
+};
+
+/**
+ * For branching/calling subroutines.
+ */
+struct tgsi_exec_labels
+{
+   unsigned labels[MAX_LABELS][2];
+   unsigned count;
+};
+
+
+#define TGSI_EXEC_NUM_TEMPS       128
+#define TGSI_EXEC_NUM_TEMP_EXTRAS   6
+#define TGSI_EXEC_NUM_IMMEDIATES  256
+
+/*
+ * Locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TGSI_EXEC_TEMP_00000000_I   (TGSI_EXEC_NUM_TEMPS + 0)
+#define TGSI_EXEC_TEMP_00000000_C   0
+
+#define TGSI_EXEC_TEMP_7FFFFFFF_I   (TGSI_EXEC_NUM_TEMPS + 0)
+#define TGSI_EXEC_TEMP_7FFFFFFF_C   1
+
+#define TGSI_EXEC_TEMP_80000000_I   (TGSI_EXEC_NUM_TEMPS + 0)
+#define TGSI_EXEC_TEMP_80000000_C   2
+
+#define TGSI_EXEC_TEMP_FFFFFFFF_I   (TGSI_EXEC_NUM_TEMPS + 0)
+#define TGSI_EXEC_TEMP_FFFFFFFF_C   3
+
+#define TGSI_EXEC_TEMP_ONE_I        (TGSI_EXEC_NUM_TEMPS + 1)
+#define TGSI_EXEC_TEMP_ONE_C        0
+
+#define TGSI_EXEC_TEMP_TWO_I        (TGSI_EXEC_NUM_TEMPS + 1)
+#define TGSI_EXEC_TEMP_TWO_C        1
+
+#define TGSI_EXEC_TEMP_128_I        (TGSI_EXEC_NUM_TEMPS + 1)
+#define TGSI_EXEC_TEMP_128_C        2
+
+#define TGSI_EXEC_TEMP_MINUS_128_I  (TGSI_EXEC_NUM_TEMPS + 1)
+#define TGSI_EXEC_TEMP_MINUS_128_C  3
+
+#define TGSI_EXEC_TEMP_KILMASK_I    (TGSI_EXEC_NUM_TEMPS + 2)
+#define TGSI_EXEC_TEMP_KILMASK_C    0
+
+#define TGSI_EXEC_TEMP_OUTPUT_I     (TGSI_EXEC_NUM_TEMPS + 2)
+#define TGSI_EXEC_TEMP_OUTPUT_C     1
+
+#define TGSI_EXEC_TEMP_PRIMITIVE_I  (TGSI_EXEC_NUM_TEMPS + 2)
+#define TGSI_EXEC_TEMP_PRIMITIVE_C  2
+
+#define TGSI_EXEC_TEMP_THREE_I      (TGSI_EXEC_NUM_TEMPS + 2)
+#define TGSI_EXEC_TEMP_THREE_C      3
+
+#define TGSI_EXEC_TEMP_HALF_I       (TGSI_EXEC_NUM_TEMPS + 3)
+#define TGSI_EXEC_TEMP_HALF_C       0
+
+#define TGSI_EXEC_TEMP_R0           (TGSI_EXEC_NUM_TEMPS + 4)
+
+#define TGSI_EXEC_TEMP_ADDR         (TGSI_EXEC_NUM_TEMPS + 5)
+
+
+#define TGSI_EXEC_MAX_COND_NESTING  20
+#define TGSI_EXEC_MAX_LOOP_NESTING  20
+#define TGSI_EXEC_MAX_CALL_NESTING  20
+
+/**
+ * Run-time virtual machine state for executing TGSI shader.
+ */
+struct tgsi_exec_machine
+{
+   /* Total = program temporaries + internal temporaries
+    *         + 1 padding to align to 16 bytes
+    */
+   struct tgsi_exec_vector       _Temps[TGSI_EXEC_NUM_TEMPS +
+                                        TGSI_EXEC_NUM_TEMP_EXTRAS + 1];
+
+   /*
+    * This will point to _Temps after aligning to 16B boundary.
+    */
+   struct tgsi_exec_vector       *Temps;
+   struct tgsi_exec_vector       *Addrs;
+
+   struct tgsi_sampler           *Samplers;
+
+   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
+   unsigned                      ImmLimit;
+   const float                   (*Consts)[4];
+   struct tgsi_exec_vector       *Inputs;
+   struct tgsi_exec_vector       *Outputs;
+   const struct tgsi_token       *Tokens;
+   unsigned                      Processor;
+
+   /* GEOMETRY processor only. */
+   unsigned                      *Primitives;
+
+   /* FRAGMENT processor only. */
+   const struct tgsi_interp_coef *InterpCoefs;
+   struct tgsi_exec_vector       QuadPos;
+
+   /* Conditional execution masks */
+   uint CondMask;  /**< For IF/ELSE/ENDIF */
+   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
+   uint ContMask;  /**< For loop CONT statements */
+   uint FuncMask;  /**< For function calls */
+   uint ExecMask;  /**< = CondMask & LoopMask */
+
+   /** Condition mask stack (for nested conditionals) */
+   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
+   int CondStackTop;
+
+   /** Loop mask stack (for nested loops) */
+   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int LoopStackTop;
+
+   /** Loop continue mask stack (see comments in tgsi_exec.c) */
+   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int ContStackTop;
+
+   /** Function execution mask stack (for executing subroutine code) */
+   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int FuncStackTop;
+
+   /** Function call stack for saving/restoring the program counter */
+   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int CallStackTop;
+
+   struct tgsi_full_instruction *Instructions;
+   uint NumInstructions;
+
+   struct tgsi_full_declaration *Declarations;
+   uint NumDeclarations;
+
+   struct tgsi_exec_labels Labels;
+};
+
+void
+tgsi_exec_machine_init(
+   struct tgsi_exec_machine *mach );
+
+
+void 
+tgsi_exec_machine_bind_shader(
+   struct tgsi_exec_machine *mach,
+   const struct tgsi_token *tokens,
+   uint numSamplers,
+   struct tgsi_sampler *samplers);
+
+uint
+tgsi_exec_machine_run(
+   struct tgsi_exec_machine *mach );
+
+
+void
+tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach);
+
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* TGSI_EXEC_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_iterate.c b/src/gallium/auxiliary/tgsi/tgsi_iterate.c
new file mode 100644
index 0000000000..5371a88b96
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_iterate.c
@@ -0,0 +1,85 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_debug.h"
+#include "tgsi_iterate.h"
+
+boolean
+tgsi_iterate_shader(
+   const struct tgsi_token *tokens,
+   struct tgsi_iterate_context *ctx )
+{
+   struct tgsi_parse_context parse;
+
+   if (tgsi_parse_init( &parse, tokens ) != TGSI_PARSE_OK)
+      return FALSE;
+
+   ctx->processor = parse.FullHeader.Processor;
+   ctx->version = parse.FullVersion.Version;
+
+   if (ctx->prolog)
+      if (!ctx->prolog( ctx ))
+         goto fail;
+
+   while (!tgsi_parse_end_of_tokens( &parse )) {
+      tgsi_parse_token( &parse );
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (ctx->iterate_instruction)
+            if (!ctx->iterate_instruction( ctx, &parse.FullToken.FullInstruction ))
+               goto fail;
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (ctx->iterate_declaration)
+            if (!ctx->iterate_declaration( ctx, &parse.FullToken.FullDeclaration ))
+               goto fail;
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         if (ctx->iterate_immediate)
+            if (!ctx->iterate_immediate( ctx, &parse.FullToken.FullImmediate ))
+               goto fail;
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+
+   if (ctx->epilog)
+      if (!ctx->epilog( ctx ))
+         goto fail;
+
+   tgsi_parse_free( &parse );
+   return TRUE;
+
+fail:
+   tgsi_parse_free( &parse );
+   return FALSE;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_iterate.h b/src/gallium/auxiliary/tgsi/tgsi_iterate.h
new file mode 100644
index 0000000000..ec7b85bf63
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_iterate.h
@@ -0,0 +1,76 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_ITERATE_H
+#define TGSI_ITERATE_H
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+struct tgsi_iterate_context
+{
+   boolean
+   (* prolog)(
+      struct tgsi_iterate_context *ctx );
+
+   boolean
+   (* iterate_instruction)(
+      struct tgsi_iterate_context *ctx,
+      struct tgsi_full_instruction *inst );
+
+   boolean
+   (* iterate_declaration)(
+      struct tgsi_iterate_context *ctx,
+      struct tgsi_full_declaration *decl );
+
+   boolean
+   (* iterate_immediate)(
+      struct tgsi_iterate_context *ctx,
+      struct tgsi_full_immediate *imm );
+
+   boolean
+   (* epilog)(
+      struct tgsi_iterate_context *ctx );
+
+   struct tgsi_processor processor;
+   struct tgsi_version version;
+};
+
+boolean
+tgsi_iterate_shader(
+   const struct tgsi_token *tokens,
+   struct tgsi_iterate_context *ctx );
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_ITERATE_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
new file mode 100644
index 0000000000..d16f0cdcad
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -0,0 +1,332 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi_parse.h"
+#include "tgsi_build.h"
+
+void
+tgsi_full_token_init(
+   union tgsi_full_token *full_token )
+{
+   full_token->Token.Type = TGSI_TOKEN_TYPE_DECLARATION;
+}
+
+void
+tgsi_full_token_free(
+   union tgsi_full_token *full_token )
+{
+   if( full_token->Token.Type == TGSI_TOKEN_TYPE_IMMEDIATE ) {
+      FREE( (void *) full_token->FullImmediate.u.Pointer );
+   }
+}
+
+unsigned
+tgsi_parse_init(
+   struct tgsi_parse_context *ctx,
+   const struct tgsi_token *tokens )
+{
+   ctx->FullVersion.Version = *(struct tgsi_version *) &tokens[0];
+   if( ctx->FullVersion.Version.MajorVersion > 1 ) {
+      return TGSI_PARSE_ERROR;
+   }
+
+   ctx->FullHeader.Header = *(struct tgsi_header *) &tokens[1];
+   if( ctx->FullHeader.Header.HeaderSize >= 2 ) {
+      ctx->FullHeader.Processor = *(struct tgsi_processor *) &tokens[2];
+   }
+   else {
+      ctx->FullHeader.Processor = tgsi_default_processor();
+   }
+
+   ctx->Tokens = tokens;
+   ctx->Position = 1 + ctx->FullHeader.Header.HeaderSize;
+
+   tgsi_full_token_init( &ctx->FullToken );
+
+   return TGSI_PARSE_OK;
+}
+
+void
+tgsi_parse_free(
+   struct tgsi_parse_context *ctx )
+{
+   tgsi_full_token_free( &ctx->FullToken );
+}
+
+boolean
+tgsi_parse_end_of_tokens(
+   struct tgsi_parse_context *ctx )
+{
+   return ctx->Position >=
+      1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
+}
+
+static void
+next_token(
+   struct tgsi_parse_context *ctx,
+   void *token )
+{
+   assert( !tgsi_parse_end_of_tokens( ctx ) );
+
+   *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
+}
+
+void
+tgsi_parse_token(
+   struct tgsi_parse_context *ctx )
+{
+   struct tgsi_token token;
+   unsigned i;
+
+   tgsi_full_token_free( &ctx->FullToken );
+   tgsi_full_token_init( &ctx->FullToken );
+
+   next_token( ctx, &token );
+
+   switch( token.Type ) {
+   case TGSI_TOKEN_TYPE_DECLARATION:
+   {
+      struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
+
+      *decl = tgsi_default_full_declaration();
+      decl->Declaration = *(struct tgsi_declaration *) &token;
+
+      next_token( ctx, &decl->DeclarationRange );
+
+      if( decl->Declaration.Semantic ) {
+         next_token( ctx, &decl->Semantic );
+      }
+
+      break;
+   }
+
+   case TGSI_TOKEN_TYPE_IMMEDIATE:
+   {
+      struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
+
+      *imm = tgsi_default_full_immediate();
+      imm->Immediate = *(struct tgsi_immediate *) &token;
+
+      assert( !imm->Immediate.Extended );
+
+      switch (imm->Immediate.DataType) {
+      case TGSI_IMM_FLOAT32:
+         imm->u.Pointer = MALLOC(
+            sizeof( struct tgsi_immediate_float32 ) * (imm->Immediate.Size - 1) );
+         for( i = 0; i < imm->Immediate.Size - 1; i++ ) {
+            next_token( ctx, (struct tgsi_immediate_float32 *) &imm->u.ImmediateFloat32[i] );
+         }
+         break;
+
+      default:
+         assert( 0 );
+      }
+
+      break;
+   }
+
+   case TGSI_TOKEN_TYPE_INSTRUCTION:
+   {
+      struct tgsi_full_instruction *inst = &ctx->FullToken.FullInstruction;
+      unsigned extended;
+
+      *inst = tgsi_default_full_instruction();
+      inst->Instruction = *(struct tgsi_instruction *) &token;
+
+      extended = inst->Instruction.Extended;
+
+      while( extended ) {
+         struct tgsi_src_register_ext token;
+
+         next_token( ctx, &token );
+
+         switch( token.Type ) {
+         case TGSI_INSTRUCTION_EXT_TYPE_NV:
+            inst->InstructionExtNv =
+               *(struct tgsi_instruction_ext_nv *) &token;
+            break;
+
+         case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
+            inst->InstructionExtLabel =
+               *(struct tgsi_instruction_ext_label *) &token;
+            break;
+
+         case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
+            inst->InstructionExtTexture =
+               *(struct tgsi_instruction_ext_texture *) &token;
+            break;
+
+         default:
+            assert( 0 );
+         }
+
+         extended = token.Extended;
+      }
+
+      assert( inst->Instruction.NumDstRegs <= TGSI_FULL_MAX_DST_REGISTERS );
+
+      for(  i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
+         unsigned extended;
+
+         next_token( ctx, &inst->FullDstRegisters[i].DstRegister );
+
+         /*
+          * No support for indirect or multi-dimensional addressing.
+          */
+         assert( !inst->FullDstRegisters[i].DstRegister.Indirect );
+         assert( !inst->FullDstRegisters[i].DstRegister.Dimension );
+
+         extended = inst->FullDstRegisters[i].DstRegister.Extended;
+
+         while( extended ) {
+            struct tgsi_src_register_ext token;
+
+            next_token( ctx, &token );
+
+            switch( token.Type ) {
+            case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
+               inst->FullDstRegisters[i].DstRegisterExtConcode =
+                  *(struct tgsi_dst_register_ext_concode *) &token;
+               break;
+
+            case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
+               inst->FullDstRegisters[i].DstRegisterExtModulate =
+                  *(struct tgsi_dst_register_ext_modulate *) &token;
+               break;
+
+            default:
+               assert( 0 );
+            }
+
+            extended = token.Extended;
+         }
+      }
+
+      assert( inst->Instruction.NumSrcRegs <= TGSI_FULL_MAX_SRC_REGISTERS );
+
+      for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
+         unsigned extended;
+
+         next_token( ctx, &inst->FullSrcRegisters[i].SrcRegister );
+
+         extended = inst->FullSrcRegisters[i].SrcRegister.Extended;
+
+         while( extended ) {
+            struct tgsi_src_register_ext token;
+
+            next_token( ctx, &token );
+
+            switch( token.Type ) {
+            case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
+               inst->FullSrcRegisters[i].SrcRegisterExtSwz =
+                  *(struct tgsi_src_register_ext_swz *) &token;
+               break;
+
+            case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
+               inst->FullSrcRegisters[i].SrcRegisterExtMod =
+                  *(struct tgsi_src_register_ext_mod *) &token;
+               break;
+
+            default:
+               assert( 0 );
+            }
+
+            extended = token.Extended;
+         }
+
+         if( inst->FullSrcRegisters[i].SrcRegister.Indirect ) {
+            next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterInd );
+
+            /*
+             * No support for indirect or multi-dimensional addressing.
+             */
+            assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Indirect );
+            assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Dimension );
+            assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Extended );
+         }
+
+         if( inst->FullSrcRegisters[i].SrcRegister.Dimension ) {
+            next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterDim );
+
+            /*
+             * No support for multi-dimensional addressing.
+             */
+            assert( !inst->FullSrcRegisters[i].SrcRegisterDim.Dimension );
+            assert( !inst->FullSrcRegisters[i].SrcRegisterDim.Extended );
+
+            if( inst->FullSrcRegisters[i].SrcRegisterDim.Indirect ) {
+               next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterDimInd );
+
+               /*
+               * No support for indirect or multi-dimensional addressing.
+               */
+               assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Indirect );
+               assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Dimension );
+               assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Extended );
+            }
+         }
+      }
+
+      break;
+   }
+
+   default:
+      assert( 0 );
+   }
+}
+
+
+unsigned
+tgsi_num_tokens(const struct tgsi_token *tokens)
+{
+   struct tgsi_parse_context ctx;
+   if (tgsi_parse_init(&ctx, tokens) == TGSI_PARSE_OK) {
+      unsigned len = (ctx.FullHeader.Header.HeaderSize +
+                      ctx.FullHeader.Header.BodySize +
+                      1);
+      return len;
+   }
+   return 0;
+}
+
+
+/**
+ * Make a new copy of a token array.
+ */
+struct tgsi_token *
+tgsi_dup_tokens(const struct tgsi_token *tokens)
+{
+   unsigned n = tgsi_num_tokens(tokens);
+   unsigned bytes = n * sizeof(struct tgsi_token);
+   struct tgsi_token *new_tokens = (struct tgsi_token *) MALLOC(bytes);
+   if (new_tokens)
+      memcpy(new_tokens, tokens, bytes);
+   return new_tokens;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
new file mode 100644
index 0000000000..054350712d
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -0,0 +1,151 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_PARSE_H
+#define TGSI_PARSE_H
+
+#include "pipe/p_shader_tokens.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+struct tgsi_full_version
+{
+   struct tgsi_version  Version;
+};
+
+struct tgsi_full_header
+{
+   struct tgsi_header      Header;
+   struct tgsi_processor   Processor;
+};
+
+struct tgsi_full_dst_register
+{
+   struct tgsi_dst_register               DstRegister;
+   struct tgsi_dst_register_ext_concode   DstRegisterExtConcode;
+   struct tgsi_dst_register_ext_modulate  DstRegisterExtModulate;
+};
+
+struct tgsi_full_src_register
+{
+   struct tgsi_src_register         SrcRegister;
+   struct tgsi_src_register_ext_swz SrcRegisterExtSwz;
+   struct tgsi_src_register_ext_mod SrcRegisterExtMod;
+   struct tgsi_src_register         SrcRegisterInd;
+   struct tgsi_dimension            SrcRegisterDim;
+   struct tgsi_src_register         SrcRegisterDimInd;
+};
+
+struct tgsi_full_declaration
+{
+   struct tgsi_declaration Declaration;
+   struct tgsi_declaration_range DeclarationRange;
+   struct tgsi_declaration_semantic Semantic;
+};
+
+struct tgsi_full_immediate
+{
+   struct tgsi_immediate   Immediate;
+   union
+   {
+      const void                          *Pointer;
+      const struct tgsi_immediate_float32 *ImmediateFloat32;
+   } u;
+};
+
+#define TGSI_FULL_MAX_DST_REGISTERS 2
+#define TGSI_FULL_MAX_SRC_REGISTERS 4 /* TXD has 4 */
+
+struct tgsi_full_instruction
+{
+   struct tgsi_instruction             Instruction;
+   struct tgsi_instruction_ext_nv      InstructionExtNv;
+   struct tgsi_instruction_ext_label   InstructionExtLabel;
+   struct tgsi_instruction_ext_texture InstructionExtTexture;
+   struct tgsi_full_dst_register       FullDstRegisters[TGSI_FULL_MAX_DST_REGISTERS];
+   struct tgsi_full_src_register       FullSrcRegisters[TGSI_FULL_MAX_SRC_REGISTERS];
+};
+
+union tgsi_full_token
+{
+   struct tgsi_token             Token;
+   struct tgsi_full_declaration  FullDeclaration;
+   struct tgsi_full_immediate    FullImmediate;
+   struct tgsi_full_instruction  FullInstruction;
+};
+
+void
+tgsi_full_token_init(
+   union tgsi_full_token *full_token );
+
+void
+tgsi_full_token_free(
+   union tgsi_full_token *full_token );
+
+struct tgsi_parse_context
+{
+   const struct tgsi_token    *Tokens;
+   unsigned                   Position;
+   struct tgsi_full_version   FullVersion;
+   struct tgsi_full_header    FullHeader;
+   union tgsi_full_token      FullToken;
+};
+
+#define TGSI_PARSE_OK      0
+#define TGSI_PARSE_ERROR   1
+
+unsigned
+tgsi_parse_init(
+   struct tgsi_parse_context *ctx,
+   const struct tgsi_token *tokens );
+
+void
+tgsi_parse_free(
+   struct tgsi_parse_context *ctx );
+
+boolean
+tgsi_parse_end_of_tokens(
+   struct tgsi_parse_context *ctx );
+
+void
+tgsi_parse_token(
+   struct tgsi_parse_context *ctx );
+
+unsigned
+tgsi_num_tokens(const struct tgsi_token *tokens);
+
+struct tgsi_token *
+tgsi_dup_tokens(const struct tgsi_token *tokens);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_PARSE_H */
+
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
new file mode 100644
index 0000000000..2e3ec96b5b
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -0,0 +1,341 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_debug.h"
+#include "tgsi_sanity.h"
+#include "tgsi_iterate.h"
+
+#define MAX_REGISTERS 256
+
+typedef uint reg_flag;
+
+#define BITS_IN_REG_FLAG (sizeof( reg_flag ) * 8)
+
+struct sanity_check_ctx
+{
+   struct tgsi_iterate_context iter;
+
+   reg_flag regs_decl[TGSI_FILE_COUNT][MAX_REGISTERS / BITS_IN_REG_FLAG];
+   reg_flag regs_used[TGSI_FILE_COUNT][MAX_REGISTERS / BITS_IN_REG_FLAG];
+   boolean regs_ind_used[TGSI_FILE_COUNT];
+   uint num_imms;
+   uint num_instructions;
+   uint index_of_END;
+
+   uint errors;
+   uint warnings;
+};
+
+static void
+report_error(
+   struct sanity_check_ctx *ctx,
+   const char *format,
+   ... )
+{
+   va_list args;
+
+   debug_printf( "Error  : " );
+   va_start( args, format );
+   _debug_vprintf( format, args );
+   va_end( args );
+   debug_printf( "\n" );
+   ctx->errors++;
+}
+
+static void
+report_warning(
+   struct sanity_check_ctx *ctx,
+   const char *format,
+   ... )
+{
+   va_list args;
+
+   debug_printf( "Warning: " );
+   va_start( args, format );
+   _debug_vprintf( format, args );
+   va_end( args );
+   debug_printf( "\n" );
+   ctx->warnings++;
+}
+
+static boolean
+check_file_name(
+   struct sanity_check_ctx *ctx,
+   uint file )
+{
+   if (file <= TGSI_FILE_NULL || file >= TGSI_FILE_COUNT) {
+      report_error( ctx, "Invalid register file name" );
+      return FALSE;
+   }
+   return TRUE;
+}
+
+static boolean
+is_register_declared(
+   struct sanity_check_ctx *ctx,
+   uint file,
+   int index )
+{
+   assert( index >= 0 && index < MAX_REGISTERS );
+
+   return (ctx->regs_decl[file][index / BITS_IN_REG_FLAG] & (1 << (index % BITS_IN_REG_FLAG))) ? TRUE : FALSE;
+}
+
+static boolean
+is_any_register_declared(
+   struct sanity_check_ctx *ctx,
+   uint file )
+{
+   uint i;
+
+   for (i = 0; i < MAX_REGISTERS / BITS_IN_REG_FLAG; i++)
+      if (ctx->regs_decl[file][i])
+         return TRUE;
+   return FALSE;
+}
+
+static boolean
+is_register_used(
+   struct sanity_check_ctx *ctx,
+   uint file,
+   int index )
+{
+   assert( index < MAX_REGISTERS );
+
+   return (ctx->regs_used[file][index / BITS_IN_REG_FLAG] & (1 << (index % BITS_IN_REG_FLAG))) ? TRUE : FALSE;
+}
+
+static const char *file_names[] =
+{
+   "NULL",
+   "CONST",
+   "IN",
+   "OUT",
+   "TEMP",
+   "SAMP",
+   "ADDR",
+   "IMM"
+};
+
+static boolean
+check_register_usage(
+   struct sanity_check_ctx *ctx,
+   uint file,
+   int index,
+   const char *name,
+   boolean indirect_access )
+{
+   if (!check_file_name( ctx, file ))
+      return FALSE;
+   if (indirect_access) {
+      if (!is_any_register_declared( ctx, file ))
+         report_error( ctx, "%s: Undeclared %s register", file_names[file], name );
+      ctx->regs_ind_used[file] = TRUE;
+   }
+   else {
+      if (!is_register_declared( ctx, file, index ))
+         report_error( ctx, "%s[%d]: Undeclared %s register", file_names[file], index, name );
+      ctx->regs_used[file][index / BITS_IN_REG_FLAG] |= (1 << (index % BITS_IN_REG_FLAG));
+   }
+   return TRUE;
+}
+
+static boolean
+iter_instruction(
+   struct tgsi_iterate_context *iter,
+   struct tgsi_full_instruction *inst )
+{
+   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
+   uint i;
+
+   /* There must be no other instructions after END.
+    */
+   if (ctx->index_of_END != ~0) {
+      report_error( ctx, "Unexpected instruction after END" );
+   }
+   else if (inst->Instruction.Opcode == TGSI_OPCODE_END) {
+      ctx->index_of_END = ctx->num_instructions;
+   }
+
+   /* Check destination and source registers' validity.
+    * Mark the registers as used.
+    */
+   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
+      check_register_usage(
+         ctx,
+         inst->FullDstRegisters[i].DstRegister.File,
+         inst->FullDstRegisters[i].DstRegister.Index,
+         "destination",
+         FALSE );
+   }
+   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+      check_register_usage(
+         ctx,
+         inst->FullSrcRegisters[i].SrcRegister.File,
+         inst->FullSrcRegisters[i].SrcRegister.Index,
+         "source",
+         (boolean)inst->FullSrcRegisters[i].SrcRegister.Indirect );
+      if (inst->FullSrcRegisters[i].SrcRegister.Indirect) {
+         uint file;
+         int index;
+
+         file = inst->FullSrcRegisters[i].SrcRegisterInd.File;
+         index = inst->FullSrcRegisters[i].SrcRegisterInd.Index;
+         check_register_usage(
+            ctx,
+            file,
+            index,
+            "indirect",
+            FALSE );
+         if (file != TGSI_FILE_ADDRESS || index != 0)
+            report_warning( ctx, "Indirect register not ADDR[0]" );
+      }
+   }
+
+   ctx->num_instructions++;
+
+   return TRUE;
+}
+
+static boolean
+iter_declaration(
+   struct tgsi_iterate_context *iter,
+   struct tgsi_full_declaration *decl )
+{
+   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
+   uint file;
+   uint i;
+
+   /* No declarations allowed after the first instruction.
+    */
+   if (ctx->num_instructions > 0)
+      report_error( ctx, "Instruction expected but declaration found" );
+
+   /* Check registers' validity.
+    * Mark the registers as declared.
+    */
+   file = decl->Declaration.File;
+   if (!check_file_name( ctx, file ))
+      return TRUE;
+   for (i = decl->DeclarationRange.First; i <= decl->DeclarationRange.Last; i++) {
+      if (is_register_declared( ctx, file, i ))
+         report_error( ctx, "The same register declared twice" );
+      ctx->regs_decl[file][i / BITS_IN_REG_FLAG] |= (1 << (i % BITS_IN_REG_FLAG));
+   }
+
+   return TRUE;
+}
+
+static boolean
+iter_immediate(
+   struct tgsi_iterate_context *iter,
+   struct tgsi_full_immediate *imm )
+{
+   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
+
+   assert( ctx->num_imms < MAX_REGISTERS );
+
+   /* No immediates allowed after the first instruction.
+    */
+   if (ctx->num_instructions > 0)
+      report_error( ctx, "Instruction expected but immediate found" );
+
+   /* Mark the register as declared.
+    */
+   ctx->regs_decl[TGSI_FILE_IMMEDIATE][ctx->num_imms / BITS_IN_REG_FLAG] |= (1 << (ctx->num_imms % BITS_IN_REG_FLAG));
+   ctx->num_imms++;
+
+   /* Check data type validity.
+    */
+   if (imm->Immediate.DataType != TGSI_IMM_FLOAT32) {
+      report_error( ctx, "Invalid immediate data type" );
+      return TRUE;
+   }
+
+   return TRUE;
+}
+
+static boolean
+epilog(
+   struct tgsi_iterate_context *iter )
+{
+   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
+   uint file;
+
+   /* There must be an END instruction at the end.
+    */
+   if (ctx->index_of_END == ~0 || ctx->index_of_END != ctx->num_instructions - 1) {
+      report_error( ctx, "Expected END at end of instruction sequence" );
+   }
+
+   /* Check if all declared registers were used.
+    */
+   for (file = TGSI_FILE_NULL; file < TGSI_FILE_COUNT; file++) {
+      uint i;
+
+      for (i = 0; i < MAX_REGISTERS; i++) {
+         if (is_register_declared( ctx, file, i ) && !is_register_used( ctx, file, i ) && !ctx->regs_ind_used[file]) {
+            report_warning( ctx, "Register never used" );
+         }
+      }
+   }
+
+   /* Print totals, if any.
+    */
+   if (ctx->errors || ctx->warnings)
+      debug_printf( "\n%u errors, %u warnings", ctx->errors, ctx->warnings );
+
+   return TRUE;
+}
+
+boolean
+tgsi_sanity_check(
+   struct tgsi_token *tokens )
+{
+   struct sanity_check_ctx ctx;
+
+   ctx.iter.prolog = NULL;
+   ctx.iter.iterate_instruction = iter_instruction;
+   ctx.iter.iterate_declaration = iter_declaration;
+   ctx.iter.iterate_immediate = iter_immediate;
+   ctx.iter.epilog = epilog;
+
+   memset( ctx.regs_decl, 0, sizeof( ctx.regs_decl ) );
+   memset( ctx.regs_used, 0, sizeof( ctx.regs_used ) );
+   memset( ctx.regs_ind_used, 0, sizeof( ctx.regs_ind_used ) );
+   ctx.num_imms = 0;
+   ctx.num_instructions = 0;
+   ctx.index_of_END = ~0;
+
+   ctx.errors = 0;
+   ctx.warnings = 0;
+
+   if (!tgsi_iterate_shader( tokens, &ctx.iter ))
+      return FALSE;
+
+   return ctx.errors == 0;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
new file mode 100644
index 0000000000..ca45e94c7a
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
@@ -0,0 +1,49 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_SANITY_H
+#define TGSI_SANITY_H
+
+#include "pipe/p_shader_tokens.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/* Check the given token stream for errors and common mistakes.
+ * Diagnostic messages are printed out to the debug output.
+ * Returns TRUE if there are no errors, even though there could be some warnings.
+ */
+boolean
+tgsi_sanity_check(
+   struct tgsi_token *tokens );
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_SANITY_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
new file mode 100644
index 0000000000..59bcf10b53
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -0,0 +1,226 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI program scan utility.
+ * Used to determine which registers and instructions are used by a shader.
+ *
+ * Authors:  Brian Paul
+ */
+
+
+#include "tgsi_scan.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_build.h"
+
+#include "pipe/p_util.h"
+
+
+
+/**
+ */
+void
+tgsi_scan_shader(const struct tgsi_token *tokens,
+                 struct tgsi_shader_info *info)
+{
+   uint procType, i;
+   struct tgsi_parse_context parse;
+
+   memset(info, 0, sizeof(*info));
+   for (i = 0; i < TGSI_FILE_COUNT; i++)
+      info->file_max[i] = -1;
+
+   /**
+    ** Setup to begin parsing input shader
+    **/
+   if (tgsi_parse_init( &parse, tokens ) != TGSI_PARSE_OK) {
+      debug_printf("tgsi_parse_init() failed in tgsi_scan_shader()!\n");
+      return;
+   }
+   procType = parse.FullHeader.Processor.Processor;
+   assert(procType == TGSI_PROCESSOR_FRAGMENT ||
+          procType == TGSI_PROCESSOR_VERTEX ||
+          procType == TGSI_PROCESSOR_GEOMETRY);
+
+
+   /**
+    ** Loop over incoming program tokens/instructions
+    */
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+
+      info->num_tokens++;
+
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         {
+            struct tgsi_full_instruction *fullinst
+               = &parse.FullToken.FullInstruction;
+
+            assert(fullinst->Instruction.Opcode < TGSI_OPCODE_LAST);
+            info->opcode_count[fullinst->Instruction.Opcode]++;
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         {
+            struct tgsi_full_declaration *fulldecl
+               = &parse.FullToken.FullDeclaration;
+            uint file = fulldecl->Declaration.File;
+            uint i;
+            for (i = fulldecl->DeclarationRange.First;
+                 i <= fulldecl->DeclarationRange.Last;
+                 i++) {
+
+               /* only first 32 regs will appear in this bitfield */
+               info->file_mask[file] |= (1 << i);
+               info->file_count[file]++;
+               info->file_max[file] = MAX2(info->file_max[file], (int)i);
+
+               if (file == TGSI_FILE_INPUT) {
+                  info->input_semantic_name[i] = (ubyte)fulldecl->Semantic.SemanticName;
+                  info->input_semantic_index[i] = (ubyte)fulldecl->Semantic.SemanticIndex;
+                  info->num_inputs++;
+               }
+
+               if (file == TGSI_FILE_OUTPUT) {
+                  info->output_semantic_name[i] = (ubyte)fulldecl->Semantic.SemanticName;
+                  info->output_semantic_index[i] = (ubyte)fulldecl->Semantic.SemanticIndex;
+                  info->num_outputs++;
+               }
+
+               /* special case */
+               if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                   file == TGSI_FILE_OUTPUT &&
+                   fulldecl->Semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
+                  info->writes_z = TRUE;
+               }
+            }
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         info->immediate_count++;
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+
+   assert( info->file_max[TGSI_FILE_INPUT] + 1 == info->num_inputs );
+   assert( info->file_max[TGSI_FILE_OUTPUT] + 1 == info->num_outputs );
+
+   info->uses_kill = (info->opcode_count[TGSI_OPCODE_KIL] ||
+                      info->opcode_count[TGSI_OPCODE_KILP]);
+
+   tgsi_parse_free (&parse);
+}
+
+
+
+/**
+ * Check if the given shader is a "passthrough" shader consisting of only
+ * MOV instructions of the form:  MOV OUT[n], IN[n]
+ *  
+ */
+boolean
+tgsi_is_passthrough_shader(const struct tgsi_token *tokens)
+{
+   struct tgsi_parse_context parse;
+
+   /**
+    ** Setup to begin parsing input shader
+    **/
+   if (tgsi_parse_init(&parse, tokens) != TGSI_PARSE_OK) {
+      debug_printf("tgsi_parse_init() failed in tgsi_is_passthrough_shader()!\n");
+      return FALSE;
+   }
+
+   /**
+    ** Loop over incoming program tokens/instructions
+    */
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         {
+            struct tgsi_full_instruction *fullinst =
+               &parse.FullToken.FullInstruction;
+            const struct tgsi_full_src_register *src =
+               &fullinst->FullSrcRegisters[0];
+            const struct tgsi_full_dst_register *dst =
+               &fullinst->FullDstRegisters[0];
+
+            /* Do a whole bunch of checks for a simple move */
+            if (fullinst->Instruction.Opcode != TGSI_OPCODE_MOV ||
+                src->SrcRegister.File != TGSI_FILE_INPUT ||
+                dst->DstRegister.File != TGSI_FILE_OUTPUT ||
+                src->SrcRegister.Index != dst->DstRegister.Index ||
+
+                src->SrcRegister.Negate ||
+                src->SrcRegisterExtMod.Negate ||
+                src->SrcRegisterExtMod.Absolute ||
+                src->SrcRegisterExtMod.Scale2X ||
+                src->SrcRegisterExtMod.Bias ||
+                src->SrcRegisterExtMod.Complement ||
+
+                src->SrcRegister.SwizzleX != TGSI_SWIZZLE_X ||
+                src->SrcRegister.SwizzleY != TGSI_SWIZZLE_Y ||
+                src->SrcRegister.SwizzleZ != TGSI_SWIZZLE_Z ||
+                src->SrcRegister.SwizzleW != TGSI_SWIZZLE_W ||
+
+                src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X ||
+                src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y ||
+                src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z ||
+                src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W ||
+
+                dst->DstRegister.WriteMask != TGSI_WRITEMASK_XYZW)
+            {
+               tgsi_parse_free(&parse);
+               return FALSE;
+            }
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         /* fall-through */
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* fall-through */
+      default:
+         ; /* no-op */
+      }
+   }
+
+   tgsi_parse_free(&parse);
+
+   /* if we get here, it's a pass-through shader */
+   return TRUE;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
new file mode 100644
index 0000000000..5cb6efb343
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -0,0 +1,74 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_SCAN_H
+#define TGSI_SCAN_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+
+
+/**
+ * Shader summary info
+ */
+struct tgsi_shader_info
+{
+   uint num_tokens;
+
+   /* XXX eventually remove the corresponding fields from pipe_shader_state: */
+   ubyte num_inputs;
+   ubyte num_outputs;
+   ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
+   ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
+   ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
+   ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
+
+   uint file_mask[TGSI_FILE_COUNT];  /**< bitmask of declared registers */
+   uint file_count[TGSI_FILE_COUNT];  /**< number of declared registers */
+   int file_max[TGSI_FILE_COUNT];  /**< highest index of declared registers */
+
+   uint immediate_count; /**< number of immediates declared */
+
+   uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
+
+   boolean writes_z;  /**< does fragment shader write Z value? */
+   boolean uses_kill;  /**< KIL or KILP instruction used? */
+};
+
+
+extern void
+tgsi_scan_shader(const struct tgsi_token *tokens,
+                 struct tgsi_shader_info *info);
+
+
+extern boolean
+tgsi_is_passthrough_shader(const struct tgsi_token *tokens);
+
+
+#endif /* TGSI_SCAN_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
new file mode 100644
index 0000000000..0cb1f11ef2
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -0,0 +1,2275 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi_exec.h"
+#include "tgsi_sse2.h"
+
+#include "rtasm/rtasm_x86sse.h"
+
+#ifdef PIPE_ARCH_X86
+
+/* for 1/sqrt()
+ *
+ * This costs about 100fps (close to 10%) in gears:
+ */
+#define HIGH_PRECISION 1
+
+
+#define FOR_EACH_CHANNEL( CHAN )\
+   for( CHAN = 0; CHAN < 4; CHAN++ )
+
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+#define TEMP_R0   TGSI_EXEC_TEMP_R0
+
+/**
+ * X86 utility functions.
+ */
+
+static struct x86_reg
+make_xmm(
+   unsigned xmm )
+{
+   return x86_make_reg(
+      file_XMM,
+      (enum x86_reg_name) xmm );
+}
+
+/**
+ * X86 register mapping helpers.
+ */
+
+static struct x86_reg
+get_const_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_CX );
+}
+
+static struct x86_reg
+get_input_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_AX );
+}
+
+static struct x86_reg
+get_output_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DX );
+}
+
+static struct x86_reg
+get_temp_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_BX );
+}
+
+static struct x86_reg
+get_coef_base( void )
+{
+   return get_output_base();
+}
+
+static struct x86_reg
+get_immediate_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DI );
+}
+
+
+/**
+ * Data access helpers.
+ */
+
+
+static struct x86_reg
+get_immediate(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_immediate_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
+get_const(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_const_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
+get_input(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_input_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_output(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_output_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_temp(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_temp_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_coef(
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   return x86_make_disp(
+      get_coef_base(),
+      ((vec * 3 + member) * 4 + chan) * 4 );
+}
+
+
+static void
+emit_ret(
+   struct x86_function  *func )
+{
+   x86_ret( func );
+}
+
+
+/**
+ * Data fetch helpers.
+ */
+
+/**
+ * Copy a shader constant to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src const buffer index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_const(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_const( vec, chan ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+static void
+emit_immediate(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_immediate( vec, chan ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+
+/**
+ * Copy a shader input to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src input attrib
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_inputf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      make_xmm( xmm ),
+      get_input( vec, chan ) );
+}
+
+/**
+ * Store an xmm register to a shader output
+ * \param xmm  the source xmm register
+ * \param vec  the dest output attrib
+ * \param chan  src dest channel to store (X, Y, Z or W)
+ */
+static void
+emit_output(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      get_output( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+/**
+ * Copy a shader temporary to xmm register
+ * \param xmm  the destination xmm register
+ * \param vec  the src temp register
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ */
+static void
+emit_tempf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movaps(
+      func,
+      make_xmm( xmm ),
+      get_temp( vec, chan ) );
+}
+
+/**
+ * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
+ * \param xmm  the destination xmm register
+ * \param vec  the src input/attribute coefficient index
+ * \param chan  src channel to fetch (X, Y, Z or W)
+ * \param member  0=a0, 1=dadx, 2=dady
+ */
+static void
+emit_coef(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_coef( vec, chan, member ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+/**
+ * Data store helpers.
+ */
+
+static void
+emit_inputs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      get_input( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_temps(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movaps(
+      func,
+      get_temp( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_addrs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_temps(
+      func,
+      xmm,
+      vec + TGSI_EXEC_NUM_TEMPS,
+      chan );
+}
+
+/**
+ * Coefficent fetch helpers.
+ */
+
+static void
+emit_coef_a0(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      0 );
+}
+
+static void
+emit_coef_dadx(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      1 );
+}
+
+static void
+emit_coef_dady(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      2 );
+}
+
+/**
+ * Function call helpers.
+ */
+
+static void
+emit_push_gp(
+   struct x86_function *func )
+{
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_AX) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_CX) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_DX) );
+}
+
+static void
+x86_pop_gp(
+   struct x86_function *func )
+{
+   /* Restore GP registers in a reverse order.
+    */
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_DX) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_CX) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_AX) );
+}
+
+static void
+emit_func_call_dst(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   void (PIPE_CDECL *code)() )
+{
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 0 ),
+      make_xmm( xmm_dst ) );
+
+   emit_push_gp(
+      func );
+
+   {
+      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+
+      x86_lea(
+         func,
+         ecx,
+         get_temp( TEMP_R0, 0 ) );
+
+      x86_push( func, ecx );
+      x86_mov_reg_imm( func, ecx, (unsigned long) code );
+      x86_call( func, ecx );
+      x86_pop(func, ecx ); 
+   }
+
+
+   x86_pop_gp(
+      func );
+
+   sse_movaps(
+      func,
+      make_xmm( xmm_dst ),
+      get_temp( TEMP_R0, 0 ) );
+}
+
+static void
+emit_func_call_dst_src(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src,
+   void (PIPE_CDECL *code)() )
+{
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 1 ),
+      make_xmm( xmm_src ) );
+
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      code );
+}
+
+/**
+ * Low-level instruction translators.
+ */
+
+static void
+emit_abs(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse_andps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_7FFFFFFF_I,
+         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
+}
+
+static void
+emit_add(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   sse_addps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void PIPE_CDECL
+cos4f(
+   float *store )
+{
+   const unsigned X = 0;
+
+   store[X + 0] = cosf( store[X + 0] );
+   store[X + 1] = cosf( store[X + 1] );
+   store[X + 2] = cosf( store[X + 2] );
+   store[X + 3] = cosf( store[X + 3] );
+}
+
+static void
+emit_cos(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      cos4f );
+}
+
+static void PIPE_CDECL
+ex24f(
+   float *store )
+{
+   const unsigned X = 0;
+
+   store[X + 0] = powf( 2.0f, store[X + 0] );
+   store[X + 1] = powf( 2.0f, store[X + 1] );
+   store[X + 2] = powf( 2.0f, store[X + 2] );
+   store[X + 3] = powf( 2.0f, store[X + 3] );
+}
+
+static void
+emit_ex2(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      ex24f );
+}
+
+static void
+emit_f2it(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse2_cvttps2dq(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ) );
+}
+
+static void PIPE_CDECL
+flr4f(
+   float *store )
+{
+   const unsigned X = 0;
+
+   store[X + 0] = floorf( store[X + 0] );
+   store[X + 1] = floorf( store[X + 1] );
+   store[X + 2] = floorf( store[X + 2] );
+   store[X + 3] = floorf( store[X + 3] );
+}
+
+static void
+emit_flr(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      flr4f );
+}
+
+static void PIPE_CDECL
+frc4f(
+   float *store )
+{
+   const unsigned X = 0;
+
+   store[X + 0] -= floorf( store[X + 0] );
+   store[X + 1] -= floorf( store[X + 1] );
+   store[X + 2] -= floorf( store[X + 2] );
+   store[X + 3] -= floorf( store[X + 3] );
+}
+
+static void
+emit_frc(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      frc4f );
+}
+
+static void PIPE_CDECL
+lg24f(
+   float *store )
+{
+   const unsigned X = 0;
+
+   store[X + 0] = LOG2( store[X + 0] );
+   store[X + 1] = LOG2( store[X + 1] );
+   store[X + 2] = LOG2( store[X + 2] );
+   store[X + 3] = LOG2( store[X + 3] );
+}
+
+static void
+emit_lg2(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      lg24f );
+}
+
+static void
+emit_MOV(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   sse_movups(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_mul (struct x86_function *func,
+          unsigned xmm_dst,
+          unsigned xmm_src)
+{
+   sse_mulps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_neg(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse_xorps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_80000000_I,
+         TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+static void PIPE_CDECL
+pow4f(
+   float *store )
+{
+   const unsigned X = 0;
+
+   store[X + 0] = powf( store[X + 0], store[X + 4] );
+   store[X + 1] = powf( store[X + 1], store[X + 5] );
+   store[X + 2] = powf( store[X + 2], store[X + 6] );
+   store[X + 3] = powf( store[X + 3], store[X + 7] );
+}
+
+static void
+emit_pow(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_func_call_dst_src(
+      func,
+      xmm_dst,
+      xmm_src,
+      pow4f );
+}
+
+static void
+emit_rcp (
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.  Need to either emit a proper divide or use the
+    * iterative technique described below in emit_rsqrt().
+    */
+   sse2_rcpps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_rsqrt(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+#if HIGH_PRECISION
+   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
+    * implementations, it is possible to improve its precision at
+    * fairly low cost, using a newton/raphson step, as below:
+    * 
+    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+    *
+    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+    */
+   {
+      struct x86_reg dst = make_xmm( xmm_dst );
+      struct x86_reg src = make_xmm( xmm_src );
+      struct x86_reg tmp0 = make_xmm( 2 );
+      struct x86_reg tmp1 = make_xmm( 3 );
+
+      assert( xmm_dst != xmm_src );
+      assert( xmm_dst != 2 && xmm_dst != 3 );
+      assert( xmm_src != 2 && xmm_src != 3 );
+
+      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
+      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
+      sse_rsqrtps( func, tmp1, src  );
+      sse_mulps(   func, src,  tmp1 );
+      sse_mulps(   func, dst,  tmp1 );
+      sse_mulps(   func, src,  tmp1 );
+      sse_subps(   func, tmp0, src  );
+      sse_mulps(   func, dst,  tmp0 );
+   }
+#else
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.
+    */
+   sse_rsqrtps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+#endif
+}
+
+static void
+emit_setsign(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse_orps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_80000000_I,
+         TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+static void PIPE_CDECL
+sin4f(
+   float *store )
+{
+   const unsigned X = 0;
+
+   store[X + 0] = sinf( store[X + 0] );
+   store[X + 1] = sinf( store[X + 1] );
+   store[X + 2] = sinf( store[X + 2] );
+   store[X + 3] = sinf( store[X + 3] );
+}
+
+static void
+emit_sin (struct x86_function *func,
+          unsigned xmm_dst)
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      sin4f );
+}
+
+static void
+emit_sub(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   sse_subps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+/**
+ * Register fetch.
+ */
+
+static void
+emit_fetch(
+   struct x86_function *func,
+   unsigned xmm,
+   const struct tgsi_full_src_register *reg,
+   const unsigned chan_index )
+{
+   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+
+   switch( swizzle ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch( reg->SrcRegister.File ) {
+      case TGSI_FILE_CONSTANT:
+         emit_const(
+            func,
+            xmm,
+            reg->SrcRegister.Index,
+            swizzle );
+         break;
+
+      case TGSI_FILE_IMMEDIATE:
+         emit_immediate(
+            func,
+            xmm,
+            reg->SrcRegister.Index,
+            swizzle );
+         break;
+
+      case TGSI_FILE_INPUT:
+         emit_inputf(
+            func,
+            xmm,
+            reg->SrcRegister.Index,
+            swizzle );
+         break;
+
+      case TGSI_FILE_TEMPORARY:
+         emit_tempf(
+            func,
+            xmm,
+            reg->SrcRegister.Index,
+            swizzle );
+         break;
+
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+      emit_tempf(
+         func,
+         xmm,
+         TGSI_EXEC_TEMP_00000000_I,
+         TGSI_EXEC_TEMP_00000000_C );
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      emit_tempf(
+         func,
+         xmm,
+         TGSI_EXEC_TEMP_ONE_I,
+         TGSI_EXEC_TEMP_ONE_C );
+      break;
+
+   default:
+      assert( 0 );
+   }
+
+   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      emit_abs( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      emit_setsign( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      emit_neg( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+}
+
+#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
+   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
+
+/**
+ * Register store.
+ */
+
+static void
+emit_store(
+   struct x86_function *func,
+   unsigned xmm,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   unsigned chan_index )
+{
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_OUTPUT:
+      emit_output(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      emit_temps(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+
+   default:
+      assert( 0 );
+   }
+
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+}
+
+#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
+   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+
+/**
+ * High-level instruction translators.
+ */
+
+static void
+emit_kil(
+   struct x86_function *func,
+   const struct tgsi_full_src_register *reg )
+{
+   unsigned uniquemask;
+   unsigned registers[4];
+   unsigned nextregister = 0;
+   unsigned firstchan = ~0;
+   unsigned chan_index;
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      unsigned swizzle;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle(
+         reg,
+         chan_index );
+
+      /* check if the component has not been already tested */
+      if( !(uniquemask & (1 << swizzle)) ) {
+         uniquemask |= 1 << swizzle;
+
+         /* allocate register */
+         registers[chan_index] = nextregister;
+         emit_fetch(
+            func,
+            nextregister,
+            reg,
+            chan_index );
+         nextregister++;
+
+         /* mark the first channel used */
+         if( firstchan == ~0 ) {
+            firstchan = chan_index;
+         }
+      }
+   }
+
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+   x86_push(
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      if( uniquemask & (1 << chan_index) ) {
+         sse_cmpps(
+            func,
+            make_xmm( registers[chan_index] ),
+            get_temp(
+               TGSI_EXEC_TEMP_00000000_I,
+               TGSI_EXEC_TEMP_00000000_C ),
+            cc_LessThan );
+
+         if( chan_index == firstchan ) {
+            sse_pmovmskb(
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               make_xmm( registers[chan_index] ) );
+         }
+         else {
+            sse_pmovmskb(
+               func,
+               x86_make_reg( file_REG32, reg_DX ),
+               make_xmm( registers[chan_index] ) );
+            x86_or(
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               x86_make_reg( file_REG32, reg_DX ) );
+         }
+      }
+   }
+
+   x86_or(
+      func,
+      get_temp(
+         TGSI_EXEC_TEMP_KILMASK_I,
+         TGSI_EXEC_TEMP_KILMASK_C ),
+      x86_make_reg( file_REG32, reg_AX ) );
+
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+   x86_pop(
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+}
+
+static void
+emit_setcc(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst,
+   enum sse_cc cc )
+{
+   unsigned chan_index;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      FETCH( func, *inst, 0, 0, chan_index );
+      FETCH( func, *inst, 1, 1, chan_index );
+      sse_cmpps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 1 ),
+         cc );
+      sse_andps(
+         func,
+         make_xmm( 0 ),
+         get_temp(
+            TGSI_EXEC_TEMP_ONE_I,
+            TGSI_EXEC_TEMP_ONE_C ) );
+      STORE( func, *inst, 0, 0, chan_index );
+   }
+}
+
+static void
+emit_cmp(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst )
+{
+   unsigned chan_index;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      FETCH( func, *inst, 0, 0, chan_index );
+      FETCH( func, *inst, 1, 1, chan_index );
+      FETCH( func, *inst, 2, 2, chan_index );
+      sse_cmpps(
+         func,
+         make_xmm( 0 ),
+         get_temp(
+            TGSI_EXEC_TEMP_00000000_I,
+            TGSI_EXEC_TEMP_00000000_C ),
+         cc_LessThan );
+      sse_andps(
+         func,
+         make_xmm( 1 ),
+         make_xmm( 0 ) );
+      sse_andnps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 2 ) );
+      sse_orps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 1 ) );
+      STORE( func, *inst, 0, 0, chan_index );
+   }
+}
+
+static int
+emit_instruction(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst )
+{
+   unsigned chan_index;
+
+   switch( inst->Instruction.Opcode ) {
+   case TGSI_OPCODE_ARL:
+#if 0
+      /* XXX this isn't working properly (see glean vertProg1 test) */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_f2it( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+#else
+      return 0;
+#endif
+      break;
+
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         emit_tempf(
+            func,
+            0,
+            TGSI_EXEC_TEMP_ONE_I,
+            TGSI_EXEC_TEMP_ONE_C);
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+            STORE( func, *inst, 0, 0, CHAN_X );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+            STORE( func, *inst, 0, 0, CHAN_W );
+         }
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            sse_maxps(
+               func,
+               make_xmm( 0 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            STORE( func, *inst, 0, 0, CHAN_Y );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+            /* XMM[1] = SrcReg[0].yyyy */
+            FETCH( func, *inst, 1, 0, CHAN_Y );
+            /* XMM[1] = max(XMM[1], 0) */
+            sse_maxps(
+               func,
+               make_xmm( 1 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            /* XMM[2] = SrcReg[0].wwww */
+            FETCH( func, *inst, 2, 0, CHAN_W );
+            /* XMM[2] = min(XMM[2], 128.0) */
+            sse_minps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_128_I,
+                  TGSI_EXEC_TEMP_128_C ) );
+            /* XMM[2] = max(XMM[2], -128.0) */
+            sse_maxps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_MINUS_128_I,
+                  TGSI_EXEC_TEMP_MINUS_128_C ) );
+            emit_pow( func, 1, 2 );
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            sse_xorps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 2 ) );
+            sse_cmpps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 0 ),
+               cc_LessThanEqual );
+            sse_andps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 1 ) );
+            STORE( func, *inst, 2, 0, CHAN_Z );
+         }
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rcp( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rsqrt( func, 1, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 1, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_LOG:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_add( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP4:
+   /* TGSI_OPCODE_DOT4 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul(func, 1, 2 );
+      emit_add(func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_W );
+      FETCH( func, *inst, 2, 1, CHAN_W );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_tempf(
+            func,
+            0,
+            TGSI_EXEC_TEMP_ONE_I,
+            TGSI_EXEC_TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 1, 1, CHAN_Y );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         FETCH( func, *inst, 0, 0, CHAN_Z );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+         FETCH( func, *inst, 0, 1, CHAN_W );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         sse_minps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         sse_maxps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      emit_setcc( func, inst, cc_LessThan );
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      emit_setcc( func, inst, cc_NotLessThan );
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_sub( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_sub( func, 1, 2 );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CND0:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+   /* TGSI_OPCODE_DP2A */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_frc( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_flr( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+   /* TGSI_OPCODE_EX2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_ex2( func, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_lg2( func, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+   /* TGSI_OPCODE_POW */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_pow( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+   /* TGSI_OPCODE_XPD */
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( func, *inst, 1, 1, CHAN_Z );
+         FETCH( func, *inst, 3, 0, CHAN_Z );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 4, 1, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_MOV( func, 2, 0 );
+         emit_mul( func, 2, 1 );
+         emit_MOV( func, 5, 3 );
+         emit_mul( func, 5, 4 );
+         emit_sub( func, 2, 5 );
+         STORE( func, *inst, 2, 0, CHAN_X );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 2, 1, CHAN_X );
+         FETCH( func, *inst, 5, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         emit_mul( func, 3, 2 );
+         emit_mul( func, 1, 5 );
+         emit_sub( func, 3, 1 );
+         STORE( func, *inst, 3, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         emit_mul( func, 5, 4 );
+         emit_mul( func, 0, 2 );
+         emit_sub( func, 5, 0 );
+         STORE( func, *inst, 5, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TGSI_EXEC_TEMP_ONE_I,
+	    TGSI_EXEC_TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ABS:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_abs( func, 0) ;
+
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RCC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 1, CHAN_W );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_cos( func, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DDY:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_KIL:
+      emit_kil( func, &inst->FullSrcRegisters[0] );
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SGT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_sin( func, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SNE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_STR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TEX:
+      if (0) {
+	 /* Disable dummy texture code: 
+	  */
+	 emit_tempf(
+	    func,
+	    0,
+	    TGSI_EXEC_TEMP_ONE_I,
+	    TGSI_EXEC_TEMP_ONE_C );
+	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+	    STORE( func, *inst, 0, 0, chan_index );
+	 }
+      }
+      else {
+	 return 0;
+      }
+      break;
+
+   case TGSI_OPCODE_TXD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_X2D:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CAL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RET:
+      emit_ret( func );
+      break;
+
+   case TGSI_OPCODE_END:
+      break;
+
+   case TGSI_OPCODE_SSG:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CMP:
+      emit_cmp (func, inst);
+      break;
+
+   case TGSI_OPCODE_SCS:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_cos( func, 0 );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_sin( func, 0 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TGSI_EXEC_TEMP_00000000_I,
+	    TGSI_EXEC_TEMP_00000000_C );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+	 emit_tempf(
+	    func,
+	    0,
+	    TGSI_EXEC_TEMP_ONE_I,
+	    TGSI_EXEC_TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_TXB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NRM:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DIV:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DP2:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRK:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_IF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_REP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PUSHA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_POPA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_I2F:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NOT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_AND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_OR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_MOD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_XOR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SAD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CONT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      return 0;
+      break;
+
+   default:
+      return 0;
+   }
+   
+   return 1;
+}
+
+static void
+emit_declaration(
+   struct x86_function *func,
+   struct tgsi_full_declaration *decl )
+{
+   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+      unsigned first, last, mask;
+      unsigned i, j;
+
+      first = decl->DeclarationRange.First;
+      last = decl->DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      for( i = first; i <= last; i++ ) {
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( mask & (1 << j) ) {
+               switch( decl->Declaration.Interpolate ) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  emit_coef_a0( func, 0, i, j );
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_LINEAR:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_coef_a0( func, 4, i, j );
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_PERSPECTIVE:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+                  emit_coef_a0( func, 5, i, j );
+                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               default:
+                  assert( 0 );
+		  break;
+               }
+            }
+         }
+      }
+   }
+}
+
+static void aos_to_soa( struct x86_function *func, 
+                        uint arg_aos,
+                        uint arg_soa, 
+                        uint arg_num, 
+                        uint arg_stride )
+{
+   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
+   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
+   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
+   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
+   int inner_loop;
+
+
+   /* Save EBX */
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
+   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
+   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
+   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
+
+   /* do */
+   inner_loop = x86_get_label( func );
+   {
+      x86_push( func, aos_input );
+      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_add( func, aos_input, stride );
+      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+      x86_pop( func, aos_input );
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
+      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
+      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
+      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
+
+      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
+      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
+      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
+      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
+
+      /* Advance to next input */
+      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
+      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
+   }
+   /* while --num_inputs */
+   x86_dec( func, num_inputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, aos_input );
+}
+
+static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+   struct x86_reg soa_output;
+   struct x86_reg aos_output;
+   struct x86_reg num_outputs;
+   struct x86_reg temp;
+   int inner_loop;
+
+   soa_output = x86_make_reg( file_REG32, reg_AX );
+   aos_output = x86_make_reg( file_REG32, reg_BX );
+   num_outputs = x86_make_reg( file_REG32, reg_CX );
+   temp = x86_make_reg( file_REG32, reg_DX );
+
+   /* Save EBX */
+   x86_push( func, aos_output );
+
+   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
+   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
+   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+
+   /* do */
+   inner_loop = x86_get_label( func );
+   {
+      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
+      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
+      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
+      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
+
+      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
+      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
+      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
+      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
+
+      x86_mov( func, temp, x86_fn_arg( func, stride ) );
+      x86_push( func, aos_output );
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+      x86_add( func, aos_output, temp );
+      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_add( func, aos_output, temp );
+      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+      x86_pop( func, aos_output );
+
+      /* Advance to next output */
+      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
+      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
+   }
+   /* while --num_outputs */
+   x86_dec( func, num_outputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, aos_output );
+}
+
+/**
+ * Translate a TGSI vertex/fragment shader to SSE2 code.
+ * Slightly different things are done for vertex vs. fragment shaders.
+ *
+ * Note that fragment shaders are responsible for interpolating shader
+ * inputs. Because on x86 we have only 4 GP registers, and here we
+ * have 5 shader arguments (input, output, const, temp and coef), the
+ * code is split into two phases -- DECLARATION and INSTRUCTION phase.
+ * GP register holding the output argument is aliased with the coeff
+ * argument, as outputs are not needed in the DECLARATION phase.
+ *
+ * \param tokens  the TGSI input shader
+ * \param func  the output SSE code/function
+ * \param immediates  buffer to place immediates, later passed to SSE func
+ * \param return  1 for success, 0 if translation failed
+ */
+unsigned
+tgsi_emit_sse2(
+   const struct tgsi_token *tokens,
+   struct x86_function *func,
+   float (*immediates)[4],
+   boolean do_swizzles )
+{
+   struct tgsi_parse_context parse;
+   boolean instruction_phase = FALSE;
+   unsigned ok = 1;
+   uint num_immediates = 0;
+
+   func->csr = func->store;
+
+   tgsi_parse_init( &parse, tokens );
+
+   /* Can't just use EDI, EBX without save/restoring them:
+    */
+   x86_push(
+      func,
+      get_immediate_base() );
+
+   x86_push(
+      func,
+      get_temp_base() );
+
+
+   /*
+    * Different function args for vertex/fragment shaders:
+    */
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /* DECLARATION phase, do not load output argument. */
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      /* skipping outputs argument here */
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_coef_base(),
+         x86_fn_arg( func, 5 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 6 ) );
+   }
+   else {
+      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
+
+      if (do_swizzles)
+         aos_to_soa( func, 
+                     6,         /* aos_input */
+                     1,         /* machine->input */
+                     7,         /* num_inputs */
+                     8 );       /* input_stride */
+
+      x86_mov(
+         func,
+         get_input_base(),
+         x86_fn_arg( func, 1 ) );
+      x86_mov(
+         func,
+         get_output_base(),
+         x86_fn_arg( func, 2 ) );
+      x86_mov(
+         func,
+         get_const_base(),
+         x86_fn_arg( func, 3 ) );
+      x86_mov(
+         func,
+         get_temp_base(),
+         x86_fn_arg( func, 4 ) );
+      x86_mov(
+         func,
+         get_immediate_base(),
+         x86_fn_arg( func, 5 ) );
+   }
+
+   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            emit_declaration(
+               func,
+               &parse.FullToken.FullDeclaration );
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            if( !instruction_phase ) {
+               /* INSTRUCTION phase, overwrite coeff with output. */
+               instruction_phase = TRUE;
+               x86_mov(
+                  func,
+                  get_output_base(),
+                  x86_fn_arg( func, 2 ) );
+            }
+         }
+
+         ok = emit_instruction(
+            func,
+            &parse.FullToken.FullInstruction );
+
+	 if (!ok) {
+	    debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
+	 }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* simply copy the immediate values into the next immediates[] slot */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            uint i;
+            assert(size <= 4);
+            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
+            for( i = 0; i < size; i++ ) {
+               immediates[num_immediates][i] =
+		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+            }
+#if 0
+            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
+                   num_immediates,
+                   immediates[num_immediates][0],
+                   immediates[num_immediates][1],
+                   immediates[num_immediates][2],
+                   immediates[num_immediates][3]);
+#endif
+            num_immediates++;
+         }
+         break;
+
+      default:
+	 ok = 0;
+         assert( 0 );
+      }
+   }
+
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
+      if (do_swizzles)
+         soa_to_aos( func, 9, 2, 10, 11 );
+   }
+
+   /* Can't just use EBX, EDI without save/restoring them:
+    */
+   x86_pop(
+      func,
+      get_temp_base() );
+
+   x86_pop(
+      func,
+      get_immediate_base() );
+
+   emit_ret( func );
+
+   tgsi_parse_free( &parse );
+
+   return ok;
+}
+
+#endif /* PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
new file mode 100644
index 0000000000..af838b2a25
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
@@ -0,0 +1,49 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_SSE2_H
+#define TGSI_SSE2_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+struct tgsi_token;
+struct x86_function;
+
+unsigned
+tgsi_emit_sse2(
+   const struct tgsi_token *tokens,
+   struct x86_function *function,
+   float (*immediates)[4],
+   boolean do_swizzles );
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_SSE2_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
new file mode 100644
index 0000000000..35cb3055bb
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -0,0 +1,1221 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_debug.h"
+#include "tgsi_text.h"
+#include "tgsi_build.h"
+#include "tgsi_parse.h"
+#include "tgsi_sanity.h"
+#include "tgsi_util.h"
+
+static boolean is_alpha_underscore( const char *cur )
+{
+   return
+      (*cur >= 'a' && *cur <= 'z') ||
+      (*cur >= 'A' && *cur <= 'Z') ||
+      *cur == '_';
+}
+
+static boolean is_digit( const char *cur )
+{
+   return *cur >= '0' && *cur <= '9';
+}
+
+static boolean is_digit_alpha_underscore( const char *cur )
+{
+   return is_digit( cur ) || is_alpha_underscore( cur );
+}
+
+static boolean str_match_no_case( const char **pcur, const char *str )
+{
+   const char *cur = *pcur;
+
+   while (*str != '\0' && *str == toupper( *cur )) {
+      str++;
+      cur++;
+   }
+   if (*str == '\0') {
+      *pcur = cur;
+      return TRUE;
+   }
+   return FALSE;
+}
+
+/* Eat zero or more whitespaces.
+ */
+static void eat_opt_white( const char **pcur )
+{
+   while (**pcur == ' ' || **pcur == '\t' || **pcur == '\n')
+      (*pcur)++;
+}
+
+/* Eat one or more whitespaces.
+ * Return TRUE if at least one whitespace eaten.
+ */
+static boolean eat_white( const char **pcur )
+{
+   const char *cur = *pcur;
+
+   eat_opt_white( pcur );
+   return *pcur > cur;
+}
+
+/* Parse unsigned integer.
+ * No checks for overflow.
+ */
+static boolean parse_uint( const char **pcur, uint *val )
+{
+   const char *cur = *pcur;
+
+   if (is_digit( cur )) {
+      *val = *cur++ - '0';
+      while (is_digit( cur ))
+         *val = *val * 10 + *cur++ - '0';
+      *pcur = cur;
+      return TRUE;
+   }
+   return FALSE;
+}
+
+/* Parse floating point.
+ */
+static boolean parse_float( const char **pcur, float *val )
+{
+   const char *cur = *pcur;
+   boolean integral_part = FALSE;
+   boolean fractional_part = FALSE;
+
+   *val = (float) atof( cur );
+
+   if (*cur == '-' || *cur == '+')
+      cur++;
+   if (is_digit( cur )) {
+      cur++;
+      integral_part = TRUE;
+      while (is_digit( cur ))
+         cur++;
+   }
+   if (*cur == '.') {
+      cur++;
+      if (is_digit( cur )) {
+         cur++;
+         fractional_part = TRUE;
+         while (is_digit( cur ))
+            cur++;
+      }
+   }
+   if (!integral_part && !fractional_part)
+      return FALSE;
+   if (toupper( *cur ) == 'E') {
+      cur++;
+      if (*cur == '-' || *cur == '+')
+         cur++;
+      if (is_digit( cur )) {
+         cur++;
+         while (is_digit( cur ))
+            cur++;
+      }
+      else
+         return FALSE;
+   }
+   *pcur = cur;
+   return TRUE;
+}
+
+struct translate_ctx
+{
+   const char *text;
+   const char *cur;
+   struct tgsi_token *tokens;
+   struct tgsi_token *tokens_cur;
+   struct tgsi_token *tokens_end;
+   struct tgsi_header *header;
+};
+
+static void report_error( struct translate_ctx *ctx, const char *msg )
+{
+   debug_printf( "\nError: %s", msg );
+}
+
+/* Parse shader header.
+ * Return TRUE for one of the following headers.
+ *    FRAG1.1
+ *    GEOM1.1
+ *    VERT1.1
+ */
+static boolean parse_header( struct translate_ctx *ctx )
+{
+   uint processor;
+
+   if (str_match_no_case( &ctx->cur, "FRAG1.1" ))
+      processor = TGSI_PROCESSOR_FRAGMENT;
+   else if (str_match_no_case( &ctx->cur, "VERT1.1" ))
+      processor = TGSI_PROCESSOR_VERTEX;
+   else if (str_match_no_case( &ctx->cur, "GEOM1.1" ))
+      processor = TGSI_PROCESSOR_GEOMETRY;
+   else {
+      report_error( ctx, "Unknown header" );
+      return FALSE;
+   }
+
+   if (ctx->tokens_cur >= ctx->tokens_end)
+      return FALSE;
+   *(struct tgsi_version *) ctx->tokens_cur++ = tgsi_build_version();
+
+   if (ctx->tokens_cur >= ctx->tokens_end)
+      return FALSE;
+   ctx->header = (struct tgsi_header *) ctx->tokens_cur++;
+   *ctx->header = tgsi_build_header();
+
+   if (ctx->tokens_cur >= ctx->tokens_end)
+      return FALSE;
+   *(struct tgsi_processor *) ctx->tokens_cur++ = tgsi_build_processor( processor, ctx->header );
+
+   return TRUE;
+}
+
+static boolean parse_label( struct translate_ctx *ctx, uint *val )
+{
+   const char *cur = ctx->cur;
+
+   if (parse_uint( &cur, val )) {
+      eat_opt_white( &cur );
+      if (*cur == ':') {
+         cur++;
+         ctx->cur = cur;
+         return TRUE;
+      }
+   }
+   return FALSE;
+}
+
+static const char *file_names[TGSI_FILE_COUNT] =
+{
+   "NULL",
+   "CONST",
+   "IN",
+   "OUT",
+   "TEMP",
+   "SAMP",
+   "ADDR",
+   "IMM"
+};
+
+static boolean
+parse_file( const char **pcur, uint *file )
+{
+   uint i;
+
+   for (i = 0; i < TGSI_FILE_COUNT; i++) {
+      const char *cur = *pcur;
+
+      if (str_match_no_case( &cur, file_names[i] )) {
+         if (!is_digit_alpha_underscore( cur )) {
+            *pcur = cur;
+            *file = i;
+            return TRUE;
+         }
+      }
+   }
+   return FALSE;
+}
+
+static boolean
+parse_opt_writemask(
+   struct translate_ctx *ctx,
+   uint *writemask )
+{
+   const char *cur;
+
+   cur = ctx->cur;
+   eat_opt_white( &cur );
+   if (*cur == '.') {
+      cur++;
+      *writemask = TGSI_WRITEMASK_NONE;
+      eat_opt_white( &cur );
+      if (toupper( *cur ) == 'X') {
+         cur++;
+         *writemask |= TGSI_WRITEMASK_X;
+      }
+      if (toupper( *cur ) == 'Y') {
+         cur++;
+         *writemask |= TGSI_WRITEMASK_Y;
+      }
+      if (toupper( *cur ) == 'Z') {
+         cur++;
+         *writemask |= TGSI_WRITEMASK_Z;
+      }
+      if (toupper( *cur ) == 'W') {
+         cur++;
+         *writemask |= TGSI_WRITEMASK_W;
+      }
+
+      if (*writemask == TGSI_WRITEMASK_NONE) {
+         report_error( ctx, "Writemask expected" );
+         return FALSE;
+      }
+
+      ctx->cur = cur;
+   }
+   else {
+      *writemask = TGSI_WRITEMASK_XYZW;
+   }
+   return TRUE;
+}
+
+/* <register_file_bracket> ::= <file> `['
+ */
+static boolean
+parse_register_file_bracket(
+   struct translate_ctx *ctx,
+   uint *file )
+{
+   if (!parse_file( &ctx->cur, file )) {
+      report_error( ctx, "Unknown register file" );
+      return FALSE;
+   }
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '[') {
+      report_error( ctx, "Expected `['" );
+      return FALSE;
+   }
+   ctx->cur++;
+   return TRUE;
+}
+
+/* <register_file_bracket_index> ::= <register_file_bracket> <uint>
+ */
+static boolean
+parse_register_file_bracket_index(
+   struct translate_ctx *ctx,
+   uint *file,
+   int *index )
+{
+   uint uindex;
+
+   if (!parse_register_file_bracket( ctx, file ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   if (!parse_uint( &ctx->cur, &uindex )) {
+      report_error( ctx, "Expected literal unsigned integer" );
+      return FALSE;
+   }
+   *index = (int) uindex;
+   return TRUE;
+}
+
+/* Parse destination register operand.
+ *    <register_dst> ::= <register_file_bracket_index> `]'
+ */
+static boolean
+parse_register_dst(
+   struct translate_ctx *ctx,
+   uint *file,
+   int *index )
+{
+   if (!parse_register_file_bracket_index( ctx, file, index ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != ']') {
+      report_error( ctx, "Expected `]'" );
+      return FALSE;
+   }
+   ctx->cur++;
+   return TRUE;
+}
+
+/* Parse source register operand.
+ *    <register_src> ::= <register_file_bracket_index> `]' |
+ *                       <register_file_bracket> <register_dst> `]' |
+ *                       <register_file_bracket> <register_dst> `+' <uint> `]' |
+ *                       <register_file_bracket> <register_dst> `-' <uint> `]'
+ */
+static boolean
+parse_register_src(
+   struct translate_ctx *ctx,
+   uint *file,
+   int *index,
+   uint *ind_file,
+   int *ind_index )
+{
+   const char *cur;
+   uint uindex;
+
+   if (!parse_register_file_bracket( ctx, file ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   cur = ctx->cur;
+   if (parse_file( &cur, ind_file )) {
+      if (!parse_register_dst( ctx, ind_file, ind_index ))
+         return FALSE;
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur == '+' || *ctx->cur == '-') {
+         boolean negate;
+
+         negate = *ctx->cur == '-';
+         ctx->cur++;
+         eat_opt_white( &ctx->cur );
+         if (!parse_uint( &ctx->cur, &uindex )) {
+            report_error( ctx, "Expected literal unsigned integer" );
+            return FALSE;
+         }
+         if (negate)
+            *index = -(int) uindex;
+         else
+            *index = (int) uindex;
+      }
+      else {
+         *index = 0;
+      }
+   }
+   else {
+      if (!parse_uint( &ctx->cur, &uindex )) {
+         report_error( ctx, "Expected literal unsigned integer" );
+         return FALSE;
+      }
+      *index = (int) uindex;
+      *ind_file = TGSI_FILE_NULL;
+      *ind_index = 0;
+   }
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != ']') {
+      report_error( ctx, "Expected `]'" );
+      return FALSE;
+   }
+   ctx->cur++;
+   return TRUE;
+}
+
+/* Parse register declaration.
+ *    <register_dcl> ::= <register_file_bracket_index> `]' |
+ *                       <register_file_bracket_index> `..' <index> `]'
+ */
+static boolean
+parse_register_dcl(
+   struct translate_ctx *ctx,
+   uint *file,
+   int *first,
+   int *last )
+{
+   if (!parse_register_file_bracket_index( ctx, file, first ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   if (ctx->cur[0] == '.' && ctx->cur[1] == '.') {
+      uint uindex;
+
+      ctx->cur += 2;
+      eat_opt_white( &ctx->cur );
+      if (!parse_uint( &ctx->cur, &uindex )) {
+         report_error( ctx, "Expected literal integer" );
+         return FALSE;
+      }
+      *last = (int) uindex;
+      eat_opt_white( &ctx->cur );
+   }
+   else {
+      *last = *first;
+   }
+   if (*ctx->cur != ']') {
+      report_error( ctx, "Expected `]' or `..'" );
+      return FALSE;
+   }
+   ctx->cur++;
+   return TRUE;
+}
+
+static const char *modulate_names[TGSI_MODULATE_COUNT] =
+{
+   "_1X",
+   "_2X",
+   "_4X",
+   "_8X",
+   "_D2",
+   "_D4",
+   "_D8"
+};
+
+static boolean
+parse_dst_operand(
+   struct translate_ctx *ctx,
+   struct tgsi_full_dst_register *dst )
+{
+   uint file;
+   int index;
+   uint writemask;
+   const char *cur;
+
+   if (!parse_register_dst( ctx, &file, &index ))
+      return FALSE;
+
+   cur = ctx->cur;
+   eat_opt_white( &cur );
+   if (*cur == '_') {
+      uint i;
+
+      for (i = 0; i < TGSI_MODULATE_COUNT; i++) {
+         if (str_match_no_case( &cur, modulate_names[i] )) {
+            if (!is_digit_alpha_underscore( cur )) {
+               dst->DstRegisterExtModulate.Modulate = i;
+               ctx->cur = cur;
+               break;
+            }
+         }
+      }
+   }
+
+   if (!parse_opt_writemask( ctx, &writemask ))
+      return FALSE;
+
+   dst->DstRegister.File = file;
+   dst->DstRegister.Index = index;
+   dst->DstRegister.WriteMask = writemask;
+   return TRUE;
+}
+
+static boolean
+parse_optional_swizzle(
+   struct translate_ctx *ctx,
+   uint swizzle[4],
+   boolean *parsed_swizzle,
+   boolean *parsed_extswizzle )
+{
+   const char *cur = ctx->cur;
+
+   *parsed_swizzle = FALSE;
+   *parsed_extswizzle = FALSE;
+
+   eat_opt_white( &cur );
+   if (*cur == '.') {
+      uint i;
+
+      cur++;
+      eat_opt_white( &cur );
+      for (i = 0; i < 4; i++) {
+         if (toupper( *cur ) == 'X')
+            swizzle[i] = TGSI_SWIZZLE_X;
+         else if (toupper( *cur ) == 'Y')
+            swizzle[i] = TGSI_SWIZZLE_Y;
+         else if (toupper( *cur ) == 'Z')
+            swizzle[i] = TGSI_SWIZZLE_Z;
+         else if (toupper( *cur ) == 'W')
+            swizzle[i] = TGSI_SWIZZLE_W;
+         else {
+            if (*cur == '0')
+               swizzle[i] = TGSI_EXTSWIZZLE_ZERO;
+            else if (*cur == '1')
+               swizzle[i] = TGSI_EXTSWIZZLE_ONE;
+            else {
+               report_error( ctx, "Expected register swizzle component `x', `y', `z', `w', `0' or `1'" );
+               return FALSE;
+            }
+            *parsed_extswizzle = TRUE;
+         }
+         cur++;
+      }
+      *parsed_swizzle = TRUE;
+      ctx->cur = cur;
+   }
+   return TRUE;
+}
+
+static boolean
+parse_src_operand(
+   struct translate_ctx *ctx,
+   struct tgsi_full_src_register *src )
+{
+   const char *cur;
+   float value;
+   uint file;
+   int index;
+   uint ind_file;
+   int ind_index;
+   uint swizzle[4];
+   boolean parsed_swizzle;
+   boolean parsed_extswizzle;
+
+   if (*ctx->cur == '-') {
+      cur = ctx->cur;
+      cur++;
+      eat_opt_white( &cur );
+      if (*cur == '(') {
+         cur++;
+         src->SrcRegisterExtMod.Negate = 1;
+         eat_opt_white( &cur );
+         ctx->cur = cur;
+      }
+   }
+
+   if (*ctx->cur == '|') {
+      ctx->cur++;
+      eat_opt_white( &ctx->cur );
+      src->SrcRegisterExtMod.Absolute = 1;
+   }
+
+   if (*ctx->cur == '-') {
+      ctx->cur++;
+      eat_opt_white( &ctx->cur );
+      src->SrcRegister.Negate = 1;
+   }
+
+   cur = ctx->cur;
+   if (parse_float( &cur, &value )) {
+      if (value == 2.0f) {
+         eat_opt_white( &cur );
+         if (*cur != '*') {
+            report_error( ctx, "Expected `*'" );
+            return FALSE;
+         }
+         cur++;
+         if (*cur != '(') {
+            report_error( ctx, "Expected `('" );
+            return FALSE;
+         }
+         cur++;
+         src->SrcRegisterExtMod.Scale2X = 1;
+         eat_opt_white( &cur );
+         ctx->cur = cur;
+      }
+   }
+
+   if (*ctx->cur == '(') {
+      ctx->cur++;
+      eat_opt_white( &ctx->cur );
+      src->SrcRegisterExtMod.Bias = 1;
+   }
+
+   cur = ctx->cur;
+   if (parse_float( &cur, &value )) {
+      if (value == 1.0f) {
+         eat_opt_white( &cur );
+         if (*cur != '-') {
+            report_error( ctx, "Expected `-'" );
+            return FALSE;
+         }
+         cur++;
+         if (*cur != '(') {
+            report_error( ctx, "Expected `('" );
+            return FALSE;
+         }
+         cur++;
+         src->SrcRegisterExtMod.Complement = 1;
+         eat_opt_white( &cur );
+         ctx->cur = cur;
+      }
+   }
+
+   if (!parse_register_src( ctx, &file, &index, &ind_file, &ind_index ))
+      return FALSE;
+   src->SrcRegister.File = file;
+   src->SrcRegister.Index = index;
+   if (ind_file != TGSI_FILE_NULL) {
+      src->SrcRegister.Indirect = 1;
+      src->SrcRegisterInd.File = ind_file;
+      src->SrcRegisterInd.Index = ind_index;
+   }
+
+   /* Parse optional swizzle.
+    */
+   if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle, &parsed_extswizzle )) {
+      if (parsed_extswizzle) {
+         assert( parsed_swizzle );
+
+         src->SrcRegisterExtSwz.ExtSwizzleX = swizzle[0];
+         src->SrcRegisterExtSwz.ExtSwizzleY = swizzle[1];
+         src->SrcRegisterExtSwz.ExtSwizzleZ = swizzle[2];
+         src->SrcRegisterExtSwz.ExtSwizzleW = swizzle[3];
+      }
+      else if (parsed_swizzle) {
+         src->SrcRegister.SwizzleX = swizzle[0];
+         src->SrcRegister.SwizzleY = swizzle[1];
+         src->SrcRegister.SwizzleZ = swizzle[2];
+         src->SrcRegister.SwizzleW = swizzle[3];
+      }
+   }
+
+   if (src->SrcRegisterExtMod.Complement) {
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur != ')') {
+         report_error( ctx, "Expected `)'" );
+         return FALSE;
+      }
+      ctx->cur++;
+   }
+
+   if (src->SrcRegisterExtMod.Bias) {
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur != ')') {
+         report_error( ctx, "Expected `)'" );
+         return FALSE;
+      }
+      ctx->cur++;
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur != '-') {
+         report_error( ctx, "Expected `-'" );
+         return FALSE;
+      }
+      ctx->cur++;
+      eat_opt_white( &ctx->cur );
+      if (!parse_float( &ctx->cur, &value )) {
+         report_error( ctx, "Expected literal floating point" );
+         return FALSE;
+      }
+      if (value != 0.5f) {
+         report_error( ctx, "Expected 0.5" );
+         return FALSE;
+      }
+   }
+
+   if (src->SrcRegisterExtMod.Scale2X) {
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur != ')') {
+         report_error( ctx, "Expected `)'" );
+         return FALSE;
+      }
+      ctx->cur++;
+   }
+
+   if (src->SrcRegisterExtMod.Absolute) {
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur != '|') {
+         report_error( ctx, "Expected `|'" );
+         return FALSE;
+      }
+      ctx->cur++;
+   }
+
+   if (src->SrcRegisterExtMod.Negate) {
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur != ')') {
+         report_error( ctx, "Expected `)'" );
+         return FALSE;
+      }
+      ctx->cur++;
+   }
+
+   return TRUE;
+}
+
+struct opcode_info
+{
+   uint num_dst;
+   uint num_src;
+   uint is_tex;
+   uint is_branch;
+   const char *mnemonic;
+};
+
+static const struct opcode_info opcode_info[TGSI_OPCODE_LAST] =
+{
+   { 1, 1, 0, 0, "ARL" },
+   { 1, 1, 0, 0, "MOV" },
+   { 1, 1, 0, 0, "LIT" },
+   { 1, 1, 0, 0, "RCP" },
+   { 1, 1, 0, 0, "RSQ" },
+   { 1, 1, 0, 0, "EXP" },
+   { 1, 1, 0, 0, "LOG" },
+   { 1, 2, 0, 0, "MUL" },
+   { 1, 2, 0, 0, "ADD" },
+   { 1, 2, 0, 0, "DP3" },
+   { 1, 2, 0, 0, "DP4" },
+   { 1, 2, 0, 0, "DST" },
+   { 1, 2, 0, 0, "MIN" },
+   { 1, 2, 0, 0, "MAX" },
+   { 1, 2, 0, 0, "SLT" },
+   { 1, 2, 0, 0, "SGE" },
+   { 1, 3, 0, 0, "MAD" },
+   { 1, 2, 0, 0, "SUB" },
+   { 1, 3, 0, 0, "LERP" },
+   { 1, 3, 0, 0, "CND" },
+   { 1, 3, 0, 0, "CND0" },
+   { 1, 3, 0, 0, "DOT2ADD" },
+   { 1, 2, 0, 0, "INDEX" },
+   { 1, 1, 0, 0, "NEGATE" },
+   { 1, 1, 0, 0, "FRAC" },
+   { 1, 3, 0, 0, "CLAMP" },
+   { 1, 1, 0, 0, "FLOOR" },
+   { 1, 1, 0, 0, "ROUND" },
+   { 1, 1, 0, 0, "EXPBASE2" },
+   { 1, 1, 0, 0, "LOGBASE2" },
+   { 1, 2, 0, 0, "POWER" },
+   { 1, 2, 0, 0, "CROSSPRODUCT" },
+   { 1, 2, 0, 0, "MULTIPLYMATRIX" },
+   { 1, 1, 0, 0, "ABS" },
+   { 1, 1, 0, 0, "RCC" },
+   { 1, 2, 0, 0, "DPH" },
+   { 1, 1, 0, 0, "COS" },
+   { 1, 1, 0, 0, "DDX" },
+   { 1, 1, 0, 0, "DDY" },
+   { 0, 1, 0, 0, "KILP" },
+   { 1, 1, 0, 0, "PK2H" },
+   { 1, 1, 0, 0, "PK2US" },
+   { 1, 1, 0, 0, "PK4B" },
+   { 1, 1, 0, 0, "PK4UB" },
+   { 1, 2, 0, 0, "RFL" },
+   { 1, 2, 0, 0, "SEQ" },
+   { 1, 2, 0, 0, "SFL" },
+   { 1, 2, 0, 0, "SGT" },
+   { 1, 1, 0, 0, "SIN" },
+   { 1, 2, 0, 0, "SLE" },
+   { 1, 2, 0, 0, "SNE" },
+   { 1, 2, 0, 0, "STR" },
+   { 1, 2, 1, 0, "TEX" },
+   { 1, 4, 1, 0, "TXD" },
+   { 1, 2, 1, 0, "TXP" },
+   { 1, 1, 0, 0, "UP2H" },
+   { 1, 1, 0, 0, "UP2US" },
+   { 1, 1, 0, 0, "UP4B" },
+   { 1, 1, 0, 0, "UP4UB" },
+   { 1, 3, 0, 0, "X2D" },
+   { 1, 1, 0, 0, "ARA" },
+   { 1, 1, 0, 0, "ARR" },
+   { 0, 1, 0, 0, "BRA" },
+   { 0, 0, 0, 1, "CAL" },
+   { 0, 0, 0, 0, "RET" },
+   { 1, 1, 0, 0, "SSG" },
+   { 1, 3, 0, 0, "CMP" },
+   { 1, 1, 0, 0, "SCS" },
+   { 1, 2, 1, 0, "TXB" },
+   { 1, 1, 0, 0, "NRM" },
+   { 1, 2, 0, 0, "DIV" },
+   { 1, 2, 0, 0, "DP2" },
+   { 1, 2, 1, 0, "TXL" },
+   { 0, 0, 0, 0, "BRK" },
+   { 0, 1, 0, 1, "IF" },
+   { 0, 0, 0, 0, "LOOP" },
+   { 0, 1, 0, 0, "REP" },
+   { 0, 0, 0, 1, "ELSE" },
+   { 0, 0, 0, 0, "ENDIF" },
+   { 0, 0, 0, 0, "ENDLOOP" },
+   { 0, 0, 0, 0, "ENDREP" },
+   { 0, 1, 0, 0, "PUSHA" },
+   { 1, 0, 0, 0, "POPA" },
+   { 1, 1, 0, 0, "CEIL" },
+   { 1, 1, 0, 0, "I2F" },
+   { 1, 1, 0, 0, "NOT" },
+   { 1, 1, 0, 0, "TRUNC" },
+   { 1, 2, 0, 0, "SHL" },
+   { 1, 2, 0, 0, "SHR" },
+   { 1, 2, 0, 0, "AND" },
+   { 1, 2, 0, 0, "OR" },
+   { 1, 2, 0, 0, "MOD" },
+   { 1, 2, 0, 0, "XOR" },
+   { 1, 3, 0, 0, "SAD" },
+   { 1, 2, 1, 0, "TXF" },
+   { 1, 2, 1, 0, "TXQ" },
+   { 0, 0, 0, 0, "CONT" },
+   { 0, 0, 0, 0, "EMIT" },
+   { 0, 0, 0, 0, "ENDPRIM" },
+   { 0, 0, 0, 1, "BGNLOOP2" },
+   { 0, 0, 0, 0, "BGNSUB" },
+   { 0, 0, 0, 1, "ENDLOOP2" },
+   { 0, 0, 0, 0, "ENDSUB" },
+   { 1, 1, 0, 0, "NOISE1" },
+   { 1, 1, 0, 0, "NOISE2" },
+   { 1, 1, 0, 0, "NOISE3" },
+   { 1, 1, 0, 0, "NOISE4" },
+   { 0, 0, 0, 0, "NOP" },
+   { 1, 2, 0, 0, "M4X3" },
+   { 1, 2, 0, 0, "M3X4" },
+   { 1, 2, 0, 0, "M3X3" },
+   { 1, 2, 0, 0, "M3X2" },
+   { 1, 1, 0, 0, "NRM4" },
+   { 0, 1, 0, 0, "CALLNZ" },
+   { 0, 1, 0, 0, "IFC" },
+   { 0, 1, 0, 0, "BREAKC" },
+   { 0, 0, 0, 0, "KIL" },
+   { 0, 0, 0, 0, "END" },
+   { 1, 1, 0, 0, "SWZ" }
+};
+
+static const char *texture_names[TGSI_TEXTURE_COUNT] =
+{
+   "UNKNOWN",
+   "1D",
+   "2D",
+   "3D",
+   "CUBE",
+   "RECT",
+   "SHADOW1D",
+   "SHADOW2D",
+   "SHADOWRECT"
+};
+
+static boolean
+parse_instruction(
+   struct translate_ctx *ctx,
+   boolean has_label )
+{
+   uint i;
+   uint saturate = TGSI_SAT_NONE;
+   const struct opcode_info *info;
+   struct tgsi_full_instruction inst;
+   uint advance;
+
+   /* Parse instruction name.
+    */
+   eat_opt_white( &ctx->cur );
+   for (i = 0; i < TGSI_OPCODE_LAST; i++) {
+      const char *cur = ctx->cur;
+
+      info = &opcode_info[i];
+      if (str_match_no_case( &cur, info->mnemonic )) {
+         if (str_match_no_case( &cur, "_SATNV" ))
+            saturate = TGSI_SAT_MINUS_PLUS_ONE;
+         else if (str_match_no_case( &cur, "_SAT" ))
+            saturate = TGSI_SAT_ZERO_ONE;
+
+         if (info->num_dst + info->num_src + info->is_tex == 0) {
+            if (!is_digit_alpha_underscore( cur )) {
+               ctx->cur = cur;
+               break;
+            }
+         }
+         else if (*cur == '\0' || eat_white( &cur )) {
+            ctx->cur = cur;
+            break;
+         }
+      }
+   }
+   if (i == TGSI_OPCODE_LAST) {
+      if (has_label)
+         report_error( ctx, "Unknown opcode" );
+      else
+         report_error( ctx, "Expected `DCL', `IMM' or a label" );
+      return FALSE;
+   }
+
+   inst = tgsi_default_full_instruction();
+   inst.Instruction.Opcode = i;
+   inst.Instruction.Saturate = saturate;
+   inst.Instruction.NumDstRegs = info->num_dst;
+   inst.Instruction.NumSrcRegs = info->num_src;
+
+   /* Parse instruction operands.
+    */
+   for (i = 0; i < info->num_dst + info->num_src + info->is_tex; i++) {
+      if (i > 0) {
+         eat_opt_white( &ctx->cur );
+         if (*ctx->cur != ',') {
+            report_error( ctx, "Expected `,'" );
+            return FALSE;
+         }
+         ctx->cur++;
+         eat_opt_white( &ctx->cur );
+      }
+
+      if (i < info->num_dst) {
+         if (!parse_dst_operand( ctx, &inst.FullDstRegisters[i] ))
+            return FALSE;
+      }
+      else if (i < info->num_dst + info->num_src) {
+         if (!parse_src_operand( ctx, &inst.FullSrcRegisters[i - info->num_dst] ))
+            return FALSE;
+      }
+      else {
+         uint j;
+
+         for (j = 0; j < TGSI_TEXTURE_COUNT; j++) {
+            if (str_match_no_case( &ctx->cur, texture_names[j] )) {
+               if (!is_digit_alpha_underscore( ctx->cur )) {
+                  inst.InstructionExtTexture.Texture = j;
+                  break;
+               }
+            }
+         }
+         if (j == TGSI_TEXTURE_COUNT) {
+            report_error( ctx, "Expected texture target" );
+            return FALSE;
+         }
+      }
+   }
+
+   if (info->is_branch) {
+      uint target;
+
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur != ':') {
+         report_error( ctx, "Expected `:'" );
+         return FALSE;
+      }
+      ctx->cur++;
+      eat_opt_white( &ctx->cur );
+      if (!parse_uint( &ctx->cur, &target )) {
+         report_error( ctx, "Expected a label" );
+         return FALSE;
+      }
+      inst.InstructionExtLabel.Label = target;
+   }
+
+   advance = tgsi_build_full_instruction(
+      &inst,
+      ctx->tokens_cur,
+      ctx->header,
+      (uint) (ctx->tokens_end - ctx->tokens_cur) );
+   if (advance == 0)
+      return FALSE;
+   ctx->tokens_cur += advance;
+
+   return TRUE;
+}
+
+static const char *semantic_names[TGSI_SEMANTIC_COUNT] =
+{
+   "POSITION",
+   "COLOR",
+   "BCOLOR",
+   "FOG",
+   "PSIZE",
+   "GENERIC",
+   "NORMAL"
+};
+
+static const char *interpolate_names[TGSI_INTERPOLATE_COUNT] =
+{
+   "CONSTANT",
+   "LINEAR",
+   "PERSPECTIVE"
+};
+
+static boolean parse_declaration( struct translate_ctx *ctx )
+{
+   struct tgsi_full_declaration decl;
+   uint file;
+   int first;
+   int last;
+   uint writemask;
+   const char *cur;
+   uint advance;
+
+   if (!eat_white( &ctx->cur )) {
+      report_error( ctx, "Syntax error" );
+      return FALSE;
+   }
+   if (!parse_register_dcl( ctx, &file, &first, &last ))
+      return FALSE;
+   if (!parse_opt_writemask( ctx, &writemask ))
+      return FALSE;
+
+   decl = tgsi_default_full_declaration();
+   decl.Declaration.File = file;
+   decl.Declaration.UsageMask = writemask;
+   decl.DeclarationRange.First = first;
+   decl.DeclarationRange.Last = last;
+
+   cur = ctx->cur;
+   eat_opt_white( &cur );
+   if (*cur == ',') {
+      uint i;
+
+      cur++;
+      eat_opt_white( &cur );
+      for (i = 0; i < TGSI_SEMANTIC_COUNT; i++) {
+         if (str_match_no_case( &cur, semantic_names[i] )) {
+            const char *cur2 = cur;
+            uint index;
+
+            if (is_digit_alpha_underscore( cur ))
+               continue;
+            eat_opt_white( &cur2 );
+            if (*cur2 == '[') {
+               cur2++;
+               eat_opt_white( &cur2 );
+               if (!parse_uint( &cur2, &index )) {
+                  report_error( ctx, "Expected literal integer" );
+                  return FALSE;
+               }
+               eat_opt_white( &cur2 );
+               if (*cur2 != ']') {
+                  report_error( ctx, "Expected `]'" );
+                  return FALSE;
+               }
+               cur2++;
+
+               decl.Semantic.SemanticIndex = index;
+
+               cur = cur2;
+            }
+
+            decl.Declaration.Semantic = 1;
+            decl.Semantic.SemanticName = i;
+
+            ctx->cur = cur;
+            break;
+         }
+      }
+   }
+
+   cur = ctx->cur;
+   eat_opt_white( &cur );
+   if (*cur == ',') {
+      uint i;
+
+      cur++;
+      eat_opt_white( &cur );
+      for (i = 0; i < TGSI_INTERPOLATE_COUNT; i++) {
+         if (str_match_no_case( &cur, interpolate_names[i] )) {
+            if (is_digit_alpha_underscore( cur ))
+               continue;
+            decl.Declaration.Interpolate = i;
+
+            ctx->cur = cur;
+            break;
+         }
+      }
+      if (i == TGSI_INTERPOLATE_COUNT) {
+         report_error( ctx, "Expected semantic or interpolate attribute" );
+         return FALSE;
+      }
+   }
+
+   advance = tgsi_build_full_declaration(
+      &decl,
+      ctx->tokens_cur,
+      ctx->header,
+      (uint) (ctx->tokens_end - ctx->tokens_cur) );
+   if (advance == 0)
+      return FALSE;
+   ctx->tokens_cur += advance;
+
+   return TRUE;
+}
+
+static boolean parse_immediate( struct translate_ctx *ctx )
+{
+   struct tgsi_full_immediate imm;
+   uint i;
+   float values[4];
+   uint advance;
+
+   if (!eat_white( &ctx->cur )) {
+      report_error( ctx, "Syntax error" );
+      return FALSE;
+   }
+   if (!str_match_no_case( &ctx->cur, "FLT32" ) || is_digit_alpha_underscore( ctx->cur )) {
+      report_error( ctx, "Expected `FLT32'" );
+      return FALSE;
+   }
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '{') {
+      report_error( ctx, "Expected `{'" );
+      return FALSE;
+   }
+   ctx->cur++;
+   for (i = 0; i < 4; i++) {
+      eat_opt_white( &ctx->cur );
+      if (i > 0) {
+         if (*ctx->cur != ',') {
+            report_error( ctx, "Expected `,'" );
+            return FALSE;
+         }
+         ctx->cur++;
+         eat_opt_white( &ctx->cur );
+      }
+      if (!parse_float( &ctx->cur, &values[i] )) {
+         report_error( ctx, "Expected literal floating point" );
+         return FALSE;
+      }
+   }
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '}') {
+      report_error( ctx, "Expected `}'" );
+      return FALSE;
+   }
+   ctx->cur++;
+
+   imm = tgsi_default_full_immediate();
+   imm.Immediate.Size += 4;
+   imm.Immediate.DataType = TGSI_IMM_FLOAT32;
+   imm.u.Pointer = values;
+
+   advance = tgsi_build_full_immediate(
+      &imm,
+      ctx->tokens_cur,
+      ctx->header,
+      (uint) (ctx->tokens_end - ctx->tokens_cur) );
+   if (advance == 0)
+      return FALSE;
+   ctx->tokens_cur += advance;
+
+   return TRUE;
+}
+
+static boolean translate( struct translate_ctx *ctx )
+{
+   eat_opt_white( &ctx->cur );
+   if (!parse_header( ctx ))
+      return FALSE;
+
+   while (*ctx->cur != '\0') {
+      uint label_val = 0;
+
+      if (!eat_white( &ctx->cur )) {
+         report_error( ctx, "Syntax error" );
+         return FALSE;
+      }
+
+      if (*ctx->cur == '\0')
+         break;
+
+      if (parse_label( ctx, &label_val )) {
+         if (!parse_instruction( ctx, TRUE ))
+            return FALSE;
+      }
+      else if (str_match_no_case( &ctx->cur, "DCL" )) {
+         if (!parse_declaration( ctx ))
+            return FALSE;
+      }
+      else if (str_match_no_case( &ctx->cur, "IMM" )) {
+         if (!parse_immediate( ctx ))
+            return FALSE;
+      }
+      else if (!parse_instruction( ctx, FALSE )) {
+         return FALSE;
+      }
+   }
+
+   return TRUE;
+}
+
+boolean
+tgsi_text_translate(
+   const char *text,
+   struct tgsi_token *tokens,
+   uint num_tokens )
+{
+   struct translate_ctx ctx;
+
+   ctx.text = text;
+   ctx.cur = text;
+   ctx.tokens = tokens;
+   ctx.tokens_cur = tokens;
+   ctx.tokens_end = tokens + num_tokens;
+
+   if (!translate( &ctx ))
+      return FALSE;
+
+   return tgsi_sanity_check( tokens );
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.h b/src/gallium/auxiliary/tgsi/tgsi_text.h
new file mode 100644
index 0000000000..8eeeeef140
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.h
@@ -0,0 +1,47 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_TEXT_H
+#define TGSI_TEXT_H
+
+#include "pipe/p_shader_tokens.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+boolean
+tgsi_text_translate(
+   const char *text,
+   struct tgsi_token *tokens,
+   uint num_tokens );
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_TEXT_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c b/src/gallium/auxiliary/tgsi/tgsi_transform.c
new file mode 100644
index 0000000000..357f77b05a
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c
@@ -0,0 +1,199 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI program transformation utility.
+ *
+ * Authors:  Brian Paul
+ */
+
+
+#include "tgsi_transform.h"
+
+
+
+static void
+emit_instruction(struct tgsi_transform_context *ctx,
+                 const struct tgsi_full_instruction *inst)
+{
+   uint ti = ctx->ti;
+
+   ti += tgsi_build_full_instruction(inst,
+                                     ctx->tokens_out + ti,
+                                     ctx->header,
+                                     ctx->max_tokens_out - ti);
+   ctx->ti = ti;
+}
+
+
+static void
+emit_declaration(struct tgsi_transform_context *ctx,
+                 const struct tgsi_full_declaration *decl)
+{
+   uint ti = ctx->ti;
+
+   ti += tgsi_build_full_declaration(decl,
+                                     ctx->tokens_out + ti,
+                                     ctx->header,
+                                     ctx->max_tokens_out - ti);
+   ctx->ti = ti;
+}
+
+
+static void
+emit_immediate(struct tgsi_transform_context *ctx,
+               const struct tgsi_full_immediate *imm)
+{
+   uint ti = ctx->ti;
+
+   ti += tgsi_build_full_immediate(imm,
+                                   ctx->tokens_out + ti,
+                                   ctx->header,
+                                   ctx->max_tokens_out - ti);
+   ctx->ti = ti;
+}
+
+
+
+/**
+ * Apply user-defined transformations to the input shader to produce
+ * the output shader.
+ * For example, a register search-and-replace operation could be applied
+ * by defining a transform_instruction() callback that examined and changed
+ * the instruction src/dest regs.
+ *
+ * \return number of tokens emitted
+ */
+int
+tgsi_transform_shader(const struct tgsi_token *tokens_in,
+                      struct tgsi_token *tokens_out,
+                      uint max_tokens_out,
+                      struct tgsi_transform_context *ctx)
+{
+   uint procType;
+
+   /* input shader */
+   struct tgsi_parse_context parse;
+
+   /* output shader */
+   struct tgsi_processor *processor;
+
+
+   /**
+    ** callback context init
+    **/
+   ctx->emit_instruction = emit_instruction;
+   ctx->emit_declaration = emit_declaration;
+   ctx->emit_immediate = emit_immediate;
+   ctx->tokens_out = tokens_out;
+   ctx->max_tokens_out = max_tokens_out;
+
+
+   /**
+    ** Setup to begin parsing input shader
+    **/
+   if (tgsi_parse_init( &parse, tokens_in ) != TGSI_PARSE_OK) {
+      debug_printf("tgsi_parse_init() failed in tgsi_transform_shader()!\n");
+      return -1;
+   }
+   procType = parse.FullHeader.Processor.Processor;
+   assert(procType == TGSI_PROCESSOR_FRAGMENT ||
+          procType == TGSI_PROCESSOR_VERTEX ||
+          procType == TGSI_PROCESSOR_GEOMETRY);
+
+
+   /**
+    **  Setup output shader
+    **/
+   *(struct tgsi_version *) &tokens_out[0] = tgsi_build_version();
+
+   ctx->header = (struct tgsi_header *) (tokens_out + 1);
+   *ctx->header = tgsi_build_header();
+
+   processor = (struct tgsi_processor *) (tokens_out + 2);
+   *processor = tgsi_build_processor( procType, ctx->header );
+
+   ctx->ti = 3;
+
+
+   /**
+    ** Loop over incoming program tokens/instructions
+    */
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         {
+            struct tgsi_full_instruction *fullinst
+               = &parse.FullToken.FullInstruction;
+
+            if (ctx->transform_instruction)
+               ctx->transform_instruction(ctx, fullinst);
+            else
+               ctx->emit_instruction(ctx, fullinst);
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         {
+            struct tgsi_full_declaration *fulldecl
+               = &parse.FullToken.FullDeclaration;
+
+            if (ctx->transform_declaration)
+               ctx->transform_declaration(ctx, fulldecl);
+            else
+               ctx->emit_declaration(ctx, fulldecl);
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         {
+            struct tgsi_full_immediate *fullimm
+               = &parse.FullToken.FullImmediate;
+
+            if (ctx->transform_immediate)
+               ctx->transform_immediate(ctx, fullimm);
+            else
+               ctx->emit_immediate(ctx, fullimm);
+         }
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+
+   if (ctx->epilog) {
+      ctx->epilog(ctx);
+   }
+
+   tgsi_parse_free (&parse);
+
+   return ctx->ti;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
new file mode 100644
index 0000000000..3da0b38271
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -0,0 +1,93 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_TRANSFORM_H
+#define TGSI_TRANSFORM_H
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_build.h"
+
+
+
+/**
+ * Subclass this to add caller-specific data
+ */
+struct tgsi_transform_context
+{
+/**** PUBLIC ***/
+
+   /**
+    * User-defined callbacks invoked per instruction.
+    */
+   void (*transform_instruction)(struct tgsi_transform_context *ctx,
+                                 struct tgsi_full_instruction *inst);
+
+   void (*transform_declaration)(struct tgsi_transform_context *ctx,
+                                 struct tgsi_full_declaration *decl);
+
+   void (*transform_immediate)(struct tgsi_transform_context *ctx,
+                               struct tgsi_full_immediate *imm);
+
+   /**
+    * Called at end of input program to allow caller to append extra
+    * instructions.  Return number of tokens emitted.
+    */
+   void (*epilog)(struct tgsi_transform_context *ctx);
+
+
+/*** PRIVATE ***/
+
+   /**
+    * These are setup by tgsi_transform_shader() and cannot be overridden.
+    * Meant to be called from in the above user callback functions.
+    */
+   void (*emit_instruction)(struct tgsi_transform_context *ctx,
+                            const struct tgsi_full_instruction *inst);
+   void (*emit_declaration)(struct tgsi_transform_context *ctx,
+                            const struct tgsi_full_declaration *decl);
+   void (*emit_immediate)(struct tgsi_transform_context *ctx,
+                          const struct tgsi_full_immediate *imm);
+
+   struct tgsi_header *header;
+   uint max_tokens_out;
+   struct tgsi_token *tokens_out;
+   uint ti;
+};
+
+
+
+extern int
+tgsi_transform_shader(const struct tgsi_token *tokens_in,
+                      struct tgsi_token *tokens_out,
+                      uint max_tokens_out,
+                      struct tgsi_transform_context *ctx);
+
+
+#endif /* TGSI_TRANSFORM_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
new file mode 100644
index 0000000000..09486e649e
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -0,0 +1,300 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi_parse.h"
+#include "tgsi_build.h"
+#include "tgsi_util.h"
+
+union pointer_hack
+{
+   void *pointer;
+   uint64_t uint64;
+};
+
+void *
+tgsi_align_128bit(
+   void *unaligned )
+{
+   union pointer_hack ph;
+
+   ph.uint64 = 0;
+   ph.pointer = unaligned;
+   ph.uint64 = (ph.uint64 + 15) & ~15;
+   return ph.pointer;
+}
+
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->SwizzleX;
+   case 1:
+      return reg->SwizzleY;
+   case 2:
+      return reg->SwizzleZ;
+   case 3:
+      return reg->SwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->ExtSwizzleX;
+   case 1:
+      return reg->ExtSwizzleY;
+   case 2:
+      return reg->ExtSwizzleZ;
+   case 3:
+      return reg->ExtSwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register  *reg,
+   unsigned component )
+{
+   unsigned swizzle;
+
+   /*
+    * First, calculate  the   extended swizzle for a given channel. This will give
+    * us either a channel index into the simple swizzle or  a constant 1 or   0.
+    */
+   swizzle = tgsi_util_get_src_register_extswizzle(
+      &reg->SrcRegisterExtSwz,
+      component );
+
+   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+
+   /*
+    * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
+    * Leave the constants intact, they are   not   affected by the   simple swizzle.
+    */
+   if( swizzle <= TGSI_SWIZZLE_W ) {
+      swizzle = tgsi_util_get_src_register_swizzle(
+         &reg->SrcRegister,
+         swizzle );
+   }
+
+   return swizzle;
+}
+
+void
+tgsi_util_set_src_register_swizzle(
+   struct tgsi_src_register *reg,
+   unsigned swizzle,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->SwizzleX = swizzle;
+      break;
+   case 1:
+      reg->SwizzleY = swizzle;
+      break;
+   case 2:
+      reg->SwizzleZ = swizzle;
+      break;
+   case 3:
+      reg->SwizzleW = swizzle;
+      break;
+   default:
+      assert( 0 );
+   }
+}
+
+void
+tgsi_util_set_src_register_extswizzle(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned swizzle,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->ExtSwizzleX = swizzle;
+      break;
+   case 1:
+      reg->ExtSwizzleY = swizzle;
+      break;
+   case 2:
+      reg->ExtSwizzleZ = swizzle;
+      break;
+   case 3:
+      reg->ExtSwizzleW = swizzle;
+      break;
+   default:
+      assert( 0 );
+   }
+}
+
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const  struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->NegateX;
+   case 1:
+      return reg->NegateY;
+   case 2:
+      return reg->NegateZ;
+   case 3:
+      return reg->NegateW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->NegateX = negate;
+      break;
+   case 1:
+      reg->NegateY = negate;
+      break;
+   case 2:
+      reg->NegateZ = negate;
+      break;
+   case 3:
+      reg->NegateW = negate;
+      break;
+   default:
+      assert( 0 );
+   }
+}
+
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct  tgsi_full_src_register *reg,
+   unsigned component )
+{
+   unsigned sign_mode;
+
+   if( reg->SrcRegisterExtMod.Absolute ) {
+      /* Consider only the post-abs negation. */
+
+      if( reg->SrcRegisterExtMod.Negate ) {
+         sign_mode = TGSI_UTIL_SIGN_SET;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_CLEAR;
+      }
+   }
+   else {
+      /* Accumulate the three negations. */
+
+      unsigned negate;
+
+      negate = reg->SrcRegister.Negate;
+      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
+         negate = !negate;
+      }
+      if( reg->SrcRegisterExtMod.Negate ) {
+         negate = !negate;
+      }
+
+      if( negate ) {
+         sign_mode = TGSI_UTIL_SIGN_TOGGLE;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_KEEP;
+      }
+   }
+
+   return sign_mode;
+}
+
+void
+tgsi_util_set_full_src_register_sign_mode(
+   struct tgsi_full_src_register *reg,
+   unsigned sign_mode )
+{
+   reg->SrcRegisterExtSwz.NegateX = 0;
+   reg->SrcRegisterExtSwz.NegateY = 0;
+   reg->SrcRegisterExtSwz.NegateZ = 0;
+   reg->SrcRegisterExtSwz.NegateW = 0;
+
+   switch (sign_mode)
+   {
+   case TGSI_UTIL_SIGN_CLEAR:
+      reg->SrcRegister.Negate = 0;
+      reg->SrcRegisterExtMod.Absolute = 1;
+      reg->SrcRegisterExtMod.Negate = 0;
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      reg->SrcRegister.Negate = 0;
+      reg->SrcRegisterExtMod.Absolute = 1;
+      reg->SrcRegisterExtMod.Negate = 1;
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      reg->SrcRegister.Negate = 1;
+      reg->SrcRegisterExtMod.Absolute = 0;
+      reg->SrcRegisterExtMod.Negate = 0;
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      reg->SrcRegister.Negate = 0;
+      reg->SrcRegisterExtMod.Absolute = 0;
+      reg->SrcRegisterExtMod.Negate = 0;
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h
new file mode 100644
index 0000000000..7877f34558
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -0,0 +1,96 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_UTIL_H
+#define TGSI_UTIL_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+void *
+tgsi_align_128bit(
+   void *unaligned );
+
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component );
+
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component);
+
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register *reg,
+   unsigned component );
+
+void
+tgsi_util_set_src_register_swizzle(
+   struct tgsi_src_register *reg,
+   unsigned swizzle,
+   unsigned component );
+
+void
+tgsi_util_set_src_register_extswizzle(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned swizzle,
+   unsigned component );
+
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component );
+
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component );
+
+#define TGSI_UTIL_SIGN_CLEAR    0   /* Force positive */
+#define TGSI_UTIL_SIGN_SET      1   /* Force negative */
+#define TGSI_UTIL_SIGN_TOGGLE   2   /* Negate */
+#define TGSI_UTIL_SIGN_KEEP     3   /* No change */
+
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct tgsi_full_src_register *reg,
+   unsigned component );
+
+void
+tgsi_util_set_full_src_register_sign_mode(
+   struct tgsi_full_src_register *reg,
+   unsigned sign_mode );
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_UTIL_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_build.c b/src/gallium/auxiliary/tgsi/util/tgsi_build.c
deleted file mode 100644
index 742ef14c35..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_build.c
+++ /dev/null
@@ -1,1324 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_debug.h"
-#include "pipe/p_util.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi_build.h"
-#include "tgsi_parse.h"
-
-/*
- * version
- */
-
-struct tgsi_version
-tgsi_build_version( void )
-{
-   struct tgsi_version  version;
-
-   version.MajorVersion = 1;
-   version.MinorVersion = 1;
-   version.Padding = 0;
-
-   return version;
-}
-
-/*
- * header
- */
-
-struct tgsi_header
-tgsi_build_header( void )
-{
-   struct tgsi_header header;
-
-   header.HeaderSize = 1;
-   header.BodySize = 0;
-
-   return header;
-}
-
-static void
-header_headersize_grow( struct tgsi_header *header )
-{
-   assert( header->HeaderSize < 0xFF );
-   assert( header->BodySize == 0 );
-
-   header->HeaderSize++;
-}
-
-static void
-header_bodysize_grow( struct tgsi_header *header )
-{
-   assert( header->BodySize < 0xFFFFFF );
-
-   header->BodySize++;
-}
-
-struct tgsi_processor
-tgsi_default_processor( void )
-{
-   struct tgsi_processor processor;
-
-   processor.Processor = TGSI_PROCESSOR_FRAGMENT;
-   processor.Padding = 0;
-
-   return processor;
-}
-
-struct tgsi_processor
-tgsi_build_processor(
-   unsigned type,
-   struct tgsi_header *header )
-{
-   struct tgsi_processor processor;
-
-   processor = tgsi_default_processor();
-   processor.Processor = type;
-
-   header_headersize_grow( header );
-
-   return processor;
-}
-
-/*
- * declaration
- */
-
-struct tgsi_declaration
-tgsi_default_declaration( void )
-{
-   struct tgsi_declaration declaration;
-
-   declaration.Type = TGSI_TOKEN_TYPE_DECLARATION;
-   declaration.Size = 1;
-   declaration.File = TGSI_FILE_NULL;
-   declaration.UsageMask = TGSI_WRITEMASK_XYZW;
-   declaration.Interpolate = TGSI_INTERPOLATE_CONSTANT;
-   declaration.Semantic = 0;
-   declaration.Padding = 0;
-   declaration.Extended = 0;
-
-   return declaration;
-}
-
-struct tgsi_declaration
-tgsi_build_declaration(
-   unsigned file,
-   unsigned usage_mask,
-   unsigned interpolate,
-   unsigned semantic,
-   struct tgsi_header *header )
-{
-   struct tgsi_declaration declaration;
-
-   assert( file <= TGSI_FILE_IMMEDIATE );
-   assert( interpolate <= TGSI_INTERPOLATE_PERSPECTIVE );
-
-   declaration = tgsi_default_declaration();
-   declaration.File = file;
-   declaration.UsageMask = usage_mask;
-   declaration.Interpolate = interpolate;
-   declaration.Semantic = semantic;
-
-   header_bodysize_grow( header );
-
-   return declaration;
-}
-
-static void
-declaration_grow(
-   struct tgsi_declaration *declaration,
-   struct tgsi_header *header )
-{
-   assert( declaration->Size < 0xFF );
-
-   declaration->Size++;
-
-   header_bodysize_grow( header );
-}
-
-struct tgsi_full_declaration
-tgsi_default_full_declaration( void )
-{
-   struct tgsi_full_declaration  full_declaration;
-
-   full_declaration.Declaration  = tgsi_default_declaration();
-   full_declaration.DeclarationRange = tgsi_default_declaration_range();
-   full_declaration.Semantic = tgsi_default_declaration_semantic();
-
-   return full_declaration;
-}
-
-unsigned
-tgsi_build_full_declaration(
-   const struct tgsi_full_declaration *full_decl,
-   struct tgsi_token *tokens,
-   struct tgsi_header *header,
-   unsigned maxsize )
-{
-   unsigned size = 0;
-   struct tgsi_declaration *declaration;
-   struct tgsi_declaration_range *dr;
-
-   if( maxsize <= size )
-     return 0;
-   declaration = (struct tgsi_declaration *) &tokens[size];
-   size++;
-
-   *declaration = tgsi_build_declaration(
-      full_decl->Declaration.File,
-      full_decl->Declaration.UsageMask,
-      full_decl->Declaration.Interpolate,
-      full_decl->Declaration.Semantic,
-      header );
-
-   if (maxsize <= size)
-      return 0;
-   dr = (struct tgsi_declaration_range *) &tokens[size];
-   size++;
-
-   *dr = tgsi_build_declaration_range(
-      full_decl->DeclarationRange.First,
-      full_decl->DeclarationRange.Last,
-      declaration,
-      header );
-
-   if( full_decl->Declaration.Semantic ) {
-      struct tgsi_declaration_semantic *ds;
-
-      if( maxsize <= size )
-         return  0;
-      ds = (struct tgsi_declaration_semantic *) &tokens[size];
-      size++;
-
-      *ds = tgsi_build_declaration_semantic(
-         full_decl->Semantic.SemanticName,
-         full_decl->Semantic.SemanticIndex,
-         declaration,
-         header );
-   }
-
-   return size;
-}
-
-struct tgsi_declaration_range
-tgsi_default_declaration_range( void )
-{
-   struct tgsi_declaration_range dr;
-
-   dr.First = 0;
-   dr.Last = 0;
-
-   return dr;
-}
-
-struct tgsi_declaration_range
-tgsi_build_declaration_range(
-   unsigned first,
-   unsigned last,
-   struct tgsi_declaration *declaration,
-   struct tgsi_header *header )
-{
-   struct tgsi_declaration_range declaration_range;
-
-   assert( last >= first );
-   assert( last <= 0xFFFF );
-
-   declaration_range = tgsi_default_declaration_range();
-   declaration_range.First = first;
-   declaration_range.Last = last;
-
-   declaration_grow( declaration, header );
-
-   return declaration_range;
-}
-
-struct tgsi_declaration_semantic
-tgsi_default_declaration_semantic( void )
-{
-   struct tgsi_declaration_semantic ds;
-
-   ds.SemanticName = TGSI_SEMANTIC_POSITION;
-   ds.SemanticIndex = 0;
-   ds.Padding = 0;
-
-   return ds;
-}
-
-struct tgsi_declaration_semantic
-tgsi_build_declaration_semantic(
-   unsigned semantic_name,
-   unsigned semantic_index,
-   struct tgsi_declaration *declaration,
-   struct tgsi_header *header )
-{
-   struct tgsi_declaration_semantic ds;
-
-   assert( semantic_name <= TGSI_SEMANTIC_COUNT );
-   assert( semantic_index <= 0xFFFF );
-
-   ds = tgsi_default_declaration_semantic();
-   ds.SemanticName = semantic_name;
-   ds.SemanticIndex = semantic_index;
-
-   declaration_grow( declaration, header );
-
-   return ds;
-}
-
-/*
- * immediate
- */
-
-struct tgsi_immediate
-tgsi_default_immediate( void )
-{
-   struct tgsi_immediate immediate;
-
-   immediate.Type = TGSI_TOKEN_TYPE_IMMEDIATE;
-   immediate.Size = 1;
-   immediate.DataType = TGSI_IMM_FLOAT32;
-   immediate.Padding = 0;
-   immediate.Extended = 0;
-
-   return immediate;
-}
-
-struct tgsi_immediate
-tgsi_build_immediate(
-   struct tgsi_header *header )
-{
-   struct tgsi_immediate immediate;
-
-   immediate = tgsi_default_immediate();
-
-   header_bodysize_grow( header );
-
-   return immediate;
-}
-
-struct tgsi_full_immediate
-tgsi_default_full_immediate( void )
-{
-   struct tgsi_full_immediate fullimm;
-
-   fullimm.Immediate = tgsi_default_immediate();
-   fullimm.u.Pointer = (void *) 0;
-
-   return fullimm;
-}
-
-static void
-immediate_grow(
-   struct tgsi_immediate *immediate,
-   struct tgsi_header *header )
-{
-   assert( immediate->Size < 0xFF );
-
-   immediate->Size++;
-
-   header_bodysize_grow( header );
-}
-
-struct tgsi_immediate_float32
-tgsi_build_immediate_float32(
-   float value,
-   struct tgsi_immediate *immediate,
-   struct tgsi_header *header )
-{
-   struct tgsi_immediate_float32 immediate_float32;
-
-   immediate_float32.Float = value;
-
-   immediate_grow( immediate, header );
-
-   return immediate_float32;
-}
-
-unsigned
-tgsi_build_full_immediate(
-   const struct tgsi_full_immediate *full_imm,
-   struct tgsi_token *tokens,
-   struct tgsi_header *header,
-   unsigned maxsize )
-{
-   unsigned size = 0, i;
-   struct tgsi_immediate *immediate;
-
-   if( maxsize <= size )
-      return 0;
-   immediate = (struct tgsi_immediate *) &tokens[size];
-   size++;
-
-   *immediate = tgsi_build_immediate( header );
-
-   for( i = 0; i < full_imm->Immediate.Size - 1; i++ ) {
-      struct tgsi_immediate_float32 *if32;
-
-      if( maxsize <= size )
-         return  0;
-      if32 = (struct tgsi_immediate_float32 *) &tokens[size];
-      size++;
-
-      *if32 = tgsi_build_immediate_float32(
-         full_imm->u.ImmediateFloat32[i].Float,
-         immediate,
-         header );
-   }
-
-   return size;
-}
-
-/*
- * instruction
- */
-
-struct tgsi_instruction
-tgsi_default_instruction( void )
-{
-   struct tgsi_instruction instruction;
-
-   instruction.Type = TGSI_TOKEN_TYPE_INSTRUCTION;
-   instruction.Size = 1;
-   instruction.Opcode = TGSI_OPCODE_MOV;
-   instruction.Saturate = TGSI_SAT_NONE;
-   instruction.NumDstRegs = 1;
-   instruction.NumSrcRegs = 1;
-   instruction.Padding  = 0;
-   instruction.Extended = 0;
-
-   return instruction;
-}
-
-struct tgsi_instruction
-tgsi_build_instruction(
-   unsigned opcode,
-   unsigned saturate,
-   unsigned num_dst_regs,
-   unsigned num_src_regs,
-   struct tgsi_header *header )
-{
-   struct tgsi_instruction instruction;
-
-   assert (opcode <= TGSI_OPCODE_LAST);
-   assert (saturate <= TGSI_SAT_MINUS_PLUS_ONE);
-   assert (num_dst_regs <= 3);
-   assert (num_src_regs <= 15);
-
-   instruction = tgsi_default_instruction();
-   instruction.Opcode = opcode;
-   instruction.Saturate = saturate;
-   instruction.NumDstRegs = num_dst_regs;
-   instruction.NumSrcRegs = num_src_regs;
-
-   header_bodysize_grow( header );
-
-   return instruction;
-}
-
-static void
-instruction_grow(
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   assert (instruction->Size <   0xFF);
-
-   instruction->Size++;
-
-   header_bodysize_grow( header );
-}
-
-struct tgsi_full_instruction
-tgsi_default_full_instruction( void )
-{
-   struct tgsi_full_instruction full_instruction;
-   unsigned i;
-
-   full_instruction.Instruction = tgsi_default_instruction();
-   full_instruction.InstructionExtNv = tgsi_default_instruction_ext_nv();
-   full_instruction.InstructionExtLabel = tgsi_default_instruction_ext_label();
-   full_instruction.InstructionExtTexture = tgsi_default_instruction_ext_texture();
-   for( i = 0;  i < TGSI_FULL_MAX_DST_REGISTERS; i++ ) {
-      full_instruction.FullDstRegisters[i] = tgsi_default_full_dst_register();
-   }
-   for( i = 0;  i < TGSI_FULL_MAX_SRC_REGISTERS; i++ ) {
-      full_instruction.FullSrcRegisters[i] = tgsi_default_full_src_register();
-   }
-
-   return full_instruction;
-}
-
-unsigned
-tgsi_build_full_instruction(
-   const struct tgsi_full_instruction *full_inst,
-   struct  tgsi_token *tokens,
-   struct  tgsi_header *header,
-   unsigned  maxsize )
-{
-   unsigned size = 0;
-   unsigned i;
-   struct tgsi_instruction *instruction;
-   struct tgsi_token *prev_token;
-
-   if( maxsize <= size )
-      return 0;
-   instruction = (struct tgsi_instruction *) &tokens[size];
-   size++;
-
-   *instruction = tgsi_build_instruction(
-      full_inst->Instruction.Opcode,
-      full_inst->Instruction.Saturate,
-      full_inst->Instruction.NumDstRegs,
-      full_inst->Instruction.NumSrcRegs,
-      header );
-   prev_token = (struct tgsi_token  *) instruction;
-
-   if( tgsi_compare_instruction_ext_nv(
-         full_inst->InstructionExtNv,
-         tgsi_default_instruction_ext_nv() ) ) {
-      struct tgsi_instruction_ext_nv *instruction_ext_nv;
-
-      if( maxsize <= size )
-         return 0;
-      instruction_ext_nv =
-         (struct  tgsi_instruction_ext_nv *) &tokens[size];
-      size++;
-
-      *instruction_ext_nv  = tgsi_build_instruction_ext_nv(
-         full_inst->InstructionExtNv.Precision,
-         full_inst->InstructionExtNv.CondDstIndex,
-         full_inst->InstructionExtNv.CondFlowIndex,
-         full_inst->InstructionExtNv.CondMask,
-         full_inst->InstructionExtNv.CondSwizzleX,
-         full_inst->InstructionExtNv.CondSwizzleY,
-         full_inst->InstructionExtNv.CondSwizzleZ,
-         full_inst->InstructionExtNv.CondSwizzleW,
-         full_inst->InstructionExtNv.CondDstUpdate,
-         full_inst->InstructionExtNv.CondFlowEnable,
-         prev_token,
-         instruction,
-         header );
-      prev_token = (struct tgsi_token  *) instruction_ext_nv;
-   }
-
-   if( tgsi_compare_instruction_ext_label(
-         full_inst->InstructionExtLabel,
-         tgsi_default_instruction_ext_label() ) ) {
-      struct tgsi_instruction_ext_label *instruction_ext_label;
-
-      if( maxsize <= size )
-         return 0;
-      instruction_ext_label =
-         (struct  tgsi_instruction_ext_label *) &tokens[size];
-      size++;
-
-      *instruction_ext_label = tgsi_build_instruction_ext_label(
-         full_inst->InstructionExtLabel.Label,
-         prev_token,
-         instruction,
-         header );
-      prev_token = (struct tgsi_token  *) instruction_ext_label;
-   }
-
-   if( tgsi_compare_instruction_ext_texture(
-         full_inst->InstructionExtTexture,
-         tgsi_default_instruction_ext_texture() ) ) {
-      struct tgsi_instruction_ext_texture *instruction_ext_texture;
-
-      if( maxsize <= size )
-         return 0;
-      instruction_ext_texture =
-         (struct  tgsi_instruction_ext_texture *) &tokens[size];
-      size++;
-
-      *instruction_ext_texture = tgsi_build_instruction_ext_texture(
-         full_inst->InstructionExtTexture.Texture,
-         prev_token,
-         instruction,
-         header   );
-      prev_token = (struct tgsi_token  *) instruction_ext_texture;
-   }
-
-   for( i = 0;  i <   full_inst->Instruction.NumDstRegs; i++ ) {
-      const struct tgsi_full_dst_register *reg = &full_inst->FullDstRegisters[i];
-      struct tgsi_dst_register *dst_register;
-      struct tgsi_token *prev_token;
-
-      if( maxsize <= size )
-         return 0;
-      dst_register = (struct tgsi_dst_register *) &tokens[size];
-      size++;
-
-      *dst_register = tgsi_build_dst_register(
-         reg->DstRegister.File,
-         reg->DstRegister.WriteMask,
-         reg->DstRegister.Index,
-         instruction,
-         header );
-      prev_token = (struct tgsi_token  *) dst_register;
-
-      if( tgsi_compare_dst_register_ext_concode(
-            reg->DstRegisterExtConcode,
-            tgsi_default_dst_register_ext_concode() ) ) {
-         struct tgsi_dst_register_ext_concode *dst_register_ext_concode;
-
-         if( maxsize <= size )
-            return 0;
-         dst_register_ext_concode =
-            (struct  tgsi_dst_register_ext_concode *) &tokens[size];
-         size++;
-
-         *dst_register_ext_concode =   tgsi_build_dst_register_ext_concode(
-            reg->DstRegisterExtConcode.CondMask,
-            reg->DstRegisterExtConcode.CondSwizzleX,
-            reg->DstRegisterExtConcode.CondSwizzleY,
-            reg->DstRegisterExtConcode.CondSwizzleZ,
-            reg->DstRegisterExtConcode.CondSwizzleW,
-            reg->DstRegisterExtConcode.CondSrcIndex,
-            prev_token,
-            instruction,
-            header );
-         prev_token = (struct tgsi_token  *) dst_register_ext_concode;
-      }
-
-      if( tgsi_compare_dst_register_ext_modulate(
-            reg->DstRegisterExtModulate,
-            tgsi_default_dst_register_ext_modulate() ) ) {
-         struct tgsi_dst_register_ext_modulate *dst_register_ext_modulate;
-
-         if( maxsize <= size )
-            return 0;
-         dst_register_ext_modulate =
-            (struct  tgsi_dst_register_ext_modulate *) &tokens[size];
-         size++;
-
-         *dst_register_ext_modulate = tgsi_build_dst_register_ext_modulate(
-            reg->DstRegisterExtModulate.Modulate,
-            prev_token,
-            instruction,
-            header );
-         prev_token = (struct tgsi_token  *) dst_register_ext_modulate;
-      }
-   }
-
-   for( i = 0;  i < full_inst->Instruction.NumSrcRegs; i++ ) {
-      const struct tgsi_full_src_register *reg = &full_inst->FullSrcRegisters[i];
-      struct tgsi_src_register *src_register;
-      struct tgsi_token *prev_token;
-
-      if( maxsize <= size )
-         return 0;
-      src_register = (struct tgsi_src_register *)  &tokens[size];
-      size++;
-
-      *src_register = tgsi_build_src_register(
-         reg->SrcRegister.File,
-         reg->SrcRegister.SwizzleX,
-         reg->SrcRegister.SwizzleY,
-         reg->SrcRegister.SwizzleZ,
-         reg->SrcRegister.SwizzleW,
-         reg->SrcRegister.Negate,
-         reg->SrcRegister.Indirect,
-         reg->SrcRegister.Dimension,
-         reg->SrcRegister.Index,
-         instruction,
-         header );
-      prev_token = (struct tgsi_token  *) src_register;
-
-      if( tgsi_compare_src_register_ext_swz(
-            reg->SrcRegisterExtSwz,
-            tgsi_default_src_register_ext_swz() ) ) {
-         struct tgsi_src_register_ext_swz *src_register_ext_swz;
-
-         /* Use of the extended swizzle requires the simple swizzle to be identity.
-          */
-         assert( reg->SrcRegister.SwizzleX == TGSI_SWIZZLE_X );
-         assert( reg->SrcRegister.SwizzleY == TGSI_SWIZZLE_Y );
-         assert( reg->SrcRegister.SwizzleZ == TGSI_SWIZZLE_Z );
-         assert( reg->SrcRegister.SwizzleW == TGSI_SWIZZLE_W );
-         assert( reg->SrcRegister.Negate == FALSE );
-
-         if( maxsize <= size )
-            return 0;
-         src_register_ext_swz =
-            (struct  tgsi_src_register_ext_swz *) &tokens[size];
-         size++;
-
-         *src_register_ext_swz = tgsi_build_src_register_ext_swz(
-            reg->SrcRegisterExtSwz.ExtSwizzleX,
-            reg->SrcRegisterExtSwz.ExtSwizzleY,
-            reg->SrcRegisterExtSwz.ExtSwizzleZ,
-            reg->SrcRegisterExtSwz.ExtSwizzleW,
-            reg->SrcRegisterExtSwz.NegateX,
-            reg->SrcRegisterExtSwz.NegateY,
-            reg->SrcRegisterExtSwz.NegateZ,
-            reg->SrcRegisterExtSwz.NegateW,
-            prev_token,
-            instruction,
-            header );
-         prev_token = (struct tgsi_token  *) src_register_ext_swz;
-      }
-
-      if( tgsi_compare_src_register_ext_mod(
-            reg->SrcRegisterExtMod,
-            tgsi_default_src_register_ext_mod() ) ) {
-         struct tgsi_src_register_ext_mod *src_register_ext_mod;
-
-         if( maxsize <= size )
-            return 0;
-         src_register_ext_mod =
-            (struct  tgsi_src_register_ext_mod *) &tokens[size];
-         size++;
-
-         *src_register_ext_mod = tgsi_build_src_register_ext_mod(
-            reg->SrcRegisterExtMod.Complement,
-            reg->SrcRegisterExtMod.Bias,
-            reg->SrcRegisterExtMod.Scale2X,
-            reg->SrcRegisterExtMod.Absolute,
-            reg->SrcRegisterExtMod.Negate,
-            prev_token,
-            instruction,
-            header );
-         prev_token = (struct tgsi_token  *) src_register_ext_mod;
-      }
-
-      if( reg->SrcRegister.Indirect ) {
-         struct  tgsi_src_register *ind;
-
-         if( maxsize <= size )
-            return 0;
-         ind = (struct tgsi_src_register *) &tokens[size];
-         size++;
-
-         *ind = tgsi_build_src_register(
-            reg->SrcRegisterInd.File,
-            reg->SrcRegisterInd.SwizzleX,
-            reg->SrcRegisterInd.SwizzleY,
-            reg->SrcRegisterInd.SwizzleZ,
-            reg->SrcRegisterInd.SwizzleW,
-            reg->SrcRegisterInd.Negate,
-            reg->SrcRegisterInd.Indirect,
-            reg->SrcRegisterInd.Dimension,
-            reg->SrcRegisterInd.Index,
-            instruction,
-            header );
-      }
-
-      if( reg->SrcRegister.Dimension ) {
-         struct  tgsi_dimension *dim;
-
-         assert( !reg->SrcRegisterDim.Dimension );
-
-         if( maxsize <= size )
-            return 0;
-         dim = (struct tgsi_dimension *) &tokens[size];
-         size++;
-
-         *dim = tgsi_build_dimension(
-            reg->SrcRegisterDim.Indirect,
-            reg->SrcRegisterDim.Index,
-            instruction,
-            header );
-
-         if( reg->SrcRegisterDim.Indirect ) {
-            struct tgsi_src_register *ind;
-
-            if( maxsize <= size )
-               return 0;
-            ind = (struct tgsi_src_register *) &tokens[size];
-            size++;
-
-            *ind = tgsi_build_src_register(
-               reg->SrcRegisterDimInd.File,
-               reg->SrcRegisterDimInd.SwizzleX,
-               reg->SrcRegisterDimInd.SwizzleY,
-               reg->SrcRegisterDimInd.SwizzleZ,
-               reg->SrcRegisterDimInd.SwizzleW,
-               reg->SrcRegisterDimInd.Negate,
-               reg->SrcRegisterDimInd.Indirect,
-               reg->SrcRegisterDimInd.Dimension,
-               reg->SrcRegisterDimInd.Index,
-               instruction,
-               header );
-         }
-      }
-   }
-
-   return size;
-}
-
-struct tgsi_instruction_ext_nv
-tgsi_default_instruction_ext_nv( void )
-{
-   struct tgsi_instruction_ext_nv instruction_ext_nv;
-
-   instruction_ext_nv.Type = TGSI_INSTRUCTION_EXT_TYPE_NV;
-   instruction_ext_nv.Precision = TGSI_PRECISION_DEFAULT;
-   instruction_ext_nv.CondDstIndex = 0;
-   instruction_ext_nv.CondFlowIndex = 0;
-   instruction_ext_nv.CondMask = TGSI_CC_TR;
-   instruction_ext_nv.CondSwizzleX = TGSI_SWIZZLE_X;
-   instruction_ext_nv.CondSwizzleY = TGSI_SWIZZLE_Y;
-   instruction_ext_nv.CondSwizzleZ = TGSI_SWIZZLE_Z;
-   instruction_ext_nv.CondSwizzleW = TGSI_SWIZZLE_W;
-   instruction_ext_nv.CondDstUpdate = 0;
-   instruction_ext_nv.CondFlowEnable = 0;
-   instruction_ext_nv.Padding = 0;
-   instruction_ext_nv.Extended = 0;
-
-   return instruction_ext_nv;
-}
-
-union token_u32
-{
-   unsigned u32;
-};
-
-unsigned
-tgsi_compare_instruction_ext_nv(
-   struct tgsi_instruction_ext_nv a,
-   struct tgsi_instruction_ext_nv b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
-}
-
-struct tgsi_instruction_ext_nv
-tgsi_build_instruction_ext_nv(
-   unsigned precision,
-   unsigned cond_dst_index,
-   unsigned cond_flow_index,
-   unsigned cond_mask,
-   unsigned cond_swizzle_x,
-   unsigned cond_swizzle_y,
-   unsigned cond_swizzle_z,
-   unsigned cond_swizzle_w,
-   unsigned cond_dst_update,
-   unsigned cond_flow_update,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_instruction_ext_nv instruction_ext_nv;
-
-   instruction_ext_nv = tgsi_default_instruction_ext_nv();
-   instruction_ext_nv.Precision = precision;
-   instruction_ext_nv.CondDstIndex = cond_dst_index;
-   instruction_ext_nv.CondFlowIndex = cond_flow_index;
-   instruction_ext_nv.CondMask = cond_mask;
-   instruction_ext_nv.CondSwizzleX = cond_swizzle_x;
-   instruction_ext_nv.CondSwizzleY = cond_swizzle_y;
-   instruction_ext_nv.CondSwizzleZ = cond_swizzle_z;
-   instruction_ext_nv.CondSwizzleW = cond_swizzle_w;
-   instruction_ext_nv.CondDstUpdate = cond_dst_update;
-   instruction_ext_nv.CondFlowEnable = cond_flow_update;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return instruction_ext_nv;
-}
-
-struct tgsi_instruction_ext_label
-tgsi_default_instruction_ext_label( void )
-{
-   struct tgsi_instruction_ext_label instruction_ext_label;
-
-   instruction_ext_label.Type = TGSI_INSTRUCTION_EXT_TYPE_LABEL;
-   instruction_ext_label.Label = 0;
-   instruction_ext_label.Padding = 0;
-   instruction_ext_label.Extended = 0;
-
-   return instruction_ext_label;
-}
-
-unsigned
-tgsi_compare_instruction_ext_label(
-   struct tgsi_instruction_ext_label a,
-   struct tgsi_instruction_ext_label b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
-}
-
-struct tgsi_instruction_ext_label
-tgsi_build_instruction_ext_label(
-   unsigned label,
-   struct tgsi_token  *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_instruction_ext_label instruction_ext_label;
-
-   instruction_ext_label = tgsi_default_instruction_ext_label();
-   instruction_ext_label.Label = label;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return instruction_ext_label;
-}
-
-struct tgsi_instruction_ext_texture
-tgsi_default_instruction_ext_texture( void )
-{
-   struct tgsi_instruction_ext_texture instruction_ext_texture;
-
-   instruction_ext_texture.Type = TGSI_INSTRUCTION_EXT_TYPE_TEXTURE;
-   instruction_ext_texture.Texture = TGSI_TEXTURE_UNKNOWN;
-   instruction_ext_texture.Padding = 0;
-   instruction_ext_texture.Extended = 0;
-
-   return instruction_ext_texture;
-}
-
-unsigned
-tgsi_compare_instruction_ext_texture(
-   struct tgsi_instruction_ext_texture a,
-   struct tgsi_instruction_ext_texture b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
-}
-
-struct tgsi_instruction_ext_texture
-tgsi_build_instruction_ext_texture(
-   unsigned texture,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_instruction_ext_texture instruction_ext_texture;
-
-   instruction_ext_texture = tgsi_default_instruction_ext_texture();
-   instruction_ext_texture.Texture = texture;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return instruction_ext_texture;
-}
-
-struct tgsi_src_register
-tgsi_default_src_register( void )
-{
-   struct tgsi_src_register src_register;
-
-   src_register.File = TGSI_FILE_NULL;
-   src_register.SwizzleX = TGSI_SWIZZLE_X;
-   src_register.SwizzleY = TGSI_SWIZZLE_Y;
-   src_register.SwizzleZ = TGSI_SWIZZLE_Z;
-   src_register.SwizzleW = TGSI_SWIZZLE_W;
-   src_register.Negate = 0;
-   src_register.Indirect = 0;
-   src_register.Dimension = 0;
-   src_register.Index = 0;
-   src_register.Extended = 0;
-
-   return src_register;
-}
-
-struct tgsi_src_register
-tgsi_build_src_register(
-   unsigned file,
-   unsigned swizzle_x,
-   unsigned swizzle_y,
-   unsigned swizzle_z,
-   unsigned swizzle_w,
-   unsigned negate,
-   unsigned indirect,
-   unsigned dimension,
-   int index,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_src_register   src_register;
-
-   assert( file <= TGSI_FILE_IMMEDIATE );
-   assert( swizzle_x <= TGSI_SWIZZLE_W );
-   assert( swizzle_y <= TGSI_SWIZZLE_W );
-   assert( swizzle_z <= TGSI_SWIZZLE_W );
-   assert( swizzle_w <= TGSI_SWIZZLE_W );
-   assert( negate <= 1 );
-   assert( index >= -0x8000 && index <= 0x7FFF );
-
-   src_register = tgsi_default_src_register();
-   src_register.File = file;
-   src_register.SwizzleX = swizzle_x;
-   src_register.SwizzleY = swizzle_y;
-   src_register.SwizzleZ = swizzle_z;
-   src_register.SwizzleW = swizzle_w;
-   src_register.Negate = negate;
-   src_register.Indirect = indirect;
-   src_register.Dimension = dimension;
-   src_register.Index = index;
-
-   instruction_grow( instruction, header );
-
-   return src_register;
-}
-
-struct tgsi_full_src_register
-tgsi_default_full_src_register( void )
-{
-   struct tgsi_full_src_register full_src_register;
-
-   full_src_register.SrcRegister = tgsi_default_src_register();
-   full_src_register.SrcRegisterExtSwz = tgsi_default_src_register_ext_swz();
-   full_src_register.SrcRegisterExtMod = tgsi_default_src_register_ext_mod();
-   full_src_register.SrcRegisterInd = tgsi_default_src_register();
-   full_src_register.SrcRegisterDim = tgsi_default_dimension();
-   full_src_register.SrcRegisterDimInd = tgsi_default_src_register();
-
-   return full_src_register;
-}
-
-struct tgsi_src_register_ext_swz
-tgsi_default_src_register_ext_swz( void )
-{
-   struct tgsi_src_register_ext_swz src_register_ext_swz;
-
-   src_register_ext_swz.Type = TGSI_SRC_REGISTER_EXT_TYPE_SWZ;
-   src_register_ext_swz.ExtSwizzleX = TGSI_EXTSWIZZLE_X;
-   src_register_ext_swz.ExtSwizzleY = TGSI_EXTSWIZZLE_Y;
-   src_register_ext_swz.ExtSwizzleZ = TGSI_EXTSWIZZLE_Z;
-   src_register_ext_swz.ExtSwizzleW = TGSI_EXTSWIZZLE_W;
-   src_register_ext_swz.NegateX = 0;
-   src_register_ext_swz.NegateY = 0;
-   src_register_ext_swz.NegateZ = 0;
-   src_register_ext_swz.NegateW = 0;
-   src_register_ext_swz.Padding = 0;
-   src_register_ext_swz.Extended = 0;
-
-   return src_register_ext_swz;
-}
-
-unsigned
-tgsi_compare_src_register_ext_swz(
-   struct tgsi_src_register_ext_swz a,
-   struct tgsi_src_register_ext_swz b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
-}
-
-struct tgsi_src_register_ext_swz
-tgsi_build_src_register_ext_swz(
-   unsigned ext_swizzle_x,
-   unsigned ext_swizzle_y,
-   unsigned ext_swizzle_z,
-   unsigned ext_swizzle_w,
-   unsigned negate_x,
-   unsigned negate_y,
-   unsigned negate_z,
-   unsigned negate_w,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_src_register_ext_swz src_register_ext_swz;
-
-   assert( ext_swizzle_x <= TGSI_EXTSWIZZLE_ONE );
-   assert( ext_swizzle_y <= TGSI_EXTSWIZZLE_ONE );
-   assert( ext_swizzle_z <= TGSI_EXTSWIZZLE_ONE );
-   assert( ext_swizzle_w <= TGSI_EXTSWIZZLE_ONE );
-   assert( negate_x <= 1 );
-   assert( negate_y <= 1 );
-   assert( negate_z <= 1 );
-   assert( negate_w <= 1 );
-
-   src_register_ext_swz = tgsi_default_src_register_ext_swz();
-   src_register_ext_swz.ExtSwizzleX = ext_swizzle_x;
-   src_register_ext_swz.ExtSwizzleY = ext_swizzle_y;
-   src_register_ext_swz.ExtSwizzleZ = ext_swizzle_z;
-   src_register_ext_swz.ExtSwizzleW = ext_swizzle_w;
-   src_register_ext_swz.NegateX = negate_x;
-   src_register_ext_swz.NegateY = negate_y;
-   src_register_ext_swz.NegateZ = negate_z;
-   src_register_ext_swz.NegateW = negate_w;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return src_register_ext_swz;
-}
-
-struct tgsi_src_register_ext_mod
-tgsi_default_src_register_ext_mod( void )
-{
-   struct tgsi_src_register_ext_mod src_register_ext_mod;
-
-   src_register_ext_mod.Type = TGSI_SRC_REGISTER_EXT_TYPE_MOD;
-   src_register_ext_mod.Complement = 0;
-   src_register_ext_mod.Bias = 0;
-   src_register_ext_mod.Scale2X = 0;
-   src_register_ext_mod.Absolute = 0;
-   src_register_ext_mod.Negate = 0;
-   src_register_ext_mod.Padding = 0;
-   src_register_ext_mod.Extended = 0;
-
-   return src_register_ext_mod;
-}
-
-unsigned
-tgsi_compare_src_register_ext_mod(
-   struct tgsi_src_register_ext_mod a,
-   struct tgsi_src_register_ext_mod b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
-}
-
-struct tgsi_src_register_ext_mod
-tgsi_build_src_register_ext_mod(
-   unsigned complement,
-   unsigned bias,
-   unsigned scale_2x,
-   unsigned absolute,
-   unsigned negate,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_src_register_ext_mod src_register_ext_mod;
-
-   assert( complement <= 1 );
-   assert( bias <= 1 );
-   assert( scale_2x <= 1 );
-   assert( absolute <= 1 );
-   assert( negate <= 1 );
-
-   src_register_ext_mod = tgsi_default_src_register_ext_mod();
-   src_register_ext_mod.Complement = complement;
-   src_register_ext_mod.Bias = bias;
-   src_register_ext_mod.Scale2X = scale_2x;
-   src_register_ext_mod.Absolute = absolute;
-   src_register_ext_mod.Negate = negate;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return src_register_ext_mod;
-}
-
-struct tgsi_dimension
-tgsi_default_dimension( void )
-{
-   struct tgsi_dimension dimension;
-
-   dimension.Indirect = 0;
-   dimension.Dimension = 0;
-   dimension.Padding = 0;
-   dimension.Index = 0;
-   dimension.Extended = 0;
-
-   return dimension;
-}
-
-struct tgsi_dimension
-tgsi_build_dimension(
-   unsigned indirect,
-   unsigned index,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_dimension dimension;
-
-   dimension = tgsi_default_dimension();
-   dimension.Indirect = indirect;
-   dimension.Index = index;
-
-   instruction_grow( instruction, header );
-
-   return dimension;
-}
-
-struct tgsi_dst_register
-tgsi_default_dst_register( void )
-{
-   struct tgsi_dst_register dst_register;
-
-   dst_register.File = TGSI_FILE_NULL;
-   dst_register.WriteMask = TGSI_WRITEMASK_XYZW;
-   dst_register.Indirect = 0;
-   dst_register.Dimension = 0;
-   dst_register.Index = 0;
-   dst_register.Padding = 0;
-   dst_register.Extended = 0;
-
-   return dst_register;
-}
-
-struct tgsi_dst_register
-tgsi_build_dst_register(
-   unsigned file,
-   unsigned mask,
-   int index,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_dst_register dst_register;
-
-   assert( file <= TGSI_FILE_IMMEDIATE );
-   assert( mask <= TGSI_WRITEMASK_XYZW );
-   assert( index >= -32768 && index <= 32767 );
-
-   dst_register = tgsi_default_dst_register();
-   dst_register.File = file;
-   dst_register.WriteMask = mask;
-   dst_register.Index = index;
-
-   instruction_grow( instruction, header );
-
-   return dst_register;
-}
-
-struct tgsi_full_dst_register
-tgsi_default_full_dst_register( void )
-{
-   struct tgsi_full_dst_register full_dst_register;
-
-   full_dst_register.DstRegister = tgsi_default_dst_register();
-   full_dst_register.DstRegisterExtConcode =
-      tgsi_default_dst_register_ext_concode();
-   full_dst_register.DstRegisterExtModulate =
-      tgsi_default_dst_register_ext_modulate();
-
-   return full_dst_register;
-}
-
-struct tgsi_dst_register_ext_concode
-tgsi_default_dst_register_ext_concode( void )
-{
-   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
-
-   dst_register_ext_concode.Type = TGSI_DST_REGISTER_EXT_TYPE_CONDCODE;
-   dst_register_ext_concode.CondMask = TGSI_CC_TR;
-   dst_register_ext_concode.CondSwizzleX = TGSI_SWIZZLE_X;
-   dst_register_ext_concode.CondSwizzleY = TGSI_SWIZZLE_Y;
-   dst_register_ext_concode.CondSwizzleZ = TGSI_SWIZZLE_Z;
-   dst_register_ext_concode.CondSwizzleW = TGSI_SWIZZLE_W;
-   dst_register_ext_concode.CondSrcIndex = 0;
-   dst_register_ext_concode.Padding = 0;
-   dst_register_ext_concode.Extended = 0;
-
-   return dst_register_ext_concode;
-}
-
-unsigned
-tgsi_compare_dst_register_ext_concode(
-   struct tgsi_dst_register_ext_concode a,
-   struct tgsi_dst_register_ext_concode b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
-}
-
-struct tgsi_dst_register_ext_concode
-tgsi_build_dst_register_ext_concode(
-   unsigned cc,
-   unsigned swizzle_x,
-   unsigned swizzle_y,
-   unsigned swizzle_z,
-   unsigned swizzle_w,
-   int index,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
-
-   assert( cc <= TGSI_CC_FL );
-   assert( swizzle_x <= TGSI_SWIZZLE_W );
-   assert( swizzle_y <= TGSI_SWIZZLE_W );
-   assert( swizzle_z <= TGSI_SWIZZLE_W );
-   assert( swizzle_w <= TGSI_SWIZZLE_W );
-   assert( index >= -32768 && index <= 32767 );
-
-   dst_register_ext_concode = tgsi_default_dst_register_ext_concode();
-   dst_register_ext_concode.CondMask = cc;
-   dst_register_ext_concode.CondSwizzleX = swizzle_x;
-   dst_register_ext_concode.CondSwizzleY = swizzle_y;
-   dst_register_ext_concode.CondSwizzleZ = swizzle_z;
-   dst_register_ext_concode.CondSwizzleW = swizzle_w;
-   dst_register_ext_concode.CondSrcIndex = index;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return dst_register_ext_concode;
-}
-
-struct tgsi_dst_register_ext_modulate
-tgsi_default_dst_register_ext_modulate( void )
-{
-   struct tgsi_dst_register_ext_modulate dst_register_ext_modulate;
-
-   dst_register_ext_modulate.Type = TGSI_DST_REGISTER_EXT_TYPE_MODULATE;
-   dst_register_ext_modulate.Modulate = TGSI_MODULATE_1X;
-   dst_register_ext_modulate.Padding = 0;
-   dst_register_ext_modulate.Extended = 0;
-
-   return dst_register_ext_modulate;
-}
-
-unsigned
-tgsi_compare_dst_register_ext_modulate(
-   struct tgsi_dst_register_ext_modulate a,
-   struct tgsi_dst_register_ext_modulate b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
-}
-
-struct tgsi_dst_register_ext_modulate
-tgsi_build_dst_register_ext_modulate(
-   unsigned modulate,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_dst_register_ext_modulate dst_register_ext_modulate;
-
-   assert( modulate <= TGSI_MODULATE_EIGHTH );
-
-   dst_register_ext_modulate = tgsi_default_dst_register_ext_modulate();
-   dst_register_ext_modulate.Modulate = modulate;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return dst_register_ext_modulate;
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_build.h b/src/gallium/auxiliary/tgsi/util/tgsi_build.h
deleted file mode 100644
index ed25830248..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_build.h
+++ /dev/null
@@ -1,332 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_BUILD_H
-#define TGSI_BUILD_H
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-/*
- * version
- */
-
-struct tgsi_version
-tgsi_build_version( void );
-
-/*
- * header
- */
-
-struct tgsi_header
-tgsi_build_header( void );
-
-struct tgsi_processor
-tgsi_default_processor( void );
-
-struct tgsi_processor
-tgsi_build_processor(
-   unsigned processor,
-   struct tgsi_header *header );
-
-/*
- * declaration
- */
-
-struct tgsi_declaration
-tgsi_default_declaration( void );
-
-struct tgsi_declaration
-tgsi_build_declaration(
-   unsigned file,
-   unsigned usage_mask,
-   unsigned interpolate,
-   unsigned semantic,
-   struct tgsi_header *header );
-
-struct tgsi_full_declaration
-tgsi_default_full_declaration( void );
-
-unsigned
-tgsi_build_full_declaration(
-   const struct tgsi_full_declaration *full_decl,
-   struct tgsi_token *tokens,
-   struct tgsi_header *header,
-   unsigned maxsize );
-
-struct tgsi_declaration_range
-tgsi_default_declaration_range( void );
-
-struct tgsi_declaration_range
-tgsi_build_declaration_range(
-   unsigned first,
-   unsigned last,
-   struct tgsi_declaration *declaration,
-   struct tgsi_header *header );
-
-struct tgsi_declaration_semantic
-tgsi_default_declaration_semantic( void );
-
-struct tgsi_declaration_semantic
-tgsi_build_declaration_semantic(
-   unsigned semantic_name,
-   unsigned semantic_index,
-   struct tgsi_declaration *declaration,
-   struct tgsi_header *header );
-
-/*
- * immediate
- */
-
-struct tgsi_immediate
-tgsi_default_immediate( void );
-
-struct tgsi_immediate
-tgsi_build_immediate(
-   struct tgsi_header *header );
-
-struct tgsi_full_immediate
-tgsi_default_full_immediate( void );
-
-struct tgsi_immediate_float32
-tgsi_build_immediate_float32(
-   float value,
-   struct tgsi_immediate *immediate,
-   struct tgsi_header *header );
-
-unsigned
-tgsi_build_full_immediate(
-   const struct tgsi_full_immediate *full_imm,
-   struct tgsi_token *tokens,
-   struct tgsi_header *header,
-   unsigned maxsize );
-
-/*
- * instruction
- */
-
-struct tgsi_instruction
-tgsi_default_instruction( void );
-
-struct tgsi_instruction
-tgsi_build_instruction(
-   unsigned opcode,
-   unsigned saturate,
-   unsigned num_dst_regs,
-   unsigned num_src_regs,
-   struct tgsi_header *header );
-
-struct tgsi_full_instruction
-tgsi_default_full_instruction( void );
-
-unsigned
-tgsi_build_full_instruction(
-   const struct tgsi_full_instruction *full_inst,
-   struct tgsi_token *tokens,
-   struct tgsi_header *header,
-   unsigned maxsize );
-
-struct tgsi_instruction_ext_nv
-tgsi_default_instruction_ext_nv( void );
-
-unsigned
-tgsi_compare_instruction_ext_nv(
-   struct tgsi_instruction_ext_nv a,
-   struct tgsi_instruction_ext_nv b );
-
-struct tgsi_instruction_ext_nv
-tgsi_build_instruction_ext_nv(
-   unsigned precision,
-   unsigned cond_dst_index,
-   unsigned cond_flow_index,
-   unsigned cond_mask,
-   unsigned cond_swizzle_x,
-   unsigned cond_swizzle_y,
-   unsigned cond_swizzle_z,
-   unsigned cond_swizzle_w,
-   unsigned cond_dst_update,
-   unsigned cond_flow_update,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-struct tgsi_instruction_ext_label
-tgsi_default_instruction_ext_label( void );
-
-unsigned
-tgsi_compare_instruction_ext_label(
-   struct tgsi_instruction_ext_label a,
-   struct tgsi_instruction_ext_label b );
-
-struct tgsi_instruction_ext_label
-tgsi_build_instruction_ext_label(
-   unsigned label,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-struct tgsi_instruction_ext_texture
-tgsi_default_instruction_ext_texture( void );
-
-unsigned
-tgsi_compare_instruction_ext_texture(
-   struct tgsi_instruction_ext_texture a,
-   struct tgsi_instruction_ext_texture b );
-
-struct tgsi_instruction_ext_texture
-tgsi_build_instruction_ext_texture(
-   unsigned texture,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-struct tgsi_src_register
-tgsi_default_src_register( void );
-
-struct tgsi_src_register
-tgsi_build_src_register(
-   unsigned file,
-   unsigned swizzle_x,
-   unsigned swizzle_y,
-   unsigned swizzle_z,
-   unsigned swizzle_w,
-   unsigned negate,
-   unsigned indirect,
-   unsigned dimension,
-   int index,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-struct tgsi_full_src_register
-tgsi_default_full_src_register( void );
-
-struct tgsi_src_register_ext_swz
-tgsi_default_src_register_ext_swz( void );
-
-unsigned
-tgsi_compare_src_register_ext_swz(
-   struct tgsi_src_register_ext_swz a,
-   struct tgsi_src_register_ext_swz b );
-
-struct tgsi_src_register_ext_swz
-tgsi_build_src_register_ext_swz(
-   unsigned ext_swizzle_x,
-   unsigned ext_swizzle_y,
-   unsigned ext_swizzle_z,
-   unsigned ext_swizzle_w,
-   unsigned negate_x,
-   unsigned negate_y,
-   unsigned negate_z,
-   unsigned negate_w,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-struct tgsi_src_register_ext_mod
-tgsi_default_src_register_ext_mod( void );
-
-unsigned
-tgsi_compare_src_register_ext_mod(
-   struct tgsi_src_register_ext_mod a,
-   struct tgsi_src_register_ext_mod b );
-
-struct tgsi_src_register_ext_mod
-tgsi_build_src_register_ext_mod(
-   unsigned complement,
-   unsigned bias,
-   unsigned scale_2x,
-   unsigned absolute,
-   unsigned negate,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-struct tgsi_dimension
-tgsi_default_dimension( void );
-
-struct tgsi_dimension
-tgsi_build_dimension(
-   unsigned indirect,
-   unsigned index,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-struct tgsi_dst_register
-tgsi_default_dst_register( void );
-
-struct tgsi_dst_register
-tgsi_build_dst_register(
-   unsigned file,
-   unsigned mask,
-   int index,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-struct tgsi_full_dst_register
-tgsi_default_full_dst_register( void );
-
-struct tgsi_dst_register_ext_concode
-tgsi_default_dst_register_ext_concode( void );
-
-unsigned
-tgsi_compare_dst_register_ext_concode(
-   struct tgsi_dst_register_ext_concode a,
-   struct tgsi_dst_register_ext_concode b );
-
-struct tgsi_dst_register_ext_concode
-tgsi_build_dst_register_ext_concode(
-   unsigned cc,
-   unsigned swizzle_x,
-   unsigned swizzle_y,
-   unsigned swizzle_z,
-   unsigned swizzle_w,
-   int index,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-struct tgsi_dst_register_ext_modulate
-tgsi_default_dst_register_ext_modulate( void );
-
-unsigned
-tgsi_compare_dst_register_ext_modulate(
-   struct tgsi_dst_register_ext_modulate a,
-   struct tgsi_dst_register_ext_modulate b );
-
-struct tgsi_dst_register_ext_modulate
-tgsi_build_dst_register_ext_modulate(
-   unsigned modulate,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_BUILD_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
deleted file mode 100644
index d2e6375212..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
+++ /dev/null
@@ -1,582 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_debug.h"
-#include "tgsi_dump.h"
-#include "tgsi_iterate.h"
-
-struct dump_ctx
-{
-   struct tgsi_iterate_context iter;
-
-   uint instno;
-};
-
-static void
-dump_enum(
-   uint e,
-   const char **enums,
-   uint enum_count )
-{
-   if (e >= enum_count)
-      debug_printf( "%u", e );
-   else
-      debug_printf( "%s", enums[e] );
-}
-
-#define EOL()           debug_printf( "\n" )
-#define TXT(S)          debug_printf( "%s", S )
-#define CHR(C)          debug_printf( "%c", C )
-#define UIX(I)          debug_printf( "0x%x", I )
-#define UID(I)          debug_printf( "%u", I )
-#define SID(I)          debug_printf( "%d", I )
-#define FLT(F)          debug_printf( "%10.4f", F )
-#define ENM(E,ENUMS)    dump_enum( E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
-
-static const char *processor_type_names[] =
-{
-   "FRAG",
-   "VERT",
-   "GEOM"
-};
-
-static const char *file_names[] =
-{
-   "NULL",
-   "CONST",
-   "IN",
-   "OUT",
-   "TEMP",
-   "SAMP",
-   "ADDR",
-   "IMM"
-};
-
-static const char *interpolate_names[] =
-{
-   "CONSTANT",
-   "LINEAR",
-   "PERSPECTIVE"
-};
-
-static const char *semantic_names[] =
-{
-   "POSITION",
-   "COLOR",
-   "BCOLOR",
-   "FOG",
-   "PSIZE",
-   "GENERIC",
-   "NORMAL"
-};
-
-static const char *immediate_type_names[] =
-{
-   "FLT32"
-};
-
-static const char *opcode_names[TGSI_OPCODE_LAST] =
-{
-   "ARL",
-   "MOV",
-   "LIT",
-   "RCP",
-   "RSQ",
-   "EXP",
-   "LOG",
-   "MUL",
-   "ADD",
-   "DP3",
-   "DP4",
-   "DST",
-   "MIN",
-   "MAX",
-   "SLT",
-   "SGE",
-   "MAD",
-   "SUB",
-   "LERP",
-   "CND",
-   "CND0",
-   "DOT2ADD",
-   "INDEX",
-   "NEGATE",
-   "FRAC",
-   "CLAMP",
-   "FLOOR",
-   "ROUND",
-   "EXPBASE2",
-   "LOGBASE2",
-   "POWER",
-   "CROSSPRODUCT",
-   "MULTIPLYMATRIX",
-   "ABS",
-   "RCC",
-   "DPH",
-   "COS",
-   "DDX",
-   "DDY",
-   "KILP",
-   "PK2H",
-   "PK2US",
-   "PK4B",
-   "PK4UB",
-   "RFL",
-   "SEQ",
-   "SFL",
-   "SGT",
-   "SIN",
-   "SLE",
-   "SNE",
-   "STR",
-   "TEX",
-   "TXD",
-   "TXP",
-   "UP2H",
-   "UP2US",
-   "UP4B",
-   "UP4UB",
-   "X2D",
-   "ARA",
-   "ARR",
-   "BRA",
-   "CAL",
-   "RET",
-   "SSG",
-   "CMP",
-   "SCS",
-   "TXB",
-   "NRM",
-   "DIV",
-   "DP2",
-   "TXL",
-   "BRK",
-   "IF",
-   "LOOP",
-   "REP",
-   "ELSE",
-   "ENDIF",
-   "ENDLOOP",
-   "ENDREP",
-   "PUSHA",
-   "POPA",
-   "CEIL",
-   "I2F",
-   "NOT",
-   "TRUNC",
-   "SHL",
-   "SHR",
-   "AND",
-   "OR",
-   "MOD",
-   "XOR",
-   "SAD",
-   "TXF",
-   "TXQ",
-   "CONT",
-   "EMIT",
-   "ENDPRIM",
-   "BGNLOOP2",
-   "BGNSUB",
-   "ENDLOOP2",
-   "ENDSUB",
-   "NOISE1",
-   "NOISE2",
-   "NOISE3",
-   "NOISE4",
-   "NOP",
-   "M4X3",
-   "M3X4",
-   "M3X3",
-   "M3X2",
-   "NRM4",
-   "CALLNZ",
-   "IFC",
-   "BREAKC",
-   "KIL",
-   "END",
-   "SWZ"
-};
-
-static const char *swizzle_names[] =
-{
-   "x",
-   "y",
-   "z",
-   "w"
-};
-
-static const char *texture_names[] =
-{
-   "UNKNOWN",
-   "1D",
-   "2D",
-   "3D",
-   "CUBE",
-   "RECT",
-   "SHADOW1D",
-   "SHADOW2D",
-   "SHADOWRECT"
-};
-
-static const char *extswizzle_names[] =
-{
-   "x",
-   "y",
-   "z",
-   "w",
-   "0",
-   "1"
-};
-
-static const char *modulate_names[TGSI_MODULATE_COUNT] =
-{
-   "",
-   "_2X",
-   "_4X",
-   "_8X",
-   "_D2",
-   "_D4",
-   "_D8"
-};
-
-static void
-_dump_register_prefix(
-   uint file,
-   uint first,
-   uint last )
-{
-   
-   
-}
-
-static void
-_dump_register(
-   uint file,
-   int first,
-   int last )
-{
-   ENM( file, file_names );
-   CHR( '[' );
-   SID( first );
-   if (first != last) {
-      TXT( ".." );
-      SID( last );
-   }
-   CHR( ']' );
-}
-
-static void
-_dump_register_ind(
-   uint file,
-   int index,
-   uint ind_file,
-   int ind_index )
-{
-   ENM( file, file_names );
-   CHR( '[' );
-   ENM( ind_file, file_names );
-   CHR( '[' );
-   SID( ind_index );
-   CHR( ']' );
-   if (index != 0) {
-      if (index > 0)
-         CHR( '+' );
-      SID( index );
-   }
-   CHR( ']' );
-}
-
-static void
-_dump_writemask(
-   uint writemask )
-{
-   if (writemask != TGSI_WRITEMASK_XYZW) {
-      CHR( '.' );
-      if (writemask & TGSI_WRITEMASK_X)
-         CHR( 'x' );
-      if (writemask & TGSI_WRITEMASK_Y)
-         CHR( 'y' );
-      if (writemask & TGSI_WRITEMASK_Z)
-         CHR( 'z' );
-      if (writemask & TGSI_WRITEMASK_W)
-         CHR( 'w' );
-   }
-}
-
-void
-tgsi_dump_declaration(
-   const struct tgsi_full_declaration *decl )
-{
-   TXT( "\nDCL " );
-
-   _dump_register(
-      decl->Declaration.File,
-      decl->DeclarationRange.First,
-      decl->DeclarationRange.Last );
-   _dump_writemask(
-      decl->Declaration.UsageMask );
-
-   if (decl->Declaration.Semantic) {
-      TXT( ", " );
-      ENM( decl->Semantic.SemanticName, semantic_names );
-      if (decl->Semantic.SemanticIndex != 0 ||
-          decl->Semantic.SemanticName == TGSI_SEMANTIC_GENERIC) {
-         CHR( '[' );
-         UID( decl->Semantic.SemanticIndex );
-         CHR( ']' );
-      }
-   }
-
-   TXT( ", " );
-   ENM( decl->Declaration.Interpolate, interpolate_names );
-}
-
-static boolean
-iter_declaration(
-   struct tgsi_iterate_context *iter,
-   struct tgsi_full_declaration *decl )
-{
-   tgsi_dump_declaration( decl );
-   return TRUE;
-}
-
-void
-tgsi_dump_immediate(
-   const struct tgsi_full_immediate *imm )
-{
-   uint i;
-
-   TXT( "\nIMM " );
-   ENM( imm->Immediate.DataType, immediate_type_names );
-
-   TXT( " { " );
-   for (i = 0; i < imm->Immediate.Size - 1; i++) {
-      switch (imm->Immediate.DataType) {
-      case TGSI_IMM_FLOAT32:
-         FLT( imm->u.ImmediateFloat32[i].Float );
-         break;
-      default:
-         assert( 0 );
-      }
-
-      if (i < imm->Immediate.Size - 2)
-         TXT( ", " );
-   }
-   TXT( " }" );
-}
-
-static boolean
-iter_immediate(
-   struct tgsi_iterate_context *iter,
-   struct tgsi_full_immediate *imm )
-{
-   tgsi_dump_immediate( imm );
-   return TRUE;
-}
-
-void
-tgsi_dump_instruction(
-   const struct tgsi_full_instruction *inst,
-   uint instno )
-{
-   uint i;
-   boolean first_reg = TRUE;
-
-   EOL();
-   UID( instno );
-   CHR( ':' );
-   ENM( inst->Instruction.Opcode, opcode_names );
-
-   switch (inst->Instruction.Saturate) {
-   case TGSI_SAT_NONE:
-      break;
-   case TGSI_SAT_ZERO_ONE:
-      TXT( "_SAT" );
-      break;
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      TXT( "_SATNV" );
-      break;
-   default:
-      assert( 0 );
-   }
-
-   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
-      const struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-
-      if (!first_reg)
-         CHR( ',' );
-      CHR( ' ' );
-
-      _dump_register(
-         dst->DstRegister.File,
-         dst->DstRegister.Index,
-         dst->DstRegister.Index );
-      ENM( dst->DstRegisterExtModulate.Modulate, modulate_names );
-      _dump_writemask( dst->DstRegister.WriteMask );
-
-      first_reg = FALSE;
-   }
-
-   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-      const struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
-
-      if (!first_reg)
-         CHR( ',' );
-      CHR( ' ' );
-
-      if (src->SrcRegisterExtMod.Negate)
-         TXT( "-(" );
-      if (src->SrcRegisterExtMod.Absolute)
-         CHR( '|' );
-      if (src->SrcRegisterExtMod.Scale2X)
-         TXT( "2*(" );
-      if (src->SrcRegisterExtMod.Bias)
-         CHR( '(' );
-      if (src->SrcRegisterExtMod.Complement)
-         TXT( "1-(" );
-      if (src->SrcRegister.Negate)
-         CHR( '-' );
-
-      if (src->SrcRegister.Indirect) {
-         _dump_register_ind(
-            src->SrcRegister.File,
-            src->SrcRegister.Index,
-            src->SrcRegisterInd.File,
-            src->SrcRegisterInd.Index );
-      }
-      else {
-         _dump_register(
-            src->SrcRegister.File,
-            src->SrcRegister.Index,
-            src->SrcRegister.Index );
-      }
-
-      if (src->SrcRegister.SwizzleX != TGSI_SWIZZLE_X ||
-          src->SrcRegister.SwizzleY != TGSI_SWIZZLE_Y ||
-          src->SrcRegister.SwizzleZ != TGSI_SWIZZLE_Z ||
-          src->SrcRegister.SwizzleW != TGSI_SWIZZLE_W) {
-         CHR( '.' );
-         ENM( src->SrcRegister.SwizzleX, swizzle_names );
-         ENM( src->SrcRegister.SwizzleY, swizzle_names );
-         ENM( src->SrcRegister.SwizzleZ, swizzle_names );
-         ENM( src->SrcRegister.SwizzleW, swizzle_names );
-      }
-      if (src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X ||
-          src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y ||
-          src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z ||
-          src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W) {
-         CHR( '.' );
-         if (src->SrcRegisterExtSwz.NegateX)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleX, extswizzle_names );
-         if (src->SrcRegisterExtSwz.NegateY)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleY, extswizzle_names );
-         if (src->SrcRegisterExtSwz.NegateZ)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, extswizzle_names );
-         if (src->SrcRegisterExtSwz.NegateW)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleW, extswizzle_names );
-      }
-
-      if (src->SrcRegisterExtMod.Complement)
-         CHR( ')' );
-      if (src->SrcRegisterExtMod.Bias)
-         TXT( ")-.5" );
-      if (src->SrcRegisterExtMod.Scale2X)
-         CHR( ')' );
-      if (src->SrcRegisterExtMod.Absolute)
-         CHR( '|' );
-      if (src->SrcRegisterExtMod.Negate)
-         CHR( ')' );
-
-      first_reg = FALSE;
-   }
-
-   if (inst->InstructionExtTexture.Texture != TGSI_TEXTURE_UNKNOWN) {
-      TXT( ", " );
-      ENM( inst->InstructionExtTexture.Texture, texture_names );
-   }
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_IF:
-   case TGSI_OPCODE_ELSE:
-   case TGSI_OPCODE_BGNLOOP2:
-   case TGSI_OPCODE_ENDLOOP2:
-   case TGSI_OPCODE_CAL:
-      TXT( " :" );
-      UID( inst->InstructionExtLabel.Label );
-      break;
-   }
-}
-
-static boolean
-iter_instruction(
-   struct tgsi_iterate_context *iter,
-   struct tgsi_full_instruction *inst )
-{
-   struct dump_ctx *ctx = (struct dump_ctx *) iter;
-
-   tgsi_dump_instruction( inst, ctx->instno++ );
-   return TRUE;
-}
-
-static boolean
-prolog(
-   struct tgsi_iterate_context *ctx )
-{
-   EOL();
-   ENM( ctx->processor.Processor, processor_type_names );
-   UID( ctx->version.MajorVersion );
-   CHR( '.' );
-   UID( ctx->version.MinorVersion );
-   return TRUE;
-}
-
-void
-tgsi_dump(
-   const struct tgsi_token *tokens,
-   uint flags )
-{
-   struct dump_ctx ctx;
-
-   /* sanity checks */
-   assert( strcmp( opcode_names[TGSI_OPCODE_CONT], "CONT" ) == 0 );
-   assert( strcmp( opcode_names[TGSI_OPCODE_END], "END" ) == 0 );
-
-   ctx.iter.prolog = prolog;
-   ctx.iter.iterate_instruction = iter_instruction;
-   ctx.iter.iterate_declaration = iter_declaration;
-   ctx.iter.iterate_immediate = iter_immediate;
-   ctx.iter.epilog = NULL;
-
-   ctx.instno = 0;
-
-   tgsi_iterate_shader( tokens, &ctx.iter );
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
deleted file mode 100644
index 51c230b5db..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_DUMP_H
-#define TGSI_DUMP_H
-
-#include "pipe/p_shader_tokens.h"
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-void
-tgsi_dump(
-   const struct tgsi_token *tokens,
-   uint flags );
-
-struct tgsi_full_immediate;
-struct tgsi_full_instruction;
-struct tgsi_full_declaration;
-
-void
-tgsi_dump_immediate(
-   const struct tgsi_full_immediate *imm );
-
-void
-tgsi_dump_instruction(
-   const struct tgsi_full_instruction *inst,
-   uint instno );
-
-void
-tgsi_dump_declaration(
-   const struct tgsi_full_declaration *decl );
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_DUMP_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump_c.c b/src/gallium/auxiliary/tgsi/util/tgsi_dump_c.c
deleted file mode 100644
index eabd74bd6d..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump_c.c
+++ /dev/null
@@ -1,845 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_debug.h"
-#include "pipe/p_util.h"
-#include "util/u_string.h"
-#include "tgsi_dump_c.h"
-#include "tgsi_parse.h"
-#include "tgsi_build.h"
-
-static void
-dump_enum(
-   const unsigned    e,
-   const char        **enums,
-   const unsigned    enums_count )
-{
-   if (e >= enums_count) {
-      debug_printf( "%u", e );
-   }
-   else {
-      debug_printf( "%s", enums[e] );
-   }
-}
-
-#define EOL()           debug_printf( "\n" )
-#define TXT(S)          debug_printf( "%s", S )
-#define CHR(C)          debug_printf( "%c", C )
-#define UIX(I)          debug_printf( "0x%x", I )
-#define UID(I)          debug_printf( "%u", I )
-#define SID(I)          debug_printf( "%d", I )
-#define FLT(F)          debug_printf( "%10.4f", F )
-#define ENM(E,ENUMS)    dump_enum( E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
-
-static const char *TGSI_PROCESSOR_TYPES[] =
-{
-   "PROCESSOR_FRAGMENT",
-   "PROCESSOR_VERTEX",
-   "PROCESSOR_GEOMETRY"
-};
-
-static const char *TGSI_TOKEN_TYPES[] =
-{
-   "TOKEN_TYPE_DECLARATION",
-   "TOKEN_TYPE_IMMEDIATE",
-   "TOKEN_TYPE_INSTRUCTION"
-};
-
-static const char *TGSI_FILES[] =
-{
-   "FILE_NULL",
-   "FILE_CONSTANT",
-   "FILE_INPUT",
-   "FILE_OUTPUT",
-   "FILE_TEMPORARY",
-   "FILE_SAMPLER",
-   "FILE_ADDRESS",
-   "FILE_IMMEDIATE"
-};
-
-static const char *TGSI_INTERPOLATES[] =
-{
-   "INTERPOLATE_CONSTANT",
-   "INTERPOLATE_LINEAR",
-   "INTERPOLATE_PERSPECTIVE"
-};
-
-static const char *TGSI_SEMANTICS[] =
-{
-   "SEMANTIC_POSITION",
-   "SEMANTIC_COLOR",
-   "SEMANTIC_BCOLOR",
-   "SEMANTIC_FOG",
-   "SEMANTIC_PSIZE",
-   "SEMANTIC_GENERIC",
-   "SEMANTIC_NORMAL"
-};
-
-static const char *TGSI_IMMS[] =
-{
-   "IMM_FLOAT32"
-};
-
-static const char *TGSI_OPCODES[TGSI_OPCODE_LAST] =
-{
-   "OPCODE_ARL",
-   "OPCODE_MOV",
-   "OPCODE_LIT",
-   "OPCODE_RCP",
-   "OPCODE_RSQ",
-   "OPCODE_EXP",
-   "OPCODE_LOG",
-   "OPCODE_MUL",
-   "OPCODE_ADD",
-   "OPCODE_DP3",
-   "OPCODE_DP4",
-   "OPCODE_DST",
-   "OPCODE_MIN",
-   "OPCODE_MAX",
-   "OPCODE_SLT",
-   "OPCODE_SGE",
-   "OPCODE_MAD",
-   "OPCODE_SUB",
-   "OPCODE_LERP",
-   "OPCODE_CND",
-   "OPCODE_CND0",
-   "OPCODE_DOT2ADD",
-   "OPCODE_INDEX",
-   "OPCODE_NEGATE",
-   "OPCODE_FRAC",
-   "OPCODE_CLAMP",
-   "OPCODE_FLOOR",
-   "OPCODE_ROUND",
-   "OPCODE_EXPBASE2",
-   "OPCODE_LOGBASE2",
-   "OPCODE_POWER",
-   "OPCODE_CROSSPRODUCT",
-   "OPCODE_MULTIPLYMATRIX",
-   "OPCODE_ABS",
-   "OPCODE_RCC",
-   "OPCODE_DPH",
-   "OPCODE_COS",
-   "OPCODE_DDX",
-   "OPCODE_DDY",
-   "OPCODE_KILP",
-   "OPCODE_PK2H",
-   "OPCODE_PK2US",
-   "OPCODE_PK4B",
-   "OPCODE_PK4UB",
-   "OPCODE_RFL",
-   "OPCODE_SEQ",
-   "OPCODE_SFL",
-   "OPCODE_SGT",
-   "OPCODE_SIN",
-   "OPCODE_SLE",
-   "OPCODE_SNE",
-   "OPCODE_STR",
-   "OPCODE_TEX",
-   "OPCODE_TXD",
-   "OPCODE_TXP",
-   "OPCODE_UP2H",
-   "OPCODE_UP2US",
-   "OPCODE_UP4B",
-   "OPCODE_UP4UB",
-   "OPCODE_X2D",
-   "OPCODE_ARA",
-   "OPCODE_ARR",
-   "OPCODE_BRA",
-   "OPCODE_CAL",
-   "OPCODE_RET",
-   "OPCODE_SSG",
-   "OPCODE_CMP",
-   "OPCODE_SCS",
-   "OPCODE_TXB",
-   "OPCODE_NRM",
-   "OPCODE_DIV",
-   "OPCODE_DP2",
-   "OPCODE_TXL",
-   "OPCODE_BRK",
-   "OPCODE_IF",
-   "OPCODE_LOOP",
-   "OPCODE_REP",
-   "OPCODE_ELSE",
-   "OPCODE_ENDIF",
-   "OPCODE_ENDLOOP",
-   "OPCODE_ENDREP",
-   "OPCODE_PUSHA",
-   "OPCODE_POPA",
-   "OPCODE_CEIL",
-   "OPCODE_I2F",
-   "OPCODE_NOT",
-   "OPCODE_TRUNC",
-   "OPCODE_SHL",
-   "OPCODE_SHR",
-   "OPCODE_AND",
-   "OPCODE_OR",
-   "OPCODE_MOD",
-   "OPCODE_XOR",
-   "OPCODE_SAD",
-   "OPCODE_TXF",
-   "OPCODE_TXQ",
-   "OPCODE_CONT",
-   "OPCODE_EMIT",
-   "OPCODE_ENDPRIM",
-   "OPCODE_BGNLOOP2",
-   "OPCODE_BGNSUB",
-   "OPCODE_ENDLOOP2",
-   "OPCODE_ENDSUB",
-   "OPCODE_NOISE1",
-   "OPCODE_NOISE2",
-   "OPCODE_NOISE3",
-   "OPCODE_NOISE4",
-   "OPCODE_NOP",
-   "OPCODE_M4X3",
-   "OPCODE_M3X4",
-   "OPCODE_M3X3",
-   "OPCODE_M3X2",
-   "OPCODE_NRM4",
-   "OPCODE_CALLNZ",
-   "OPCODE_IFC",
-   "OPCODE_BREAKC",
-   "OPCODE_KIL",
-   "OPCODE_END"
-};
-
-static const char *TGSI_SATS[] =
-{
-   "SAT_NONE",
-   "SAT_ZERO_ONE",
-   "SAT_MINUS_PLUS_ONE"
-};
-
-static const char *TGSI_INSTRUCTION_EXTS[] =
-{
-   "INSTRUCTION_EXT_TYPE_NV",
-   "INSTRUCTION_EXT_TYPE_LABEL",
-   "INSTRUCTION_EXT_TYPE_TEXTURE"
-};
-
-static const char *TGSI_PRECISIONS[] =
-{
-   "PRECISION_DEFAULT",
-   "PRECISION_FLOAT32",
-   "PRECISION_FLOAT16",
-   "PRECISION_FIXED12"
-};
-
-static const char *TGSI_CCS[] =
-{
-   "CC_GT",
-   "CC_EQ",
-   "CC_LT",
-   "CC_UN",
-   "CC_GE",
-   "CC_LE",
-   "CC_NE",
-   "CC_TR",
-   "CC_FL"
-};
-
-static const char *TGSI_SWIZZLES[] =
-{
-   "SWIZZLE_X",
-   "SWIZZLE_Y",
-   "SWIZZLE_Z",
-   "SWIZZLE_W"
-};
-
-static const char *TGSI_TEXTURES[] =
-{
-   "TEXTURE_UNKNOWN",
-   "TEXTURE_1D",
-   "TEXTURE_2D",
-   "TEXTURE_3D",
-   "TEXTURE_CUBE",
-   "TEXTURE_RECT",
-   "TEXTURE_SHADOW1D",
-   "TEXTURE_SHADOW2D",
-   "TEXTURE_SHADOWRECT"
-};
-
-static const char *TGSI_SRC_REGISTER_EXTS[] =
-{
-   "SRC_REGISTER_EXT_TYPE_SWZ",
-   "SRC_REGISTER_EXT_TYPE_MOD"
-};
-
-static const char *TGSI_EXTSWIZZLES[] =
-{
-   "EXTSWIZZLE_X",
-   "EXTSWIZZLE_Y",
-   "EXTSWIZZLE_Z",
-   "EXTSWIZZLE_W",
-   "EXTSWIZZLE_ZERO",
-   "EXTSWIZZLE_ONE"
-};
-
-static const char *TGSI_WRITEMASKS[] =
-{
-   "0",
-   "WRITEMASK_X",
-   "WRITEMASK_Y",
-   "WRITEMASK_XY",
-   "WRITEMASK_Z",
-   "WRITEMASK_XZ",
-   "WRITEMASK_YZ",
-   "WRITEMASK_XYZ",
-   "WRITEMASK_W",
-   "WRITEMASK_XW",
-   "WRITEMASK_YW",
-   "WRITEMASK_XYW",
-   "WRITEMASK_ZW",
-   "WRITEMASK_XZW",
-   "WRITEMASK_YZW",
-   "WRITEMASK_XYZW"
-};
-
-static const char *TGSI_DST_REGISTER_EXTS[] =
-{
-   "DST_REGISTER_EXT_TYPE_CONDCODE",
-   "DST_REGISTER_EXT_TYPE_MODULATE"
-};
-
-static const char *TGSI_MODULATES[] =
-{
-   "MODULATE_1X",
-   "MODULATE_2X",
-   "MODULATE_4X",
-   "MODULATE_8X",
-   "MODULATE_HALF",
-   "MODULATE_QUARTER",
-   "MODULATE_EIGHTH"
-};
-
-static void
-dump_declaration_verbose(
-   struct tgsi_full_declaration  *decl,
-   unsigned                      ignored,
-   unsigned                      deflt,
-   struct tgsi_full_declaration  *fd )
-{
-   TXT( "\nFile       : " );
-   ENM( decl->Declaration.File, TGSI_FILES );
-   if( deflt || fd->Declaration.UsageMask != decl->Declaration.UsageMask ) {
-      TXT( "\nUsageMask  : " );
-      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_X ) {
-         CHR( 'X' );
-      }
-      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Y ) {
-         CHR( 'Y' );
-      }
-      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Z ) {
-         CHR( 'Z' );
-      }
-      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_W ) {
-         CHR( 'W' );
-      }
-   }
-   if( deflt || fd->Declaration.Interpolate != decl->Declaration.Interpolate ) {
-      TXT( "\nInterpolate: " );
-      ENM( decl->Declaration.Interpolate, TGSI_INTERPOLATES );
-   }
-   if( deflt || fd->Declaration.Semantic != decl->Declaration.Semantic ) {
-      TXT( "\nSemantic   : " );
-      UID( decl->Declaration.Semantic );
-   }
-   if( ignored ) {
-      TXT( "\nPadding    : " );
-      UIX( decl->Declaration.Padding );
-   }
-
-   EOL();
-   TXT( "\nFirst: " );
-   UID( decl->DeclarationRange.First );
-   TXT( "\nLast : " );
-   UID( decl->DeclarationRange.Last );
-
-   if( decl->Declaration.Semantic ) {
-      EOL();
-      TXT( "\nSemanticName : " );
-      ENM( decl->Semantic.SemanticName, TGSI_SEMANTICS );
-      TXT( "\nSemanticIndex: " );
-      UID( decl->Semantic.SemanticIndex );
-      if( ignored ) {
-         TXT( "\nPadding      : " );
-         UIX( decl->Semantic.Padding );
-      }
-   }
-}
-
-static void
-dump_immediate_verbose(
-   struct tgsi_full_immediate *imm,
-   unsigned                   ignored )
-{
-   unsigned i;
-
-   TXT( "\nDataType   : " );
-   ENM( imm->Immediate.DataType, TGSI_IMMS );
-   if( ignored ) {
-      TXT( "\nPadding    : " );
-      UIX( imm->Immediate.Padding );
-   }
-
-   for( i = 0; i < imm->Immediate.Size - 1; i++ ) {
-      EOL();
-      switch( imm->Immediate.DataType ) {
-      case TGSI_IMM_FLOAT32:
-         TXT( "\nFloat: " );
-         FLT( imm->u.ImmediateFloat32[i].Float );
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-}
-
-static void
-dump_instruction_verbose(
-   struct tgsi_full_instruction  *inst,
-   unsigned                      ignored,
-   unsigned                      deflt,
-   struct tgsi_full_instruction  *fi )
-{
-   unsigned i;
-
-   TXT( "\nOpcode     : " );
-   ENM( inst->Instruction.Opcode, TGSI_OPCODES );
-   if( deflt || fi->Instruction.Saturate != inst->Instruction.Saturate ) {
-      TXT( "\nSaturate   : " );
-      ENM( inst->Instruction.Saturate, TGSI_SATS );
-   }
-   if( deflt || fi->Instruction.NumDstRegs != inst->Instruction.NumDstRegs ) {
-      TXT( "\nNumDstRegs : " );
-      UID( inst->Instruction.NumDstRegs );
-   }
-   if( deflt || fi->Instruction.NumSrcRegs != inst->Instruction.NumSrcRegs ) {
-      TXT( "\nNumSrcRegs : " );
-      UID( inst->Instruction.NumSrcRegs );
-   }
-   if( ignored ) {
-      TXT( "\nPadding    : " );
-      UIX( inst->Instruction.Padding );
-   }
-
-   if( deflt || tgsi_compare_instruction_ext_nv( inst->InstructionExtNv, fi->InstructionExtNv ) ) {
-      EOL();
-      TXT( "\nType          : " );
-      ENM( inst->InstructionExtNv.Type, TGSI_INSTRUCTION_EXTS );
-      if( deflt || fi->InstructionExtNv.Precision != inst->InstructionExtNv.Precision ) {
-         TXT( "\nPrecision     : " );
-         ENM( inst->InstructionExtNv.Precision, TGSI_PRECISIONS );
-      }
-      if( deflt || fi->InstructionExtNv.CondDstIndex != inst->InstructionExtNv.CondDstIndex ) {
-         TXT( "\nCondDstIndex  : " );
-         UID( inst->InstructionExtNv.CondDstIndex );
-      }
-      if( deflt || fi->InstructionExtNv.CondFlowIndex != inst->InstructionExtNv.CondFlowIndex ) {
-         TXT( "\nCondFlowIndex : " );
-         UID( inst->InstructionExtNv.CondFlowIndex );
-      }
-      if( deflt || fi->InstructionExtNv.CondMask != inst->InstructionExtNv.CondMask ) {
-         TXT( "\nCondMask      : " );
-         ENM( inst->InstructionExtNv.CondMask, TGSI_CCS );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleX != inst->InstructionExtNv.CondSwizzleX ) {
-         TXT( "\nCondSwizzleX  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleX, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleY != inst->InstructionExtNv.CondSwizzleY ) {
-         TXT( "\nCondSwizzleY  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleY, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleZ != inst->InstructionExtNv.CondSwizzleZ ) {
-         TXT( "\nCondSwizzleZ  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleZ, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleW != inst->InstructionExtNv.CondSwizzleW ) {
-         TXT( "\nCondSwizzleW  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleW, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondDstUpdate != inst->InstructionExtNv.CondDstUpdate ) {
-         TXT( "\nCondDstUpdate : " );
-         UID( inst->InstructionExtNv.CondDstUpdate );
-      }
-      if( deflt || fi->InstructionExtNv.CondFlowEnable != inst->InstructionExtNv.CondFlowEnable ) {
-         TXT( "\nCondFlowEnable: " );
-         UID( inst->InstructionExtNv.CondFlowEnable );
-      }
-      if( ignored ) {
-         TXT( "\nPadding       : " );
-         UIX( inst->InstructionExtNv.Padding );
-         if( deflt || fi->InstructionExtNv.Extended != inst->InstructionExtNv.Extended ) {
-            TXT( "\nExtended      : " );
-            UID( inst->InstructionExtNv.Extended );
-         }
-      }
-   }
-
-   if( deflt || tgsi_compare_instruction_ext_label( inst->InstructionExtLabel, fi->InstructionExtLabel ) ) {
-      EOL();
-      TXT( "\nType    : " );
-      ENM( inst->InstructionExtLabel.Type, TGSI_INSTRUCTION_EXTS );
-      if( deflt || fi->InstructionExtLabel.Label != inst->InstructionExtLabel.Label ) {
-         TXT( "\nLabel   : " );
-         UID( inst->InstructionExtLabel.Label );
-      }
-      if( ignored ) {
-         TXT( "\nPadding : " );
-         UIX( inst->InstructionExtLabel.Padding );
-         if( deflt || fi->InstructionExtLabel.Extended != inst->InstructionExtLabel.Extended ) {
-            TXT( "\nExtended: " );
-            UID( inst->InstructionExtLabel.Extended );
-         }
-      }
-   }
-
-   if( deflt || tgsi_compare_instruction_ext_texture( inst->InstructionExtTexture, fi->InstructionExtTexture ) ) {
-      EOL();
-      TXT( "\nType    : " );
-      ENM( inst->InstructionExtTexture.Type, TGSI_INSTRUCTION_EXTS );
-      if( deflt || fi->InstructionExtTexture.Texture != inst->InstructionExtTexture.Texture ) {
-         TXT( "\nTexture : " );
-         ENM( inst->InstructionExtTexture.Texture, TGSI_TEXTURES );
-      }
-      if( ignored ) {
-         TXT( "\nPadding : " );
-         UIX( inst->InstructionExtTexture.Padding );
-         if( deflt || fi->InstructionExtTexture.Extended != inst->InstructionExtTexture.Extended ) {
-            TXT( "\nExtended: " );
-            UID( inst->InstructionExtTexture.Extended );
-         }
-      }
-   }
-
-   for( i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
-      struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-      struct tgsi_full_dst_register *fd = &fi->FullDstRegisters[i];
-
-      EOL();
-      TXT( "\nFile     : " );
-      ENM( dst->DstRegister.File, TGSI_FILES );
-      if( deflt || fd->DstRegister.WriteMask != dst->DstRegister.WriteMask ) {
-         TXT( "\nWriteMask: " );
-         ENM( dst->DstRegister.WriteMask, TGSI_WRITEMASKS );
-      }
-      if( ignored ) {
-         if( deflt || fd->DstRegister.Indirect != dst->DstRegister.Indirect ) {
-            TXT( "\nIndirect : " );
-            UID( dst->DstRegister.Indirect );
-         }
-         if( deflt || fd->DstRegister.Dimension != dst->DstRegister.Dimension ) {
-            TXT( "\nDimension: " );
-            UID( dst->DstRegister.Dimension );
-         }
-      }
-      if( deflt || fd->DstRegister.Index != dst->DstRegister.Index ) {
-         TXT( "\nIndex    : " );
-         SID( dst->DstRegister.Index );
-      }
-      if( ignored ) {
-         TXT( "\nPadding  : " );
-         UIX( dst->DstRegister.Padding );
-         if( deflt || fd->DstRegister.Extended != dst->DstRegister.Extended ) {
-            TXT( "\nExtended : " );
-            UID( dst->DstRegister.Extended );
-         }
-      }
-
-      if( deflt || tgsi_compare_dst_register_ext_concode( dst->DstRegisterExtConcode, fd->DstRegisterExtConcode ) ) {
-         EOL();
-         TXT( "\nType        : " );
-         ENM( dst->DstRegisterExtConcode.Type, TGSI_DST_REGISTER_EXTS );
-         if( deflt || fd->DstRegisterExtConcode.CondMask != dst->DstRegisterExtConcode.CondMask ) {
-            TXT( "\nCondMask    : " );
-            ENM( dst->DstRegisterExtConcode.CondMask, TGSI_CCS );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleX != dst->DstRegisterExtConcode.CondSwizzleX ) {
-            TXT( "\nCondSwizzleX: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleX, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleY != dst->DstRegisterExtConcode.CondSwizzleY ) {
-            TXT( "\nCondSwizzleY: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleY, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleZ != dst->DstRegisterExtConcode.CondSwizzleZ ) {
-            TXT( "\nCondSwizzleZ: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleZ, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleW != dst->DstRegisterExtConcode.CondSwizzleW ) {
-            TXT( "\nCondSwizzleW: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleW, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSrcIndex != dst->DstRegisterExtConcode.CondSrcIndex ) {
-            TXT( "\nCondSrcIndex: " );
-            UID( dst->DstRegisterExtConcode.CondSrcIndex );
-         }
-         if( ignored ) {
-            TXT( "\nPadding     : " );
-            UIX( dst->DstRegisterExtConcode.Padding );
-            if( deflt || fd->DstRegisterExtConcode.Extended != dst->DstRegisterExtConcode.Extended ) {
-               TXT( "\nExtended    : " );
-               UID( dst->DstRegisterExtConcode.Extended );
-            }
-         }
-      }
-
-      if( deflt || tgsi_compare_dst_register_ext_modulate( dst->DstRegisterExtModulate, fd->DstRegisterExtModulate ) ) {
-         EOL();
-         TXT( "\nType    : " );
-         ENM( dst->DstRegisterExtModulate.Type, TGSI_DST_REGISTER_EXTS );
-         if( deflt || fd->DstRegisterExtModulate.Modulate != dst->DstRegisterExtModulate.Modulate ) {
-            TXT( "\nModulate: " );
-            ENM( dst->DstRegisterExtModulate.Modulate, TGSI_MODULATES );
-         }
-         if( ignored ) {
-            TXT( "\nPadding : " );
-            UIX( dst->DstRegisterExtModulate.Padding );
-            if( deflt || fd->DstRegisterExtModulate.Extended != dst->DstRegisterExtModulate.Extended ) {
-               TXT( "\nExtended: " );
-               UID( dst->DstRegisterExtModulate.Extended );
-            }
-         }
-      }
-   }
-
-   for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
-      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
-      struct tgsi_full_src_register *fs = &fi->FullSrcRegisters[i];
-
-      EOL();
-      TXT( "\nFile     : ");
-      ENM( src->SrcRegister.File, TGSI_FILES );
-      if( deflt || fs->SrcRegister.SwizzleX != src->SrcRegister.SwizzleX ) {
-         TXT( "\nSwizzleX : " );
-         ENM( src->SrcRegister.SwizzleX, TGSI_SWIZZLES );
-      }
-      if( deflt || fs->SrcRegister.SwizzleY != src->SrcRegister.SwizzleY ) {
-         TXT( "\nSwizzleY : " );
-         ENM( src->SrcRegister.SwizzleY, TGSI_SWIZZLES );
-      }
-      if( deflt || fs->SrcRegister.SwizzleZ != src->SrcRegister.SwizzleZ ) {
-         TXT( "\nSwizzleZ : " );
-         ENM( src->SrcRegister.SwizzleZ, TGSI_SWIZZLES );
-      }
-      if( deflt || fs->SrcRegister.SwizzleW != src->SrcRegister.SwizzleW ) {
-         TXT( "\nSwizzleW : " );
-         ENM( src->SrcRegister.SwizzleW, TGSI_SWIZZLES );
-      }
-      if( deflt || fs->SrcRegister.Negate != src->SrcRegister.Negate ) {
-         TXT( "\nNegate   : " );
-         UID( src->SrcRegister.Negate );
-      }
-      if( ignored ) {
-         if( deflt || fs->SrcRegister.Indirect != src->SrcRegister.Indirect ) {
-            TXT( "\nIndirect : " );
-            UID( src->SrcRegister.Indirect );
-         }
-         if( deflt || fs->SrcRegister.Dimension != src->SrcRegister.Dimension ) {
-            TXT( "\nDimension: " );
-            UID( src->SrcRegister.Dimension );
-         }
-      }
-      if( deflt || fs->SrcRegister.Index != src->SrcRegister.Index ) {
-         TXT( "\nIndex    : " );
-         SID( src->SrcRegister.Index );
-      }
-      if( ignored ) {
-         if( deflt || fs->SrcRegister.Extended != src->SrcRegister.Extended ) {
-            TXT( "\nExtended : " );
-            UID( src->SrcRegister.Extended );
-         }
-      }
-
-      if( deflt || tgsi_compare_src_register_ext_swz( src->SrcRegisterExtSwz, fs->SrcRegisterExtSwz ) ) {
-         EOL();
-         TXT( "\nType       : " );
-         ENM( src->SrcRegisterExtSwz.Type, TGSI_SRC_REGISTER_EXTS );
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleX != src->SrcRegisterExtSwz.ExtSwizzleX ) {
-            TXT( "\nExtSwizzleX: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleX, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleY != src->SrcRegisterExtSwz.ExtSwizzleY ) {
-            TXT( "\nExtSwizzleY: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleY, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleZ != src->SrcRegisterExtSwz.ExtSwizzleZ ) {
-            TXT( "\nExtSwizzleZ: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleW != src->SrcRegisterExtSwz.ExtSwizzleW ) {
-            TXT( "\nExtSwizzleW: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleW, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateX != src->SrcRegisterExtSwz.NegateX ) {
-            TXT( "\nNegateX   : " );
-            UID( src->SrcRegisterExtSwz.NegateX );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateY != src->SrcRegisterExtSwz.NegateY ) {
-            TXT( "\nNegateY   : " );
-            UID( src->SrcRegisterExtSwz.NegateY );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateZ != src->SrcRegisterExtSwz.NegateZ ) {
-            TXT( "\nNegateZ   : " );
-            UID( src->SrcRegisterExtSwz.NegateZ );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateW != src->SrcRegisterExtSwz.NegateW ) {
-            TXT( "\nNegateW   : " );
-            UID( src->SrcRegisterExtSwz.NegateW );
-         }
-         if( ignored ) {
-            TXT( "\nPadding   : " );
-            UIX( src->SrcRegisterExtSwz.Padding );
-            if( deflt || fs->SrcRegisterExtSwz.Extended != src->SrcRegisterExtSwz.Extended ) {
-               TXT( "\nExtended   : " );
-               UID( src->SrcRegisterExtSwz.Extended );
-            }
-         }
-      }
-
-      if( deflt || tgsi_compare_src_register_ext_mod( src->SrcRegisterExtMod, fs->SrcRegisterExtMod ) ) {
-         EOL();
-         TXT( "\nType     : " );
-         ENM( src->SrcRegisterExtMod.Type, TGSI_SRC_REGISTER_EXTS );
-         if( deflt || fs->SrcRegisterExtMod.Complement != src->SrcRegisterExtMod.Complement ) {
-            TXT( "\nComplement: " );
-            UID( src->SrcRegisterExtMod.Complement );
-         }
-         if( deflt || fs->SrcRegisterExtMod.Bias != src->SrcRegisterExtMod.Bias ) {
-            TXT( "\nBias     : " );
-            UID( src->SrcRegisterExtMod.Bias );
-         }
-         if( deflt || fs->SrcRegisterExtMod.Scale2X != src->SrcRegisterExtMod.Scale2X ) {
-            TXT( "\nScale2X   : " );
-            UID( src->SrcRegisterExtMod.Scale2X );
-         }
-         if( deflt || fs->SrcRegisterExtMod.Absolute != src->SrcRegisterExtMod.Absolute ) {
-            TXT( "\nAbsolute  : " );
-            UID( src->SrcRegisterExtMod.Absolute );
-         }
-         if( deflt || fs->SrcRegisterExtMod.Negate != src->SrcRegisterExtMod.Negate ) {
-            TXT( "\nNegate   : " );
-            UID( src->SrcRegisterExtMod.Negate );
-         }
-         if( ignored ) {
-            TXT( "\nPadding   : " );
-            UIX( src->SrcRegisterExtMod.Padding );
-            if( deflt || fs->SrcRegisterExtMod.Extended != src->SrcRegisterExtMod.Extended ) {
-               TXT( "\nExtended  : " );
-               UID( src->SrcRegisterExtMod.Extended );
-            }
-         }
-      }
-   }
-}
-
-void
-tgsi_dump_c(
-   const struct tgsi_token *tokens,
-   uint flags )
-{
-   struct tgsi_parse_context parse;
-   struct tgsi_full_instruction fi;
-   struct tgsi_full_declaration fd;
-   uint ignored = flags & TGSI_DUMP_C_IGNORED;
-   uint deflt = flags & TGSI_DUMP_C_DEFAULT;
-   uint instno = 0;
-
-   /* sanity checks */
-   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_CONT], "OPCODE_CONT") == 0);
-   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_END], "OPCODE_END") == 0);
-
-   tgsi_parse_init( &parse, tokens );
-
-   TXT( "tgsi-dump begin -----------------" );
-
-   TXT( "\nMajorVersion: " );
-   UID( parse.FullVersion.Version.MajorVersion );
-   TXT( "\nMinorVersion: " );
-   UID( parse.FullVersion.Version.MinorVersion );
-   EOL();
-
-   TXT( "\nHeaderSize: " );
-   UID( parse.FullHeader.Header.HeaderSize );
-   TXT( "\nBodySize  : " );
-   UID( parse.FullHeader.Header.BodySize );
-   TXT( "\nProcessor : " );
-   ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES );
-   EOL();
-
-   fi = tgsi_default_full_instruction();
-   fd = tgsi_default_full_declaration();
-
-   while( !tgsi_parse_end_of_tokens( &parse ) ) {
-      tgsi_parse_token( &parse );
-
-      TXT( "\nType       : " );
-      ENM( parse.FullToken.Token.Type, TGSI_TOKEN_TYPES );
-      if( ignored ) {
-         TXT( "\nSize       : " );
-         UID( parse.FullToken.Token.Size );
-         if( deflt || parse.FullToken.Token.Extended ) {
-            TXT( "\nExtended   : " );
-            UID( parse.FullToken.Token.Extended );
-         }
-      }
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         dump_declaration_verbose(
-            &parse.FullToken.FullDeclaration,
-            ignored,
-            deflt,
-            &fd );
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         dump_immediate_verbose(
-            &parse.FullToken.FullImmediate,
-            ignored );
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         dump_instruction_verbose(
-            &parse.FullToken.FullInstruction,
-            ignored,
-            deflt,
-            &fi );
-         break;
-
-      default:
-         assert( 0 );
-      }
-
-      EOL();
-   }
-
-   TXT( "\ntgsi-dump end -------------------\n" );
-
-   tgsi_parse_free( &parse );
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump_c.h b/src/gallium/auxiliary/tgsi/util/tgsi_dump_c.h
deleted file mode 100644
index d91cd35b3b..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump_c.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_DUMP_C_H
-#define TGSI_DUMP_C_H
-
-#include "pipe/p_shader_tokens.h"
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-#define TGSI_DUMP_C_IGNORED 1
-#define TGSI_DUMP_C_DEFAULT 2
-
-void
-tgsi_dump_c(
-   const struct tgsi_token *tokens,
-   uint flags );
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_DUMP_C_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_iterate.c b/src/gallium/auxiliary/tgsi/util/tgsi_iterate.c
deleted file mode 100644
index 5371a88b96..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_iterate.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_debug.h"
-#include "tgsi_iterate.h"
-
-boolean
-tgsi_iterate_shader(
-   const struct tgsi_token *tokens,
-   struct tgsi_iterate_context *ctx )
-{
-   struct tgsi_parse_context parse;
-
-   if (tgsi_parse_init( &parse, tokens ) != TGSI_PARSE_OK)
-      return FALSE;
-
-   ctx->processor = parse.FullHeader.Processor;
-   ctx->version = parse.FullVersion.Version;
-
-   if (ctx->prolog)
-      if (!ctx->prolog( ctx ))
-         goto fail;
-
-   while (!tgsi_parse_end_of_tokens( &parse )) {
-      tgsi_parse_token( &parse );
-
-      switch (parse.FullToken.Token.Type) {
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (ctx->iterate_instruction)
-            if (!ctx->iterate_instruction( ctx, &parse.FullToken.FullInstruction ))
-               goto fail;
-         break;
-
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         if (ctx->iterate_declaration)
-            if (!ctx->iterate_declaration( ctx, &parse.FullToken.FullDeclaration ))
-               goto fail;
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         if (ctx->iterate_immediate)
-            if (!ctx->iterate_immediate( ctx, &parse.FullToken.FullImmediate ))
-               goto fail;
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-
-   if (ctx->epilog)
-      if (!ctx->epilog( ctx ))
-         goto fail;
-
-   tgsi_parse_free( &parse );
-   return TRUE;
-
-fail:
-   tgsi_parse_free( &parse );
-   return FALSE;
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_iterate.h b/src/gallium/auxiliary/tgsi/util/tgsi_iterate.h
deleted file mode 100644
index f5bebf89b8..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_iterate.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_ITERATE_H
-#define TGSI_ITERATE_H
-
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-struct tgsi_iterate_context
-{
-   boolean
-   (* prolog)(
-      struct tgsi_iterate_context *ctx );
-
-   boolean
-   (* iterate_instruction)(
-      struct tgsi_iterate_context *ctx,
-      struct tgsi_full_instruction *inst );
-
-   boolean
-   (* iterate_declaration)(
-      struct tgsi_iterate_context *ctx,
-      struct tgsi_full_declaration *decl );
-
-   boolean
-   (* iterate_immediate)(
-      struct tgsi_iterate_context *ctx,
-      struct tgsi_full_immediate *imm );
-
-   boolean
-   (* epilog)(
-      struct tgsi_iterate_context *ctx );
-
-   struct tgsi_processor processor;
-   struct tgsi_version version;
-};
-
-boolean
-tgsi_iterate_shader(
-   const struct tgsi_token *tokens,
-   struct tgsi_iterate_context *ctx );
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_ITERATE_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_parse.c b/src/gallium/auxiliary/tgsi/util/tgsi_parse.c
deleted file mode 100644
index d16f0cdcad..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_parse.c
+++ /dev/null
@@ -1,332 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_debug.h"
-#include "pipe/p_util.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi_parse.h"
-#include "tgsi_build.h"
-
-void
-tgsi_full_token_init(
-   union tgsi_full_token *full_token )
-{
-   full_token->Token.Type = TGSI_TOKEN_TYPE_DECLARATION;
-}
-
-void
-tgsi_full_token_free(
-   union tgsi_full_token *full_token )
-{
-   if( full_token->Token.Type == TGSI_TOKEN_TYPE_IMMEDIATE ) {
-      FREE( (void *) full_token->FullImmediate.u.Pointer );
-   }
-}
-
-unsigned
-tgsi_parse_init(
-   struct tgsi_parse_context *ctx,
-   const struct tgsi_token *tokens )
-{
-   ctx->FullVersion.Version = *(struct tgsi_version *) &tokens[0];
-   if( ctx->FullVersion.Version.MajorVersion > 1 ) {
-      return TGSI_PARSE_ERROR;
-   }
-
-   ctx->FullHeader.Header = *(struct tgsi_header *) &tokens[1];
-   if( ctx->FullHeader.Header.HeaderSize >= 2 ) {
-      ctx->FullHeader.Processor = *(struct tgsi_processor *) &tokens[2];
-   }
-   else {
-      ctx->FullHeader.Processor = tgsi_default_processor();
-   }
-
-   ctx->Tokens = tokens;
-   ctx->Position = 1 + ctx->FullHeader.Header.HeaderSize;
-
-   tgsi_full_token_init( &ctx->FullToken );
-
-   return TGSI_PARSE_OK;
-}
-
-void
-tgsi_parse_free(
-   struct tgsi_parse_context *ctx )
-{
-   tgsi_full_token_free( &ctx->FullToken );
-}
-
-boolean
-tgsi_parse_end_of_tokens(
-   struct tgsi_parse_context *ctx )
-{
-   return ctx->Position >=
-      1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
-}
-
-static void
-next_token(
-   struct tgsi_parse_context *ctx,
-   void *token )
-{
-   assert( !tgsi_parse_end_of_tokens( ctx ) );
-
-   *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
-}
-
-void
-tgsi_parse_token(
-   struct tgsi_parse_context *ctx )
-{
-   struct tgsi_token token;
-   unsigned i;
-
-   tgsi_full_token_free( &ctx->FullToken );
-   tgsi_full_token_init( &ctx->FullToken );
-
-   next_token( ctx, &token );
-
-   switch( token.Type ) {
-   case TGSI_TOKEN_TYPE_DECLARATION:
-   {
-      struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
-
-      *decl = tgsi_default_full_declaration();
-      decl->Declaration = *(struct tgsi_declaration *) &token;
-
-      next_token( ctx, &decl->DeclarationRange );
-
-      if( decl->Declaration.Semantic ) {
-         next_token( ctx, &decl->Semantic );
-      }
-
-      break;
-   }
-
-   case TGSI_TOKEN_TYPE_IMMEDIATE:
-   {
-      struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
-
-      *imm = tgsi_default_full_immediate();
-      imm->Immediate = *(struct tgsi_immediate *) &token;
-
-      assert( !imm->Immediate.Extended );
-
-      switch (imm->Immediate.DataType) {
-      case TGSI_IMM_FLOAT32:
-         imm->u.Pointer = MALLOC(
-            sizeof( struct tgsi_immediate_float32 ) * (imm->Immediate.Size - 1) );
-         for( i = 0; i < imm->Immediate.Size - 1; i++ ) {
-            next_token( ctx, (struct tgsi_immediate_float32 *) &imm->u.ImmediateFloat32[i] );
-         }
-         break;
-
-      default:
-         assert( 0 );
-      }
-
-      break;
-   }
-
-   case TGSI_TOKEN_TYPE_INSTRUCTION:
-   {
-      struct tgsi_full_instruction *inst = &ctx->FullToken.FullInstruction;
-      unsigned extended;
-
-      *inst = tgsi_default_full_instruction();
-      inst->Instruction = *(struct tgsi_instruction *) &token;
-
-      extended = inst->Instruction.Extended;
-
-      while( extended ) {
-         struct tgsi_src_register_ext token;
-
-         next_token( ctx, &token );
-
-         switch( token.Type ) {
-         case TGSI_INSTRUCTION_EXT_TYPE_NV:
-            inst->InstructionExtNv =
-               *(struct tgsi_instruction_ext_nv *) &token;
-            break;
-
-         case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
-            inst->InstructionExtLabel =
-               *(struct tgsi_instruction_ext_label *) &token;
-            break;
-
-         case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
-            inst->InstructionExtTexture =
-               *(struct tgsi_instruction_ext_texture *) &token;
-            break;
-
-         default:
-            assert( 0 );
-         }
-
-         extended = token.Extended;
-      }
-
-      assert( inst->Instruction.NumDstRegs <= TGSI_FULL_MAX_DST_REGISTERS );
-
-      for(  i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
-         unsigned extended;
-
-         next_token( ctx, &inst->FullDstRegisters[i].DstRegister );
-
-         /*
-          * No support for indirect or multi-dimensional addressing.
-          */
-         assert( !inst->FullDstRegisters[i].DstRegister.Indirect );
-         assert( !inst->FullDstRegisters[i].DstRegister.Dimension );
-
-         extended = inst->FullDstRegisters[i].DstRegister.Extended;
-
-         while( extended ) {
-            struct tgsi_src_register_ext token;
-
-            next_token( ctx, &token );
-
-            switch( token.Type ) {
-            case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
-               inst->FullDstRegisters[i].DstRegisterExtConcode =
-                  *(struct tgsi_dst_register_ext_concode *) &token;
-               break;
-
-            case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
-               inst->FullDstRegisters[i].DstRegisterExtModulate =
-                  *(struct tgsi_dst_register_ext_modulate *) &token;
-               break;
-
-            default:
-               assert( 0 );
-            }
-
-            extended = token.Extended;
-         }
-      }
-
-      assert( inst->Instruction.NumSrcRegs <= TGSI_FULL_MAX_SRC_REGISTERS );
-
-      for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
-         unsigned extended;
-
-         next_token( ctx, &inst->FullSrcRegisters[i].SrcRegister );
-
-         extended = inst->FullSrcRegisters[i].SrcRegister.Extended;
-
-         while( extended ) {
-            struct tgsi_src_register_ext token;
-
-            next_token( ctx, &token );
-
-            switch( token.Type ) {
-            case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
-               inst->FullSrcRegisters[i].SrcRegisterExtSwz =
-                  *(struct tgsi_src_register_ext_swz *) &token;
-               break;
-
-            case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
-               inst->FullSrcRegisters[i].SrcRegisterExtMod =
-                  *(struct tgsi_src_register_ext_mod *) &token;
-               break;
-
-            default:
-               assert( 0 );
-            }
-
-            extended = token.Extended;
-         }
-
-         if( inst->FullSrcRegisters[i].SrcRegister.Indirect ) {
-            next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterInd );
-
-            /*
-             * No support for indirect or multi-dimensional addressing.
-             */
-            assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Indirect );
-            assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Dimension );
-            assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Extended );
-         }
-
-         if( inst->FullSrcRegisters[i].SrcRegister.Dimension ) {
-            next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterDim );
-
-            /*
-             * No support for multi-dimensional addressing.
-             */
-            assert( !inst->FullSrcRegisters[i].SrcRegisterDim.Dimension );
-            assert( !inst->FullSrcRegisters[i].SrcRegisterDim.Extended );
-
-            if( inst->FullSrcRegisters[i].SrcRegisterDim.Indirect ) {
-               next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterDimInd );
-
-               /*
-               * No support for indirect or multi-dimensional addressing.
-               */
-               assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Indirect );
-               assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Dimension );
-               assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Extended );
-            }
-         }
-      }
-
-      break;
-   }
-
-   default:
-      assert( 0 );
-   }
-}
-
-
-unsigned
-tgsi_num_tokens(const struct tgsi_token *tokens)
-{
-   struct tgsi_parse_context ctx;
-   if (tgsi_parse_init(&ctx, tokens) == TGSI_PARSE_OK) {
-      unsigned len = (ctx.FullHeader.Header.HeaderSize +
-                      ctx.FullHeader.Header.BodySize +
-                      1);
-      return len;
-   }
-   return 0;
-}
-
-
-/**
- * Make a new copy of a token array.
- */
-struct tgsi_token *
-tgsi_dup_tokens(const struct tgsi_token *tokens)
-{
-   unsigned n = tgsi_num_tokens(tokens);
-   unsigned bytes = n * sizeof(struct tgsi_token);
-   struct tgsi_token *new_tokens = (struct tgsi_token *) MALLOC(bytes);
-   if (new_tokens)
-      memcpy(new_tokens, tokens, bytes);
-   return new_tokens;
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_parse.h b/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
deleted file mode 100644
index 054350712d..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_PARSE_H
-#define TGSI_PARSE_H
-
-#include "pipe/p_shader_tokens.h"
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-struct tgsi_full_version
-{
-   struct tgsi_version  Version;
-};
-
-struct tgsi_full_header
-{
-   struct tgsi_header      Header;
-   struct tgsi_processor   Processor;
-};
-
-struct tgsi_full_dst_register
-{
-   struct tgsi_dst_register               DstRegister;
-   struct tgsi_dst_register_ext_concode   DstRegisterExtConcode;
-   struct tgsi_dst_register_ext_modulate  DstRegisterExtModulate;
-};
-
-struct tgsi_full_src_register
-{
-   struct tgsi_src_register         SrcRegister;
-   struct tgsi_src_register_ext_swz SrcRegisterExtSwz;
-   struct tgsi_src_register_ext_mod SrcRegisterExtMod;
-   struct tgsi_src_register         SrcRegisterInd;
-   struct tgsi_dimension            SrcRegisterDim;
-   struct tgsi_src_register         SrcRegisterDimInd;
-};
-
-struct tgsi_full_declaration
-{
-   struct tgsi_declaration Declaration;
-   struct tgsi_declaration_range DeclarationRange;
-   struct tgsi_declaration_semantic Semantic;
-};
-
-struct tgsi_full_immediate
-{
-   struct tgsi_immediate   Immediate;
-   union
-   {
-      const void                          *Pointer;
-      const struct tgsi_immediate_float32 *ImmediateFloat32;
-   } u;
-};
-
-#define TGSI_FULL_MAX_DST_REGISTERS 2
-#define TGSI_FULL_MAX_SRC_REGISTERS 4 /* TXD has 4 */
-
-struct tgsi_full_instruction
-{
-   struct tgsi_instruction             Instruction;
-   struct tgsi_instruction_ext_nv      InstructionExtNv;
-   struct tgsi_instruction_ext_label   InstructionExtLabel;
-   struct tgsi_instruction_ext_texture InstructionExtTexture;
-   struct tgsi_full_dst_register       FullDstRegisters[TGSI_FULL_MAX_DST_REGISTERS];
-   struct tgsi_full_src_register       FullSrcRegisters[TGSI_FULL_MAX_SRC_REGISTERS];
-};
-
-union tgsi_full_token
-{
-   struct tgsi_token             Token;
-   struct tgsi_full_declaration  FullDeclaration;
-   struct tgsi_full_immediate    FullImmediate;
-   struct tgsi_full_instruction  FullInstruction;
-};
-
-void
-tgsi_full_token_init(
-   union tgsi_full_token *full_token );
-
-void
-tgsi_full_token_free(
-   union tgsi_full_token *full_token );
-
-struct tgsi_parse_context
-{
-   const struct tgsi_token    *Tokens;
-   unsigned                   Position;
-   struct tgsi_full_version   FullVersion;
-   struct tgsi_full_header    FullHeader;
-   union tgsi_full_token      FullToken;
-};
-
-#define TGSI_PARSE_OK      0
-#define TGSI_PARSE_ERROR   1
-
-unsigned
-tgsi_parse_init(
-   struct tgsi_parse_context *ctx,
-   const struct tgsi_token *tokens );
-
-void
-tgsi_parse_free(
-   struct tgsi_parse_context *ctx );
-
-boolean
-tgsi_parse_end_of_tokens(
-   struct tgsi_parse_context *ctx );
-
-void
-tgsi_parse_token(
-   struct tgsi_parse_context *ctx );
-
-unsigned
-tgsi_num_tokens(const struct tgsi_token *tokens);
-
-struct tgsi_token *
-tgsi_dup_tokens(const struct tgsi_token *tokens);
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_PARSE_H */
-
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/util/tgsi_sanity.c
deleted file mode 100644
index 2e3ec96b5b..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_sanity.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_debug.h"
-#include "tgsi_sanity.h"
-#include "tgsi_iterate.h"
-
-#define MAX_REGISTERS 256
-
-typedef uint reg_flag;
-
-#define BITS_IN_REG_FLAG (sizeof( reg_flag ) * 8)
-
-struct sanity_check_ctx
-{
-   struct tgsi_iterate_context iter;
-
-   reg_flag regs_decl[TGSI_FILE_COUNT][MAX_REGISTERS / BITS_IN_REG_FLAG];
-   reg_flag regs_used[TGSI_FILE_COUNT][MAX_REGISTERS / BITS_IN_REG_FLAG];
-   boolean regs_ind_used[TGSI_FILE_COUNT];
-   uint num_imms;
-   uint num_instructions;
-   uint index_of_END;
-
-   uint errors;
-   uint warnings;
-};
-
-static void
-report_error(
-   struct sanity_check_ctx *ctx,
-   const char *format,
-   ... )
-{
-   va_list args;
-
-   debug_printf( "Error  : " );
-   va_start( args, format );
-   _debug_vprintf( format, args );
-   va_end( args );
-   debug_printf( "\n" );
-   ctx->errors++;
-}
-
-static void
-report_warning(
-   struct sanity_check_ctx *ctx,
-   const char *format,
-   ... )
-{
-   va_list args;
-
-   debug_printf( "Warning: " );
-   va_start( args, format );
-   _debug_vprintf( format, args );
-   va_end( args );
-   debug_printf( "\n" );
-   ctx->warnings++;
-}
-
-static boolean
-check_file_name(
-   struct sanity_check_ctx *ctx,
-   uint file )
-{
-   if (file <= TGSI_FILE_NULL || file >= TGSI_FILE_COUNT) {
-      report_error( ctx, "Invalid register file name" );
-      return FALSE;
-   }
-   return TRUE;
-}
-
-static boolean
-is_register_declared(
-   struct sanity_check_ctx *ctx,
-   uint file,
-   int index )
-{
-   assert( index >= 0 && index < MAX_REGISTERS );
-
-   return (ctx->regs_decl[file][index / BITS_IN_REG_FLAG] & (1 << (index % BITS_IN_REG_FLAG))) ? TRUE : FALSE;
-}
-
-static boolean
-is_any_register_declared(
-   struct sanity_check_ctx *ctx,
-   uint file )
-{
-   uint i;
-
-   for (i = 0; i < MAX_REGISTERS / BITS_IN_REG_FLAG; i++)
-      if (ctx->regs_decl[file][i])
-         return TRUE;
-   return FALSE;
-}
-
-static boolean
-is_register_used(
-   struct sanity_check_ctx *ctx,
-   uint file,
-   int index )
-{
-   assert( index < MAX_REGISTERS );
-
-   return (ctx->regs_used[file][index / BITS_IN_REG_FLAG] & (1 << (index % BITS_IN_REG_FLAG))) ? TRUE : FALSE;
-}
-
-static const char *file_names[] =
-{
-   "NULL",
-   "CONST",
-   "IN",
-   "OUT",
-   "TEMP",
-   "SAMP",
-   "ADDR",
-   "IMM"
-};
-
-static boolean
-check_register_usage(
-   struct sanity_check_ctx *ctx,
-   uint file,
-   int index,
-   const char *name,
-   boolean indirect_access )
-{
-   if (!check_file_name( ctx, file ))
-      return FALSE;
-   if (indirect_access) {
-      if (!is_any_register_declared( ctx, file ))
-         report_error( ctx, "%s: Undeclared %s register", file_names[file], name );
-      ctx->regs_ind_used[file] = TRUE;
-   }
-   else {
-      if (!is_register_declared( ctx, file, index ))
-         report_error( ctx, "%s[%d]: Undeclared %s register", file_names[file], index, name );
-      ctx->regs_used[file][index / BITS_IN_REG_FLAG] |= (1 << (index % BITS_IN_REG_FLAG));
-   }
-   return TRUE;
-}
-
-static boolean
-iter_instruction(
-   struct tgsi_iterate_context *iter,
-   struct tgsi_full_instruction *inst )
-{
-   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
-   uint i;
-
-   /* There must be no other instructions after END.
-    */
-   if (ctx->index_of_END != ~0) {
-      report_error( ctx, "Unexpected instruction after END" );
-   }
-   else if (inst->Instruction.Opcode == TGSI_OPCODE_END) {
-      ctx->index_of_END = ctx->num_instructions;
-   }
-
-   /* Check destination and source registers' validity.
-    * Mark the registers as used.
-    */
-   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
-      check_register_usage(
-         ctx,
-         inst->FullDstRegisters[i].DstRegister.File,
-         inst->FullDstRegisters[i].DstRegister.Index,
-         "destination",
-         FALSE );
-   }
-   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-      check_register_usage(
-         ctx,
-         inst->FullSrcRegisters[i].SrcRegister.File,
-         inst->FullSrcRegisters[i].SrcRegister.Index,
-         "source",
-         (boolean)inst->FullSrcRegisters[i].SrcRegister.Indirect );
-      if (inst->FullSrcRegisters[i].SrcRegister.Indirect) {
-         uint file;
-         int index;
-
-         file = inst->FullSrcRegisters[i].SrcRegisterInd.File;
-         index = inst->FullSrcRegisters[i].SrcRegisterInd.Index;
-         check_register_usage(
-            ctx,
-            file,
-            index,
-            "indirect",
-            FALSE );
-         if (file != TGSI_FILE_ADDRESS || index != 0)
-            report_warning( ctx, "Indirect register not ADDR[0]" );
-      }
-   }
-
-   ctx->num_instructions++;
-
-   return TRUE;
-}
-
-static boolean
-iter_declaration(
-   struct tgsi_iterate_context *iter,
-   struct tgsi_full_declaration *decl )
-{
-   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
-   uint file;
-   uint i;
-
-   /* No declarations allowed after the first instruction.
-    */
-   if (ctx->num_instructions > 0)
-      report_error( ctx, "Instruction expected but declaration found" );
-
-   /* Check registers' validity.
-    * Mark the registers as declared.
-    */
-   file = decl->Declaration.File;
-   if (!check_file_name( ctx, file ))
-      return TRUE;
-   for (i = decl->DeclarationRange.First; i <= decl->DeclarationRange.Last; i++) {
-      if (is_register_declared( ctx, file, i ))
-         report_error( ctx, "The same register declared twice" );
-      ctx->regs_decl[file][i / BITS_IN_REG_FLAG] |= (1 << (i % BITS_IN_REG_FLAG));
-   }
-
-   return TRUE;
-}
-
-static boolean
-iter_immediate(
-   struct tgsi_iterate_context *iter,
-   struct tgsi_full_immediate *imm )
-{
-   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
-
-   assert( ctx->num_imms < MAX_REGISTERS );
-
-   /* No immediates allowed after the first instruction.
-    */
-   if (ctx->num_instructions > 0)
-      report_error( ctx, "Instruction expected but immediate found" );
-
-   /* Mark the register as declared.
-    */
-   ctx->regs_decl[TGSI_FILE_IMMEDIATE][ctx->num_imms / BITS_IN_REG_FLAG] |= (1 << (ctx->num_imms % BITS_IN_REG_FLAG));
-   ctx->num_imms++;
-
-   /* Check data type validity.
-    */
-   if (imm->Immediate.DataType != TGSI_IMM_FLOAT32) {
-      report_error( ctx, "Invalid immediate data type" );
-      return TRUE;
-   }
-
-   return TRUE;
-}
-
-static boolean
-epilog(
-   struct tgsi_iterate_context *iter )
-{
-   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
-   uint file;
-
-   /* There must be an END instruction at the end.
-    */
-   if (ctx->index_of_END == ~0 || ctx->index_of_END != ctx->num_instructions - 1) {
-      report_error( ctx, "Expected END at end of instruction sequence" );
-   }
-
-   /* Check if all declared registers were used.
-    */
-   for (file = TGSI_FILE_NULL; file < TGSI_FILE_COUNT; file++) {
-      uint i;
-
-      for (i = 0; i < MAX_REGISTERS; i++) {
-         if (is_register_declared( ctx, file, i ) && !is_register_used( ctx, file, i ) && !ctx->regs_ind_used[file]) {
-            report_warning( ctx, "Register never used" );
-         }
-      }
-   }
-
-   /* Print totals, if any.
-    */
-   if (ctx->errors || ctx->warnings)
-      debug_printf( "\n%u errors, %u warnings", ctx->errors, ctx->warnings );
-
-   return TRUE;
-}
-
-boolean
-tgsi_sanity_check(
-   struct tgsi_token *tokens )
-{
-   struct sanity_check_ctx ctx;
-
-   ctx.iter.prolog = NULL;
-   ctx.iter.iterate_instruction = iter_instruction;
-   ctx.iter.iterate_declaration = iter_declaration;
-   ctx.iter.iterate_immediate = iter_immediate;
-   ctx.iter.epilog = epilog;
-
-   memset( ctx.regs_decl, 0, sizeof( ctx.regs_decl ) );
-   memset( ctx.regs_used, 0, sizeof( ctx.regs_used ) );
-   memset( ctx.regs_ind_used, 0, sizeof( ctx.regs_ind_used ) );
-   ctx.num_imms = 0;
-   ctx.num_instructions = 0;
-   ctx.index_of_END = ~0;
-
-   ctx.errors = 0;
-   ctx.warnings = 0;
-
-   if (!tgsi_iterate_shader( tokens, &ctx.iter ))
-      return FALSE;
-
-   return ctx.errors == 0;
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/util/tgsi_sanity.h
deleted file mode 100644
index ca45e94c7a..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_sanity.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_SANITY_H
-#define TGSI_SANITY_H
-
-#include "pipe/p_shader_tokens.h"
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-/* Check the given token stream for errors and common mistakes.
- * Diagnostic messages are printed out to the debug output.
- * Returns TRUE if there are no errors, even though there could be some warnings.
- */
-boolean
-tgsi_sanity_check(
-   struct tgsi_token *tokens );
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_SANITY_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_scan.c b/src/gallium/auxiliary/tgsi/util/tgsi_scan.c
deleted file mode 100644
index 240aaaf362..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_scan.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * TGSI program scan utility.
- * Used to determine which registers and instructions are used by a shader.
- *
- * Authors:  Brian Paul
- */
-
-
-#include "tgsi_scan.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_build.h"
-
-#include "pipe/p_util.h"
-
-
-
-/**
- */
-void
-tgsi_scan_shader(const struct tgsi_token *tokens,
-                 struct tgsi_shader_info *info)
-{
-   uint procType, i;
-   struct tgsi_parse_context parse;
-
-   memset(info, 0, sizeof(*info));
-   for (i = 0; i < TGSI_FILE_COUNT; i++)
-      info->file_max[i] = -1;
-
-   /**
-    ** Setup to begin parsing input shader
-    **/
-   if (tgsi_parse_init( &parse, tokens ) != TGSI_PARSE_OK) {
-      debug_printf("tgsi_parse_init() failed in tgsi_scan_shader()!\n");
-      return;
-   }
-   procType = parse.FullHeader.Processor.Processor;
-   assert(procType == TGSI_PROCESSOR_FRAGMENT ||
-          procType == TGSI_PROCESSOR_VERTEX ||
-          procType == TGSI_PROCESSOR_GEOMETRY);
-
-
-   /**
-    ** Loop over incoming program tokens/instructions
-    */
-   while( !tgsi_parse_end_of_tokens( &parse ) ) {
-
-      info->num_tokens++;
-
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         {
-            struct tgsi_full_instruction *fullinst
-               = &parse.FullToken.FullInstruction;
-
-            assert(fullinst->Instruction.Opcode < TGSI_OPCODE_LAST);
-            info->opcode_count[fullinst->Instruction.Opcode]++;
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         {
-            struct tgsi_full_declaration *fulldecl
-               = &parse.FullToken.FullDeclaration;
-            uint file = fulldecl->Declaration.File;
-            uint i;
-            for (i = fulldecl->DeclarationRange.First;
-                 i <= fulldecl->DeclarationRange.Last;
-                 i++) {
-
-               /* only first 32 regs will appear in this bitfield */
-               info->file_mask[file] |= (1 << i);
-               info->file_count[file]++;
-               info->file_max[file] = MAX2(info->file_max[file], (int)i);
-
-               if (file == TGSI_FILE_INPUT) {
-                  info->input_semantic_name[i] = (ubyte)fulldecl->Semantic.SemanticName;
-                  info->input_semantic_index[i] = (ubyte)fulldecl->Semantic.SemanticIndex;
-                  info->num_inputs++;
-               }
-
-               if (file == TGSI_FILE_OUTPUT) {
-                  info->output_semantic_name[i] = (ubyte)fulldecl->Semantic.SemanticName;
-                  info->output_semantic_index[i] = (ubyte)fulldecl->Semantic.SemanticIndex;
-                  info->num_outputs++;
-               }
-
-               /* special case */
-               if (procType == TGSI_PROCESSOR_FRAGMENT &&
-                   file == TGSI_FILE_OUTPUT &&
-                   fulldecl->Semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
-                  info->writes_z = TRUE;
-               }
-            }
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         info->immediate_count++;
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-
-   assert( info->file_max[TGSI_FILE_INPUT] + 1 == info->num_inputs );
-   assert( info->file_max[TGSI_FILE_OUTPUT] + 1 == info->num_outputs );
-
-   info->uses_kill = (info->opcode_count[TGSI_OPCODE_KIL] ||
-                      info->opcode_count[TGSI_OPCODE_KILP]);
-
-   tgsi_parse_free (&parse);
-}
-
-
-
-/**
- * Check if the given shader is a "passthrough" shader consisting of only
- * MOV instructions of the form:  MOV OUT[n], IN[n]
- *  
- */
-boolean
-tgsi_is_passthrough_shader(const struct tgsi_token *tokens)
-{
-   struct tgsi_parse_context parse;
-
-   /**
-    ** Setup to begin parsing input shader
-    **/
-   if (tgsi_parse_init(&parse, tokens) != TGSI_PARSE_OK) {
-      debug_printf("tgsi_parse_init() failed in tgsi_is_passthrough_shader()!\n");
-      return FALSE;
-   }
-
-   /**
-    ** Loop over incoming program tokens/instructions
-    */
-   while (!tgsi_parse_end_of_tokens(&parse)) {
-
-      tgsi_parse_token(&parse);
-
-      switch (parse.FullToken.Token.Type) {
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         {
-            struct tgsi_full_instruction *fullinst =
-               &parse.FullToken.FullInstruction;
-            const struct tgsi_full_src_register *src =
-               &fullinst->FullSrcRegisters[0];
-            const struct tgsi_full_dst_register *dst =
-               &fullinst->FullDstRegisters[0];
-
-            /* Do a whole bunch of checks for a simple move */
-            if (fullinst->Instruction.Opcode != TGSI_OPCODE_MOV ||
-                src->SrcRegister.File != TGSI_FILE_INPUT ||
-                dst->DstRegister.File != TGSI_FILE_OUTPUT ||
-                src->SrcRegister.Index != dst->DstRegister.Index ||
-
-                src->SrcRegister.Negate ||
-                src->SrcRegisterExtMod.Negate ||
-                src->SrcRegisterExtMod.Absolute ||
-                src->SrcRegisterExtMod.Scale2X ||
-                src->SrcRegisterExtMod.Bias ||
-                src->SrcRegisterExtMod.Complement ||
-
-                src->SrcRegister.SwizzleX != TGSI_SWIZZLE_X ||
-                src->SrcRegister.SwizzleY != TGSI_SWIZZLE_Y ||
-                src->SrcRegister.SwizzleZ != TGSI_SWIZZLE_Z ||
-                src->SrcRegister.SwizzleW != TGSI_SWIZZLE_W ||
-
-                src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X ||
-                src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y ||
-                src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z ||
-                src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W ||
-
-                dst->DstRegister.WriteMask != TGSI_WRITEMASK_XYZW)
-            {
-               tgsi_parse_free(&parse);
-               return FALSE;
-            }
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         /* fall-through */
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* fall-through */
-      default:
-         ; /* no-op */
-      }
-   }
-
-   tgsi_parse_free(&parse);
-
-   /* if we get here, it's a pass-through shader */
-   return TRUE;
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_scan.h b/src/gallium/auxiliary/tgsi/util/tgsi_scan.h
deleted file mode 100644
index 5cb6efb343..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_scan.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_SCAN_H
-#define TGSI_SCAN_H
-
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_state.h"
-#include "pipe/p_shader_tokens.h"
-
-
-/**
- * Shader summary info
- */
-struct tgsi_shader_info
-{
-   uint num_tokens;
-
-   /* XXX eventually remove the corresponding fields from pipe_shader_state: */
-   ubyte num_inputs;
-   ubyte num_outputs;
-   ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
-   ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
-   ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
-   ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
-
-   uint file_mask[TGSI_FILE_COUNT];  /**< bitmask of declared registers */
-   uint file_count[TGSI_FILE_COUNT];  /**< number of declared registers */
-   int file_max[TGSI_FILE_COUNT];  /**< highest index of declared registers */
-
-   uint immediate_count; /**< number of immediates declared */
-
-   uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
-
-   boolean writes_z;  /**< does fragment shader write Z value? */
-   boolean uses_kill;  /**< KIL or KILP instruction used? */
-};
-
-
-extern void
-tgsi_scan_shader(const struct tgsi_token *tokens,
-                 struct tgsi_shader_info *info);
-
-
-extern boolean
-tgsi_is_passthrough_shader(const struct tgsi_token *tokens);
-
-
-#endif /* TGSI_SCAN_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_text.c b/src/gallium/auxiliary/tgsi/util/tgsi_text.c
deleted file mode 100644
index 35cb3055bb..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_text.c
+++ /dev/null
@@ -1,1221 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_debug.h"
-#include "tgsi_text.h"
-#include "tgsi_build.h"
-#include "tgsi_parse.h"
-#include "tgsi_sanity.h"
-#include "tgsi_util.h"
-
-static boolean is_alpha_underscore( const char *cur )
-{
-   return
-      (*cur >= 'a' && *cur <= 'z') ||
-      (*cur >= 'A' && *cur <= 'Z') ||
-      *cur == '_';
-}
-
-static boolean is_digit( const char *cur )
-{
-   return *cur >= '0' && *cur <= '9';
-}
-
-static boolean is_digit_alpha_underscore( const char *cur )
-{
-   return is_digit( cur ) || is_alpha_underscore( cur );
-}
-
-static boolean str_match_no_case( const char **pcur, const char *str )
-{
-   const char *cur = *pcur;
-
-   while (*str != '\0' && *str == toupper( *cur )) {
-      str++;
-      cur++;
-   }
-   if (*str == '\0') {
-      *pcur = cur;
-      return TRUE;
-   }
-   return FALSE;
-}
-
-/* Eat zero or more whitespaces.
- */
-static void eat_opt_white( const char **pcur )
-{
-   while (**pcur == ' ' || **pcur == '\t' || **pcur == '\n')
-      (*pcur)++;
-}
-
-/* Eat one or more whitespaces.
- * Return TRUE if at least one whitespace eaten.
- */
-static boolean eat_white( const char **pcur )
-{
-   const char *cur = *pcur;
-
-   eat_opt_white( pcur );
-   return *pcur > cur;
-}
-
-/* Parse unsigned integer.
- * No checks for overflow.
- */
-static boolean parse_uint( const char **pcur, uint *val )
-{
-   const char *cur = *pcur;
-
-   if (is_digit( cur )) {
-      *val = *cur++ - '0';
-      while (is_digit( cur ))
-         *val = *val * 10 + *cur++ - '0';
-      *pcur = cur;
-      return TRUE;
-   }
-   return FALSE;
-}
-
-/* Parse floating point.
- */
-static boolean parse_float( const char **pcur, float *val )
-{
-   const char *cur = *pcur;
-   boolean integral_part = FALSE;
-   boolean fractional_part = FALSE;
-
-   *val = (float) atof( cur );
-
-   if (*cur == '-' || *cur == '+')
-      cur++;
-   if (is_digit( cur )) {
-      cur++;
-      integral_part = TRUE;
-      while (is_digit( cur ))
-         cur++;
-   }
-   if (*cur == '.') {
-      cur++;
-      if (is_digit( cur )) {
-         cur++;
-         fractional_part = TRUE;
-         while (is_digit( cur ))
-            cur++;
-      }
-   }
-   if (!integral_part && !fractional_part)
-      return FALSE;
-   if (toupper( *cur ) == 'E') {
-      cur++;
-      if (*cur == '-' || *cur == '+')
-         cur++;
-      if (is_digit( cur )) {
-         cur++;
-         while (is_digit( cur ))
-            cur++;
-      }
-      else
-         return FALSE;
-   }
-   *pcur = cur;
-   return TRUE;
-}
-
-struct translate_ctx
-{
-   const char *text;
-   const char *cur;
-   struct tgsi_token *tokens;
-   struct tgsi_token *tokens_cur;
-   struct tgsi_token *tokens_end;
-   struct tgsi_header *header;
-};
-
-static void report_error( struct translate_ctx *ctx, const char *msg )
-{
-   debug_printf( "\nError: %s", msg );
-}
-
-/* Parse shader header.
- * Return TRUE for one of the following headers.
- *    FRAG1.1
- *    GEOM1.1
- *    VERT1.1
- */
-static boolean parse_header( struct translate_ctx *ctx )
-{
-   uint processor;
-
-   if (str_match_no_case( &ctx->cur, "FRAG1.1" ))
-      processor = TGSI_PROCESSOR_FRAGMENT;
-   else if (str_match_no_case( &ctx->cur, "VERT1.1" ))
-      processor = TGSI_PROCESSOR_VERTEX;
-   else if (str_match_no_case( &ctx->cur, "GEOM1.1" ))
-      processor = TGSI_PROCESSOR_GEOMETRY;
-   else {
-      report_error( ctx, "Unknown header" );
-      return FALSE;
-   }
-
-   if (ctx->tokens_cur >= ctx->tokens_end)
-      return FALSE;
-   *(struct tgsi_version *) ctx->tokens_cur++ = tgsi_build_version();
-
-   if (ctx->tokens_cur >= ctx->tokens_end)
-      return FALSE;
-   ctx->header = (struct tgsi_header *) ctx->tokens_cur++;
-   *ctx->header = tgsi_build_header();
-
-   if (ctx->tokens_cur >= ctx->tokens_end)
-      return FALSE;
-   *(struct tgsi_processor *) ctx->tokens_cur++ = tgsi_build_processor( processor, ctx->header );
-
-   return TRUE;
-}
-
-static boolean parse_label( struct translate_ctx *ctx, uint *val )
-{
-   const char *cur = ctx->cur;
-
-   if (parse_uint( &cur, val )) {
-      eat_opt_white( &cur );
-      if (*cur == ':') {
-         cur++;
-         ctx->cur = cur;
-         return TRUE;
-      }
-   }
-   return FALSE;
-}
-
-static const char *file_names[TGSI_FILE_COUNT] =
-{
-   "NULL",
-   "CONST",
-   "IN",
-   "OUT",
-   "TEMP",
-   "SAMP",
-   "ADDR",
-   "IMM"
-};
-
-static boolean
-parse_file( const char **pcur, uint *file )
-{
-   uint i;
-
-   for (i = 0; i < TGSI_FILE_COUNT; i++) {
-      const char *cur = *pcur;
-
-      if (str_match_no_case( &cur, file_names[i] )) {
-         if (!is_digit_alpha_underscore( cur )) {
-            *pcur = cur;
-            *file = i;
-            return TRUE;
-         }
-      }
-   }
-   return FALSE;
-}
-
-static boolean
-parse_opt_writemask(
-   struct translate_ctx *ctx,
-   uint *writemask )
-{
-   const char *cur;
-
-   cur = ctx->cur;
-   eat_opt_white( &cur );
-   if (*cur == '.') {
-      cur++;
-      *writemask = TGSI_WRITEMASK_NONE;
-      eat_opt_white( &cur );
-      if (toupper( *cur ) == 'X') {
-         cur++;
-         *writemask |= TGSI_WRITEMASK_X;
-      }
-      if (toupper( *cur ) == 'Y') {
-         cur++;
-         *writemask |= TGSI_WRITEMASK_Y;
-      }
-      if (toupper( *cur ) == 'Z') {
-         cur++;
-         *writemask |= TGSI_WRITEMASK_Z;
-      }
-      if (toupper( *cur ) == 'W') {
-         cur++;
-         *writemask |= TGSI_WRITEMASK_W;
-      }
-
-      if (*writemask == TGSI_WRITEMASK_NONE) {
-         report_error( ctx, "Writemask expected" );
-         return FALSE;
-      }
-
-      ctx->cur = cur;
-   }
-   else {
-      *writemask = TGSI_WRITEMASK_XYZW;
-   }
-   return TRUE;
-}
-
-/* <register_file_bracket> ::= <file> `['
- */
-static boolean
-parse_register_file_bracket(
-   struct translate_ctx *ctx,
-   uint *file )
-{
-   if (!parse_file( &ctx->cur, file )) {
-      report_error( ctx, "Unknown register file" );
-      return FALSE;
-   }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '[') {
-      report_error( ctx, "Expected `['" );
-      return FALSE;
-   }
-   ctx->cur++;
-   return TRUE;
-}
-
-/* <register_file_bracket_index> ::= <register_file_bracket> <uint>
- */
-static boolean
-parse_register_file_bracket_index(
-   struct translate_ctx *ctx,
-   uint *file,
-   int *index )
-{
-   uint uindex;
-
-   if (!parse_register_file_bracket( ctx, file ))
-      return FALSE;
-   eat_opt_white( &ctx->cur );
-   if (!parse_uint( &ctx->cur, &uindex )) {
-      report_error( ctx, "Expected literal unsigned integer" );
-      return FALSE;
-   }
-   *index = (int) uindex;
-   return TRUE;
-}
-
-/* Parse destination register operand.
- *    <register_dst> ::= <register_file_bracket_index> `]'
- */
-static boolean
-parse_register_dst(
-   struct translate_ctx *ctx,
-   uint *file,
-   int *index )
-{
-   if (!parse_register_file_bracket_index( ctx, file, index ))
-      return FALSE;
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != ']') {
-      report_error( ctx, "Expected `]'" );
-      return FALSE;
-   }
-   ctx->cur++;
-   return TRUE;
-}
-
-/* Parse source register operand.
- *    <register_src> ::= <register_file_bracket_index> `]' |
- *                       <register_file_bracket> <register_dst> `]' |
- *                       <register_file_bracket> <register_dst> `+' <uint> `]' |
- *                       <register_file_bracket> <register_dst> `-' <uint> `]'
- */
-static boolean
-parse_register_src(
-   struct translate_ctx *ctx,
-   uint *file,
-   int *index,
-   uint *ind_file,
-   int *ind_index )
-{
-   const char *cur;
-   uint uindex;
-
-   if (!parse_register_file_bracket( ctx, file ))
-      return FALSE;
-   eat_opt_white( &ctx->cur );
-   cur = ctx->cur;
-   if (parse_file( &cur, ind_file )) {
-      if (!parse_register_dst( ctx, ind_file, ind_index ))
-         return FALSE;
-      eat_opt_white( &ctx->cur );
-      if (*ctx->cur == '+' || *ctx->cur == '-') {
-         boolean negate;
-
-         negate = *ctx->cur == '-';
-         ctx->cur++;
-         eat_opt_white( &ctx->cur );
-         if (!parse_uint( &ctx->cur, &uindex )) {
-            report_error( ctx, "Expected literal unsigned integer" );
-            return FALSE;
-         }
-         if (negate)
-            *index = -(int) uindex;
-         else
-            *index = (int) uindex;
-      }
-      else {
-         *index = 0;
-      }
-   }
-   else {
-      if (!parse_uint( &ctx->cur, &uindex )) {
-         report_error( ctx, "Expected literal unsigned integer" );
-         return FALSE;
-      }
-      *index = (int) uindex;
-      *ind_file = TGSI_FILE_NULL;
-      *ind_index = 0;
-   }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != ']') {
-      report_error( ctx, "Expected `]'" );
-      return FALSE;
-   }
-   ctx->cur++;
-   return TRUE;
-}
-
-/* Parse register declaration.
- *    <register_dcl> ::= <register_file_bracket_index> `]' |
- *                       <register_file_bracket_index> `..' <index> `]'
- */
-static boolean
-parse_register_dcl(
-   struct translate_ctx *ctx,
-   uint *file,
-   int *first,
-   int *last )
-{
-   if (!parse_register_file_bracket_index( ctx, file, first ))
-      return FALSE;
-   eat_opt_white( &ctx->cur );
-   if (ctx->cur[0] == '.' && ctx->cur[1] == '.') {
-      uint uindex;
-
-      ctx->cur += 2;
-      eat_opt_white( &ctx->cur );
-      if (!parse_uint( &ctx->cur, &uindex )) {
-         report_error( ctx, "Expected literal integer" );
-         return FALSE;
-      }
-      *last = (int) uindex;
-      eat_opt_white( &ctx->cur );
-   }
-   else {
-      *last = *first;
-   }
-   if (*ctx->cur != ']') {
-      report_error( ctx, "Expected `]' or `..'" );
-      return FALSE;
-   }
-   ctx->cur++;
-   return TRUE;
-}
-
-static const char *modulate_names[TGSI_MODULATE_COUNT] =
-{
-   "_1X",
-   "_2X",
-   "_4X",
-   "_8X",
-   "_D2",
-   "_D4",
-   "_D8"
-};
-
-static boolean
-parse_dst_operand(
-   struct translate_ctx *ctx,
-   struct tgsi_full_dst_register *dst )
-{
-   uint file;
-   int index;
-   uint writemask;
-   const char *cur;
-
-   if (!parse_register_dst( ctx, &file, &index ))
-      return FALSE;
-
-   cur = ctx->cur;
-   eat_opt_white( &cur );
-   if (*cur == '_') {
-      uint i;
-
-      for (i = 0; i < TGSI_MODULATE_COUNT; i++) {
-         if (str_match_no_case( &cur, modulate_names[i] )) {
-            if (!is_digit_alpha_underscore( cur )) {
-               dst->DstRegisterExtModulate.Modulate = i;
-               ctx->cur = cur;
-               break;
-            }
-         }
-      }
-   }
-
-   if (!parse_opt_writemask( ctx, &writemask ))
-      return FALSE;
-
-   dst->DstRegister.File = file;
-   dst->DstRegister.Index = index;
-   dst->DstRegister.WriteMask = writemask;
-   return TRUE;
-}
-
-static boolean
-parse_optional_swizzle(
-   struct translate_ctx *ctx,
-   uint swizzle[4],
-   boolean *parsed_swizzle,
-   boolean *parsed_extswizzle )
-{
-   const char *cur = ctx->cur;
-
-   *parsed_swizzle = FALSE;
-   *parsed_extswizzle = FALSE;
-
-   eat_opt_white( &cur );
-   if (*cur == '.') {
-      uint i;
-
-      cur++;
-      eat_opt_white( &cur );
-      for (i = 0; i < 4; i++) {
-         if (toupper( *cur ) == 'X')
-            swizzle[i] = TGSI_SWIZZLE_X;
-         else if (toupper( *cur ) == 'Y')
-            swizzle[i] = TGSI_SWIZZLE_Y;
-         else if (toupper( *cur ) == 'Z')
-            swizzle[i] = TGSI_SWIZZLE_Z;
-         else if (toupper( *cur ) == 'W')
-            swizzle[i] = TGSI_SWIZZLE_W;
-         else {
-            if (*cur == '0')
-               swizzle[i] = TGSI_EXTSWIZZLE_ZERO;
-            else if (*cur == '1')
-               swizzle[i] = TGSI_EXTSWIZZLE_ONE;
-            else {
-               report_error( ctx, "Expected register swizzle component `x', `y', `z', `w', `0' or `1'" );
-               return FALSE;
-            }
-            *parsed_extswizzle = TRUE;
-         }
-         cur++;
-      }
-      *parsed_swizzle = TRUE;
-      ctx->cur = cur;
-   }
-   return TRUE;
-}
-
-static boolean
-parse_src_operand(
-   struct translate_ctx *ctx,
-   struct tgsi_full_src_register *src )
-{
-   const char *cur;
-   float value;
-   uint file;
-   int index;
-   uint ind_file;
-   int ind_index;
-   uint swizzle[4];
-   boolean parsed_swizzle;
-   boolean parsed_extswizzle;
-
-   if (*ctx->cur == '-') {
-      cur = ctx->cur;
-      cur++;
-      eat_opt_white( &cur );
-      if (*cur == '(') {
-         cur++;
-         src->SrcRegisterExtMod.Negate = 1;
-         eat_opt_white( &cur );
-         ctx->cur = cur;
-      }
-   }
-
-   if (*ctx->cur == '|') {
-      ctx->cur++;
-      eat_opt_white( &ctx->cur );
-      src->SrcRegisterExtMod.Absolute = 1;
-   }
-
-   if (*ctx->cur == '-') {
-      ctx->cur++;
-      eat_opt_white( &ctx->cur );
-      src->SrcRegister.Negate = 1;
-   }
-
-   cur = ctx->cur;
-   if (parse_float( &cur, &value )) {
-      if (value == 2.0f) {
-         eat_opt_white( &cur );
-         if (*cur != '*') {
-            report_error( ctx, "Expected `*'" );
-            return FALSE;
-         }
-         cur++;
-         if (*cur != '(') {
-            report_error( ctx, "Expected `('" );
-            return FALSE;
-         }
-         cur++;
-         src->SrcRegisterExtMod.Scale2X = 1;
-         eat_opt_white( &cur );
-         ctx->cur = cur;
-      }
-   }
-
-   if (*ctx->cur == '(') {
-      ctx->cur++;
-      eat_opt_white( &ctx->cur );
-      src->SrcRegisterExtMod.Bias = 1;
-   }
-
-   cur = ctx->cur;
-   if (parse_float( &cur, &value )) {
-      if (value == 1.0f) {
-         eat_opt_white( &cur );
-         if (*cur != '-') {
-            report_error( ctx, "Expected `-'" );
-            return FALSE;
-         }
-         cur++;
-         if (*cur != '(') {
-            report_error( ctx, "Expected `('" );
-            return FALSE;
-         }
-         cur++;
-         src->SrcRegisterExtMod.Complement = 1;
-         eat_opt_white( &cur );
-         ctx->cur = cur;
-      }
-   }
-
-   if (!parse_register_src( ctx, &file, &index, &ind_file, &ind_index ))
-      return FALSE;
-   src->SrcRegister.File = file;
-   src->SrcRegister.Index = index;
-   if (ind_file != TGSI_FILE_NULL) {
-      src->SrcRegister.Indirect = 1;
-      src->SrcRegisterInd.File = ind_file;
-      src->SrcRegisterInd.Index = ind_index;
-   }
-
-   /* Parse optional swizzle.
-    */
-   if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle, &parsed_extswizzle )) {
-      if (parsed_extswizzle) {
-         assert( parsed_swizzle );
-
-         src->SrcRegisterExtSwz.ExtSwizzleX = swizzle[0];
-         src->SrcRegisterExtSwz.ExtSwizzleY = swizzle[1];
-         src->SrcRegisterExtSwz.ExtSwizzleZ = swizzle[2];
-         src->SrcRegisterExtSwz.ExtSwizzleW = swizzle[3];
-      }
-      else if (parsed_swizzle) {
-         src->SrcRegister.SwizzleX = swizzle[0];
-         src->SrcRegister.SwizzleY = swizzle[1];
-         src->SrcRegister.SwizzleZ = swizzle[2];
-         src->SrcRegister.SwizzleW = swizzle[3];
-      }
-   }
-
-   if (src->SrcRegisterExtMod.Complement) {
-      eat_opt_white( &ctx->cur );
-      if (*ctx->cur != ')') {
-         report_error( ctx, "Expected `)'" );
-         return FALSE;
-      }
-      ctx->cur++;
-   }
-
-   if (src->SrcRegisterExtMod.Bias) {
-      eat_opt_white( &ctx->cur );
-      if (*ctx->cur != ')') {
-         report_error( ctx, "Expected `)'" );
-         return FALSE;
-      }
-      ctx->cur++;
-      eat_opt_white( &ctx->cur );
-      if (*ctx->cur != '-') {
-         report_error( ctx, "Expected `-'" );
-         return FALSE;
-      }
-      ctx->cur++;
-      eat_opt_white( &ctx->cur );
-      if (!parse_float( &ctx->cur, &value )) {
-         report_error( ctx, "Expected literal floating point" );
-         return FALSE;
-      }
-      if (value != 0.5f) {
-         report_error( ctx, "Expected 0.5" );
-         return FALSE;
-      }
-   }
-
-   if (src->SrcRegisterExtMod.Scale2X) {
-      eat_opt_white( &ctx->cur );
-      if (*ctx->cur != ')') {
-         report_error( ctx, "Expected `)'" );
-         return FALSE;
-      }
-      ctx->cur++;
-   }
-
-   if (src->SrcRegisterExtMod.Absolute) {
-      eat_opt_white( &ctx->cur );
-      if (*ctx->cur != '|') {
-         report_error( ctx, "Expected `|'" );
-         return FALSE;
-      }
-      ctx->cur++;
-   }
-
-   if (src->SrcRegisterExtMod.Negate) {
-      eat_opt_white( &ctx->cur );
-      if (*ctx->cur != ')') {
-         report_error( ctx, "Expected `)'" );
-         return FALSE;
-      }
-      ctx->cur++;
-   }
-
-   return TRUE;
-}
-
-struct opcode_info
-{
-   uint num_dst;
-   uint num_src;
-   uint is_tex;
-   uint is_branch;
-   const char *mnemonic;
-};
-
-static const struct opcode_info opcode_info[TGSI_OPCODE_LAST] =
-{
-   { 1, 1, 0, 0, "ARL" },
-   { 1, 1, 0, 0, "MOV" },
-   { 1, 1, 0, 0, "LIT" },
-   { 1, 1, 0, 0, "RCP" },
-   { 1, 1, 0, 0, "RSQ" },
-   { 1, 1, 0, 0, "EXP" },
-   { 1, 1, 0, 0, "LOG" },
-   { 1, 2, 0, 0, "MUL" },
-   { 1, 2, 0, 0, "ADD" },
-   { 1, 2, 0, 0, "DP3" },
-   { 1, 2, 0, 0, "DP4" },
-   { 1, 2, 0, 0, "DST" },
-   { 1, 2, 0, 0, "MIN" },
-   { 1, 2, 0, 0, "MAX" },
-   { 1, 2, 0, 0, "SLT" },
-   { 1, 2, 0, 0, "SGE" },
-   { 1, 3, 0, 0, "MAD" },
-   { 1, 2, 0, 0, "SUB" },
-   { 1, 3, 0, 0, "LERP" },
-   { 1, 3, 0, 0, "CND" },
-   { 1, 3, 0, 0, "CND0" },
-   { 1, 3, 0, 0, "DOT2ADD" },
-   { 1, 2, 0, 0, "INDEX" },
-   { 1, 1, 0, 0, "NEGATE" },
-   { 1, 1, 0, 0, "FRAC" },
-   { 1, 3, 0, 0, "CLAMP" },
-   { 1, 1, 0, 0, "FLOOR" },
-   { 1, 1, 0, 0, "ROUND" },
-   { 1, 1, 0, 0, "EXPBASE2" },
-   { 1, 1, 0, 0, "LOGBASE2" },
-   { 1, 2, 0, 0, "POWER" },
-   { 1, 2, 0, 0, "CROSSPRODUCT" },
-   { 1, 2, 0, 0, "MULTIPLYMATRIX" },
-   { 1, 1, 0, 0, "ABS" },
-   { 1, 1, 0, 0, "RCC" },
-   { 1, 2, 0, 0, "DPH" },
-   { 1, 1, 0, 0, "COS" },
-   { 1, 1, 0, 0, "DDX" },
-   { 1, 1, 0, 0, "DDY" },
-   { 0, 1, 0, 0, "KILP" },
-   { 1, 1, 0, 0, "PK2H" },
-   { 1, 1, 0, 0, "PK2US" },
-   { 1, 1, 0, 0, "PK4B" },
-   { 1, 1, 0, 0, "PK4UB" },
-   { 1, 2, 0, 0, "RFL" },
-   { 1, 2, 0, 0, "SEQ" },
-   { 1, 2, 0, 0, "SFL" },
-   { 1, 2, 0, 0, "SGT" },
-   { 1, 1, 0, 0, "SIN" },
-   { 1, 2, 0, 0, "SLE" },
-   { 1, 2, 0, 0, "SNE" },
-   { 1, 2, 0, 0, "STR" },
-   { 1, 2, 1, 0, "TEX" },
-   { 1, 4, 1, 0, "TXD" },
-   { 1, 2, 1, 0, "TXP" },
-   { 1, 1, 0, 0, "UP2H" },
-   { 1, 1, 0, 0, "UP2US" },
-   { 1, 1, 0, 0, "UP4B" },
-   { 1, 1, 0, 0, "UP4UB" },
-   { 1, 3, 0, 0, "X2D" },
-   { 1, 1, 0, 0, "ARA" },
-   { 1, 1, 0, 0, "ARR" },
-   { 0, 1, 0, 0, "BRA" },
-   { 0, 0, 0, 1, "CAL" },
-   { 0, 0, 0, 0, "RET" },
-   { 1, 1, 0, 0, "SSG" },
-   { 1, 3, 0, 0, "CMP" },
-   { 1, 1, 0, 0, "SCS" },
-   { 1, 2, 1, 0, "TXB" },
-   { 1, 1, 0, 0, "NRM" },
-   { 1, 2, 0, 0, "DIV" },
-   { 1, 2, 0, 0, "DP2" },
-   { 1, 2, 1, 0, "TXL" },
-   { 0, 0, 0, 0, "BRK" },
-   { 0, 1, 0, 1, "IF" },
-   { 0, 0, 0, 0, "LOOP" },
-   { 0, 1, 0, 0, "REP" },
-   { 0, 0, 0, 1, "ELSE" },
-   { 0, 0, 0, 0, "ENDIF" },
-   { 0, 0, 0, 0, "ENDLOOP" },
-   { 0, 0, 0, 0, "ENDREP" },
-   { 0, 1, 0, 0, "PUSHA" },
-   { 1, 0, 0, 0, "POPA" },
-   { 1, 1, 0, 0, "CEIL" },
-   { 1, 1, 0, 0, "I2F" },
-   { 1, 1, 0, 0, "NOT" },
-   { 1, 1, 0, 0, "TRUNC" },
-   { 1, 2, 0, 0, "SHL" },
-   { 1, 2, 0, 0, "SHR" },
-   { 1, 2, 0, 0, "AND" },
-   { 1, 2, 0, 0, "OR" },
-   { 1, 2, 0, 0, "MOD" },
-   { 1, 2, 0, 0, "XOR" },
-   { 1, 3, 0, 0, "SAD" },
-   { 1, 2, 1, 0, "TXF" },
-   { 1, 2, 1, 0, "TXQ" },
-   { 0, 0, 0, 0, "CONT" },
-   { 0, 0, 0, 0, "EMIT" },
-   { 0, 0, 0, 0, "ENDPRIM" },
-   { 0, 0, 0, 1, "BGNLOOP2" },
-   { 0, 0, 0, 0, "BGNSUB" },
-   { 0, 0, 0, 1, "ENDLOOP2" },
-   { 0, 0, 0, 0, "ENDSUB" },
-   { 1, 1, 0, 0, "NOISE1" },
-   { 1, 1, 0, 0, "NOISE2" },
-   { 1, 1, 0, 0, "NOISE3" },
-   { 1, 1, 0, 0, "NOISE4" },
-   { 0, 0, 0, 0, "NOP" },
-   { 1, 2, 0, 0, "M4X3" },
-   { 1, 2, 0, 0, "M3X4" },
-   { 1, 2, 0, 0, "M3X3" },
-   { 1, 2, 0, 0, "M3X2" },
-   { 1, 1, 0, 0, "NRM4" },
-   { 0, 1, 0, 0, "CALLNZ" },
-   { 0, 1, 0, 0, "IFC" },
-   { 0, 1, 0, 0, "BREAKC" },
-   { 0, 0, 0, 0, "KIL" },
-   { 0, 0, 0, 0, "END" },
-   { 1, 1, 0, 0, "SWZ" }
-};
-
-static const char *texture_names[TGSI_TEXTURE_COUNT] =
-{
-   "UNKNOWN",
-   "1D",
-   "2D",
-   "3D",
-   "CUBE",
-   "RECT",
-   "SHADOW1D",
-   "SHADOW2D",
-   "SHADOWRECT"
-};
-
-static boolean
-parse_instruction(
-   struct translate_ctx *ctx,
-   boolean has_label )
-{
-   uint i;
-   uint saturate = TGSI_SAT_NONE;
-   const struct opcode_info *info;
-   struct tgsi_full_instruction inst;
-   uint advance;
-
-   /* Parse instruction name.
-    */
-   eat_opt_white( &ctx->cur );
-   for (i = 0; i < TGSI_OPCODE_LAST; i++) {
-      const char *cur = ctx->cur;
-
-      info = &opcode_info[i];
-      if (str_match_no_case( &cur, info->mnemonic )) {
-         if (str_match_no_case( &cur, "_SATNV" ))
-            saturate = TGSI_SAT_MINUS_PLUS_ONE;
-         else if (str_match_no_case( &cur, "_SAT" ))
-            saturate = TGSI_SAT_ZERO_ONE;
-
-         if (info->num_dst + info->num_src + info->is_tex == 0) {
-            if (!is_digit_alpha_underscore( cur )) {
-               ctx->cur = cur;
-               break;
-            }
-         }
-         else if (*cur == '\0' || eat_white( &cur )) {
-            ctx->cur = cur;
-            break;
-         }
-      }
-   }
-   if (i == TGSI_OPCODE_LAST) {
-      if (has_label)
-         report_error( ctx, "Unknown opcode" );
-      else
-         report_error( ctx, "Expected `DCL', `IMM' or a label" );
-      return FALSE;
-   }
-
-   inst = tgsi_default_full_instruction();
-   inst.Instruction.Opcode = i;
-   inst.Instruction.Saturate = saturate;
-   inst.Instruction.NumDstRegs = info->num_dst;
-   inst.Instruction.NumSrcRegs = info->num_src;
-
-   /* Parse instruction operands.
-    */
-   for (i = 0; i < info->num_dst + info->num_src + info->is_tex; i++) {
-      if (i > 0) {
-         eat_opt_white( &ctx->cur );
-         if (*ctx->cur != ',') {
-            report_error( ctx, "Expected `,'" );
-            return FALSE;
-         }
-         ctx->cur++;
-         eat_opt_white( &ctx->cur );
-      }
-
-      if (i < info->num_dst) {
-         if (!parse_dst_operand( ctx, &inst.FullDstRegisters[i] ))
-            return FALSE;
-      }
-      else if (i < info->num_dst + info->num_src) {
-         if (!parse_src_operand( ctx, &inst.FullSrcRegisters[i - info->num_dst] ))
-            return FALSE;
-      }
-      else {
-         uint j;
-
-         for (j = 0; j < TGSI_TEXTURE_COUNT; j++) {
-            if (str_match_no_case( &ctx->cur, texture_names[j] )) {
-               if (!is_digit_alpha_underscore( ctx->cur )) {
-                  inst.InstructionExtTexture.Texture = j;
-                  break;
-               }
-            }
-         }
-         if (j == TGSI_TEXTURE_COUNT) {
-            report_error( ctx, "Expected texture target" );
-            return FALSE;
-         }
-      }
-   }
-
-   if (info->is_branch) {
-      uint target;
-
-      eat_opt_white( &ctx->cur );
-      if (*ctx->cur != ':') {
-         report_error( ctx, "Expected `:'" );
-         return FALSE;
-      }
-      ctx->cur++;
-      eat_opt_white( &ctx->cur );
-      if (!parse_uint( &ctx->cur, &target )) {
-         report_error( ctx, "Expected a label" );
-         return FALSE;
-      }
-      inst.InstructionExtLabel.Label = target;
-   }
-
-   advance = tgsi_build_full_instruction(
-      &inst,
-      ctx->tokens_cur,
-      ctx->header,
-      (uint) (ctx->tokens_end - ctx->tokens_cur) );
-   if (advance == 0)
-      return FALSE;
-   ctx->tokens_cur += advance;
-
-   return TRUE;
-}
-
-static const char *semantic_names[TGSI_SEMANTIC_COUNT] =
-{
-   "POSITION",
-   "COLOR",
-   "BCOLOR",
-   "FOG",
-   "PSIZE",
-   "GENERIC",
-   "NORMAL"
-};
-
-static const char *interpolate_names[TGSI_INTERPOLATE_COUNT] =
-{
-   "CONSTANT",
-   "LINEAR",
-   "PERSPECTIVE"
-};
-
-static boolean parse_declaration( struct translate_ctx *ctx )
-{
-   struct tgsi_full_declaration decl;
-   uint file;
-   int first;
-   int last;
-   uint writemask;
-   const char *cur;
-   uint advance;
-
-   if (!eat_white( &ctx->cur )) {
-      report_error( ctx, "Syntax error" );
-      return FALSE;
-   }
-   if (!parse_register_dcl( ctx, &file, &first, &last ))
-      return FALSE;
-   if (!parse_opt_writemask( ctx, &writemask ))
-      return FALSE;
-
-   decl = tgsi_default_full_declaration();
-   decl.Declaration.File = file;
-   decl.Declaration.UsageMask = writemask;
-   decl.DeclarationRange.First = first;
-   decl.DeclarationRange.Last = last;
-
-   cur = ctx->cur;
-   eat_opt_white( &cur );
-   if (*cur == ',') {
-      uint i;
-
-      cur++;
-      eat_opt_white( &cur );
-      for (i = 0; i < TGSI_SEMANTIC_COUNT; i++) {
-         if (str_match_no_case( &cur, semantic_names[i] )) {
-            const char *cur2 = cur;
-            uint index;
-
-            if (is_digit_alpha_underscore( cur ))
-               continue;
-            eat_opt_white( &cur2 );
-            if (*cur2 == '[') {
-               cur2++;
-               eat_opt_white( &cur2 );
-               if (!parse_uint( &cur2, &index )) {
-                  report_error( ctx, "Expected literal integer" );
-                  return FALSE;
-               }
-               eat_opt_white( &cur2 );
-               if (*cur2 != ']') {
-                  report_error( ctx, "Expected `]'" );
-                  return FALSE;
-               }
-               cur2++;
-
-               decl.Semantic.SemanticIndex = index;
-
-               cur = cur2;
-            }
-
-            decl.Declaration.Semantic = 1;
-            decl.Semantic.SemanticName = i;
-
-            ctx->cur = cur;
-            break;
-         }
-      }
-   }
-
-   cur = ctx->cur;
-   eat_opt_white( &cur );
-   if (*cur == ',') {
-      uint i;
-
-      cur++;
-      eat_opt_white( &cur );
-      for (i = 0; i < TGSI_INTERPOLATE_COUNT; i++) {
-         if (str_match_no_case( &cur, interpolate_names[i] )) {
-            if (is_digit_alpha_underscore( cur ))
-               continue;
-            decl.Declaration.Interpolate = i;
-
-            ctx->cur = cur;
-            break;
-         }
-      }
-      if (i == TGSI_INTERPOLATE_COUNT) {
-         report_error( ctx, "Expected semantic or interpolate attribute" );
-         return FALSE;
-      }
-   }
-
-   advance = tgsi_build_full_declaration(
-      &decl,
-      ctx->tokens_cur,
-      ctx->header,
-      (uint) (ctx->tokens_end - ctx->tokens_cur) );
-   if (advance == 0)
-      return FALSE;
-   ctx->tokens_cur += advance;
-
-   return TRUE;
-}
-
-static boolean parse_immediate( struct translate_ctx *ctx )
-{
-   struct tgsi_full_immediate imm;
-   uint i;
-   float values[4];
-   uint advance;
-
-   if (!eat_white( &ctx->cur )) {
-      report_error( ctx, "Syntax error" );
-      return FALSE;
-   }
-   if (!str_match_no_case( &ctx->cur, "FLT32" ) || is_digit_alpha_underscore( ctx->cur )) {
-      report_error( ctx, "Expected `FLT32'" );
-      return FALSE;
-   }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '{') {
-      report_error( ctx, "Expected `{'" );
-      return FALSE;
-   }
-   ctx->cur++;
-   for (i = 0; i < 4; i++) {
-      eat_opt_white( &ctx->cur );
-      if (i > 0) {
-         if (*ctx->cur != ',') {
-            report_error( ctx, "Expected `,'" );
-            return FALSE;
-         }
-         ctx->cur++;
-         eat_opt_white( &ctx->cur );
-      }
-      if (!parse_float( &ctx->cur, &values[i] )) {
-         report_error( ctx, "Expected literal floating point" );
-         return FALSE;
-      }
-   }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '}') {
-      report_error( ctx, "Expected `}'" );
-      return FALSE;
-   }
-   ctx->cur++;
-
-   imm = tgsi_default_full_immediate();
-   imm.Immediate.Size += 4;
-   imm.Immediate.DataType = TGSI_IMM_FLOAT32;
-   imm.u.Pointer = values;
-
-   advance = tgsi_build_full_immediate(
-      &imm,
-      ctx->tokens_cur,
-      ctx->header,
-      (uint) (ctx->tokens_end - ctx->tokens_cur) );
-   if (advance == 0)
-      return FALSE;
-   ctx->tokens_cur += advance;
-
-   return TRUE;
-}
-
-static boolean translate( struct translate_ctx *ctx )
-{
-   eat_opt_white( &ctx->cur );
-   if (!parse_header( ctx ))
-      return FALSE;
-
-   while (*ctx->cur != '\0') {
-      uint label_val = 0;
-
-      if (!eat_white( &ctx->cur )) {
-         report_error( ctx, "Syntax error" );
-         return FALSE;
-      }
-
-      if (*ctx->cur == '\0')
-         break;
-
-      if (parse_label( ctx, &label_val )) {
-         if (!parse_instruction( ctx, TRUE ))
-            return FALSE;
-      }
-      else if (str_match_no_case( &ctx->cur, "DCL" )) {
-         if (!parse_declaration( ctx ))
-            return FALSE;
-      }
-      else if (str_match_no_case( &ctx->cur, "IMM" )) {
-         if (!parse_immediate( ctx ))
-            return FALSE;
-      }
-      else if (!parse_instruction( ctx, FALSE )) {
-         return FALSE;
-      }
-   }
-
-   return TRUE;
-}
-
-boolean
-tgsi_text_translate(
-   const char *text,
-   struct tgsi_token *tokens,
-   uint num_tokens )
-{
-   struct translate_ctx ctx;
-
-   ctx.text = text;
-   ctx.cur = text;
-   ctx.tokens = tokens;
-   ctx.tokens_cur = tokens;
-   ctx.tokens_end = tokens + num_tokens;
-
-   if (!translate( &ctx ))
-      return FALSE;
-
-   return tgsi_sanity_check( tokens );
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_text.h b/src/gallium/auxiliary/tgsi/util/tgsi_text.h
deleted file mode 100644
index 8eeeeef140..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_text.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_TEXT_H
-#define TGSI_TEXT_H
-
-#include "pipe/p_shader_tokens.h"
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-boolean
-tgsi_text_translate(
-   const char *text,
-   struct tgsi_token *tokens,
-   uint num_tokens );
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_TEXT_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_transform.c b/src/gallium/auxiliary/tgsi/util/tgsi_transform.c
deleted file mode 100644
index 357f77b05a..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_transform.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * TGSI program transformation utility.
- *
- * Authors:  Brian Paul
- */
-
-
-#include "tgsi_transform.h"
-
-
-
-static void
-emit_instruction(struct tgsi_transform_context *ctx,
-                 const struct tgsi_full_instruction *inst)
-{
-   uint ti = ctx->ti;
-
-   ti += tgsi_build_full_instruction(inst,
-                                     ctx->tokens_out + ti,
-                                     ctx->header,
-                                     ctx->max_tokens_out - ti);
-   ctx->ti = ti;
-}
-
-
-static void
-emit_declaration(struct tgsi_transform_context *ctx,
-                 const struct tgsi_full_declaration *decl)
-{
-   uint ti = ctx->ti;
-
-   ti += tgsi_build_full_declaration(decl,
-                                     ctx->tokens_out + ti,
-                                     ctx->header,
-                                     ctx->max_tokens_out - ti);
-   ctx->ti = ti;
-}
-
-
-static void
-emit_immediate(struct tgsi_transform_context *ctx,
-               const struct tgsi_full_immediate *imm)
-{
-   uint ti = ctx->ti;
-
-   ti += tgsi_build_full_immediate(imm,
-                                   ctx->tokens_out + ti,
-                                   ctx->header,
-                                   ctx->max_tokens_out - ti);
-   ctx->ti = ti;
-}
-
-
-
-/**
- * Apply user-defined transformations to the input shader to produce
- * the output shader.
- * For example, a register search-and-replace operation could be applied
- * by defining a transform_instruction() callback that examined and changed
- * the instruction src/dest regs.
- *
- * \return number of tokens emitted
- */
-int
-tgsi_transform_shader(const struct tgsi_token *tokens_in,
-                      struct tgsi_token *tokens_out,
-                      uint max_tokens_out,
-                      struct tgsi_transform_context *ctx)
-{
-   uint procType;
-
-   /* input shader */
-   struct tgsi_parse_context parse;
-
-   /* output shader */
-   struct tgsi_processor *processor;
-
-
-   /**
-    ** callback context init
-    **/
-   ctx->emit_instruction = emit_instruction;
-   ctx->emit_declaration = emit_declaration;
-   ctx->emit_immediate = emit_immediate;
-   ctx->tokens_out = tokens_out;
-   ctx->max_tokens_out = max_tokens_out;
-
-
-   /**
-    ** Setup to begin parsing input shader
-    **/
-   if (tgsi_parse_init( &parse, tokens_in ) != TGSI_PARSE_OK) {
-      debug_printf("tgsi_parse_init() failed in tgsi_transform_shader()!\n");
-      return -1;
-   }
-   procType = parse.FullHeader.Processor.Processor;
-   assert(procType == TGSI_PROCESSOR_FRAGMENT ||
-          procType == TGSI_PROCESSOR_VERTEX ||
-          procType == TGSI_PROCESSOR_GEOMETRY);
-
-
-   /**
-    **  Setup output shader
-    **/
-   *(struct tgsi_version *) &tokens_out[0] = tgsi_build_version();
-
-   ctx->header = (struct tgsi_header *) (tokens_out + 1);
-   *ctx->header = tgsi_build_header();
-
-   processor = (struct tgsi_processor *) (tokens_out + 2);
-   *processor = tgsi_build_processor( procType, ctx->header );
-
-   ctx->ti = 3;
-
-
-   /**
-    ** Loop over incoming program tokens/instructions
-    */
-   while( !tgsi_parse_end_of_tokens( &parse ) ) {
-
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         {
-            struct tgsi_full_instruction *fullinst
-               = &parse.FullToken.FullInstruction;
-
-            if (ctx->transform_instruction)
-               ctx->transform_instruction(ctx, fullinst);
-            else
-               ctx->emit_instruction(ctx, fullinst);
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         {
-            struct tgsi_full_declaration *fulldecl
-               = &parse.FullToken.FullDeclaration;
-
-            if (ctx->transform_declaration)
-               ctx->transform_declaration(ctx, fulldecl);
-            else
-               ctx->emit_declaration(ctx, fulldecl);
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         {
-            struct tgsi_full_immediate *fullimm
-               = &parse.FullToken.FullImmediate;
-
-            if (ctx->transform_immediate)
-               ctx->transform_immediate(ctx, fullimm);
-            else
-               ctx->emit_immediate(ctx, fullimm);
-         }
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-
-   if (ctx->epilog) {
-      ctx->epilog(ctx);
-   }
-
-   tgsi_parse_free (&parse);
-
-   return ctx->ti;
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_transform.h b/src/gallium/auxiliary/tgsi/util/tgsi_transform.h
deleted file mode 100644
index fcf85d603b..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_transform.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_TRANSFORM_H
-#define TGSI_TRANSFORM_H
-
-
-#include "pipe/p_util.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_build.h"
-
-
-
-/**
- * Subclass this to add caller-specific data
- */
-struct tgsi_transform_context
-{
-/**** PUBLIC ***/
-
-   /**
-    * User-defined callbacks invoked per instruction.
-    */
-   void (*transform_instruction)(struct tgsi_transform_context *ctx,
-                                 struct tgsi_full_instruction *inst);
-
-   void (*transform_declaration)(struct tgsi_transform_context *ctx,
-                                 struct tgsi_full_declaration *decl);
-
-   void (*transform_immediate)(struct tgsi_transform_context *ctx,
-                               struct tgsi_full_immediate *imm);
-
-   /**
-    * Called at end of input program to allow caller to append extra
-    * instructions.  Return number of tokens emitted.
-    */
-   void (*epilog)(struct tgsi_transform_context *ctx);
-
-
-/*** PRIVATE ***/
-
-   /**
-    * These are setup by tgsi_transform_shader() and cannot be overridden.
-    * Meant to be called from in the above user callback functions.
-    */
-   void (*emit_instruction)(struct tgsi_transform_context *ctx,
-                            const struct tgsi_full_instruction *inst);
-   void (*emit_declaration)(struct tgsi_transform_context *ctx,
-                            const struct tgsi_full_declaration *decl);
-   void (*emit_immediate)(struct tgsi_transform_context *ctx,
-                          const struct tgsi_full_immediate *imm);
-
-   struct tgsi_header *header;
-   uint max_tokens_out;
-   struct tgsi_token *tokens_out;
-   uint ti;
-};
-
-
-
-extern int
-tgsi_transform_shader(const struct tgsi_token *tokens_in,
-                      struct tgsi_token *tokens_out,
-                      uint max_tokens_out,
-                      struct tgsi_transform_context *ctx);
-
-
-#endif /* TGSI_TRANSFORM_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_util.c b/src/gallium/auxiliary/tgsi/util/tgsi_util.c
deleted file mode 100644
index 09486e649e..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_util.c
+++ /dev/null
@@ -1,300 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_debug.h"
-#include "pipe/p_util.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi_parse.h"
-#include "tgsi_build.h"
-#include "tgsi_util.h"
-
-union pointer_hack
-{
-   void *pointer;
-   uint64_t uint64;
-};
-
-void *
-tgsi_align_128bit(
-   void *unaligned )
-{
-   union pointer_hack ph;
-
-   ph.uint64 = 0;
-   ph.pointer = unaligned;
-   ph.uint64 = (ph.uint64 + 15) & ~15;
-   return ph.pointer;
-}
-
-unsigned
-tgsi_util_get_src_register_swizzle(
-   const struct tgsi_src_register *reg,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      return reg->SwizzleX;
-   case 1:
-      return reg->SwizzleY;
-   case 2:
-      return reg->SwizzleZ;
-   case 3:
-      return reg->SwizzleW;
-   default:
-      assert( 0 );
-   }
-   return 0;
-}
-
-unsigned
-tgsi_util_get_src_register_extswizzle(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      return reg->ExtSwizzleX;
-   case 1:
-      return reg->ExtSwizzleY;
-   case 2:
-      return reg->ExtSwizzleZ;
-   case 3:
-      return reg->ExtSwizzleW;
-   default:
-      assert( 0 );
-   }
-   return 0;
-}
-
-unsigned
-tgsi_util_get_full_src_register_extswizzle(
-   const struct tgsi_full_src_register  *reg,
-   unsigned component )
-{
-   unsigned swizzle;
-
-   /*
-    * First, calculate  the   extended swizzle for a given channel. This will give
-    * us either a channel index into the simple swizzle or  a constant 1 or   0.
-    */
-   swizzle = tgsi_util_get_src_register_extswizzle(
-      &reg->SrcRegisterExtSwz,
-      component );
-
-   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
-   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
-   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
-   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
-
-   /*
-    * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
-    * Leave the constants intact, they are   not   affected by the   simple swizzle.
-    */
-   if( swizzle <= TGSI_SWIZZLE_W ) {
-      swizzle = tgsi_util_get_src_register_swizzle(
-         &reg->SrcRegister,
-         swizzle );
-   }
-
-   return swizzle;
-}
-
-void
-tgsi_util_set_src_register_swizzle(
-   struct tgsi_src_register *reg,
-   unsigned swizzle,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      reg->SwizzleX = swizzle;
-      break;
-   case 1:
-      reg->SwizzleY = swizzle;
-      break;
-   case 2:
-      reg->SwizzleZ = swizzle;
-      break;
-   case 3:
-      reg->SwizzleW = swizzle;
-      break;
-   default:
-      assert( 0 );
-   }
-}
-
-void
-tgsi_util_set_src_register_extswizzle(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned swizzle,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      reg->ExtSwizzleX = swizzle;
-      break;
-   case 1:
-      reg->ExtSwizzleY = swizzle;
-      break;
-   case 2:
-      reg->ExtSwizzleZ = swizzle;
-      break;
-   case 3:
-      reg->ExtSwizzleW = swizzle;
-      break;
-   default:
-      assert( 0 );
-   }
-}
-
-unsigned
-tgsi_util_get_src_register_extnegate(
-   const  struct tgsi_src_register_ext_swz *reg,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      return reg->NegateX;
-   case 1:
-      return reg->NegateY;
-   case 2:
-      return reg->NegateZ;
-   case 3:
-      return reg->NegateW;
-   default:
-      assert( 0 );
-   }
-   return 0;
-}
-
-void
-tgsi_util_set_src_register_extnegate(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned negate,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      reg->NegateX = negate;
-      break;
-   case 1:
-      reg->NegateY = negate;
-      break;
-   case 2:
-      reg->NegateZ = negate;
-      break;
-   case 3:
-      reg->NegateW = negate;
-      break;
-   default:
-      assert( 0 );
-   }
-}
-
-unsigned
-tgsi_util_get_full_src_register_sign_mode(
-   const struct  tgsi_full_src_register *reg,
-   unsigned component )
-{
-   unsigned sign_mode;
-
-   if( reg->SrcRegisterExtMod.Absolute ) {
-      /* Consider only the post-abs negation. */
-
-      if( reg->SrcRegisterExtMod.Negate ) {
-         sign_mode = TGSI_UTIL_SIGN_SET;
-      }
-      else {
-         sign_mode = TGSI_UTIL_SIGN_CLEAR;
-      }
-   }
-   else {
-      /* Accumulate the three negations. */
-
-      unsigned negate;
-
-      negate = reg->SrcRegister.Negate;
-      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
-         negate = !negate;
-      }
-      if( reg->SrcRegisterExtMod.Negate ) {
-         negate = !negate;
-      }
-
-      if( negate ) {
-         sign_mode = TGSI_UTIL_SIGN_TOGGLE;
-      }
-      else {
-         sign_mode = TGSI_UTIL_SIGN_KEEP;
-      }
-   }
-
-   return sign_mode;
-}
-
-void
-tgsi_util_set_full_src_register_sign_mode(
-   struct tgsi_full_src_register *reg,
-   unsigned sign_mode )
-{
-   reg->SrcRegisterExtSwz.NegateX = 0;
-   reg->SrcRegisterExtSwz.NegateY = 0;
-   reg->SrcRegisterExtSwz.NegateZ = 0;
-   reg->SrcRegisterExtSwz.NegateW = 0;
-
-   switch (sign_mode)
-   {
-   case TGSI_UTIL_SIGN_CLEAR:
-      reg->SrcRegister.Negate = 0;
-      reg->SrcRegisterExtMod.Absolute = 1;
-      reg->SrcRegisterExtMod.Negate = 0;
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      reg->SrcRegister.Negate = 0;
-      reg->SrcRegisterExtMod.Absolute = 1;
-      reg->SrcRegisterExtMod.Negate = 1;
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      reg->SrcRegister.Negate = 1;
-      reg->SrcRegisterExtMod.Absolute = 0;
-      reg->SrcRegisterExtMod.Negate = 0;
-      break;
-
-   case TGSI_UTIL_SIGN_KEEP:
-      reg->SrcRegister.Negate = 0;
-      reg->SrcRegisterExtMod.Absolute = 0;
-      reg->SrcRegisterExtMod.Negate = 0;
-      break;
-
-   default:
-      assert( 0 );
-   }
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_util.h b/src/gallium/auxiliary/tgsi/util/tgsi_util.h
deleted file mode 100644
index 7877f34558..0000000000
--- a/src/gallium/auxiliary/tgsi/util/tgsi_util.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_UTIL_H
-#define TGSI_UTIL_H
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-void *
-tgsi_align_128bit(
-   void *unaligned );
-
-unsigned
-tgsi_util_get_src_register_swizzle(
-   const struct tgsi_src_register *reg,
-   unsigned component );
-
-unsigned
-tgsi_util_get_src_register_extswizzle(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component);
-
-unsigned
-tgsi_util_get_full_src_register_extswizzle(
-   const struct tgsi_full_src_register *reg,
-   unsigned component );
-
-void
-tgsi_util_set_src_register_swizzle(
-   struct tgsi_src_register *reg,
-   unsigned swizzle,
-   unsigned component );
-
-void
-tgsi_util_set_src_register_extswizzle(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned swizzle,
-   unsigned component );
-
-unsigned
-tgsi_util_get_src_register_extnegate(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component );
-
-void
-tgsi_util_set_src_register_extnegate(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned negate,
-   unsigned component );
-
-#define TGSI_UTIL_SIGN_CLEAR    0   /* Force positive */
-#define TGSI_UTIL_SIGN_SET      1   /* Force negative */
-#define TGSI_UTIL_SIGN_TOGGLE   2   /* Negate */
-#define TGSI_UTIL_SIGN_KEEP     3   /* No change */
-
-unsigned
-tgsi_util_get_full_src_register_sign_mode(
-   const struct tgsi_full_src_register *reg,
-   unsigned component );
-
-void
-tgsi_util_set_full_src_register_sign_mode(
-   struct tgsi_full_src_register *reg,
-   unsigned sign_mode );
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_UTIL_H */
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index 4999822068..8713ff5d58 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -45,9 +45,9 @@
 #include "util/u_gen_mipmap.h"
 #include "util/u_simple_shaders.h"
 
-#include "tgsi/util/tgsi_build.h"
-#include "tgsi/util/tgsi_dump.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_build.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
 
 #include "cso_cache/cso_context.h"
 
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 505d93d727..c34fb6ee33 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -43,9 +43,9 @@
 
 #include "util/u_simple_shaders.h"
 
-#include "tgsi/util/tgsi_build.h"
-#include "tgsi/util/tgsi_dump.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_build.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 8df41c1d4c..f1d1ca89a9 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -37,7 +37,7 @@
 #include "cell_winsys.h"
 #include "cell/common.h"
 #include "rtasm/rtasm_ppc_spe.h"
-#include "tgsi/util/tgsi_scan.h"
+#include "tgsi/tgsi_scan.h"
 
 struct cell_vbuf_render;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index c3a3fbd066..f5707f2bb8 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -30,7 +30,7 @@
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
 #include "draw/draw_context.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 
 #include "cell_context.h"
 #include "cell_state.h"
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 3a80df427d..96393732ed 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -65,8 +65,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_util.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
 #include "spu_exec.h"
 #include "spu_main.h"
 #include "spu_vertex_shader.h"
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
index 3e17c490d2..c68f78f59b 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.h
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -29,7 +29,7 @@
 #define SPU_EXEC_H
 
 #include "pipe/p_compiler.h"
-#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/tgsi_exec.h"
 
 #if defined __cplusplus
 extern "C" {
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index ea4274a0a7..74ab2bbd1f 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -1,8 +1,8 @@
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 //#include "tgsi_build.h"
-#include "tgsi/util/tgsi_util.h"
+#include "tgsi/tgsi_util.h"
 
 unsigned
 tgsi_util_get_src_register_swizzle(
diff --git a/src/gallium/drivers/i915simple/i915_context.h b/src/gallium/drivers/i915simple/i915_context.h
index c8db4f608c..3cdabe45f9 100644
--- a/src/gallium/drivers/i915simple/i915_context.h
+++ b/src/gallium/drivers/i915simple/i915_context.h
@@ -35,7 +35,7 @@
 
 #include "draw/draw_vertex.h"
 
-#include "tgsi/util/tgsi_scan.h"
+#include "tgsi/tgsi_scan.h"
 
 
 #define I915_TEX_UNITS 8
diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c
index 23cd909337..04507ab8ad 100644
--- a/src/gallium/drivers/i915simple/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915simple/i915_fpc_translate.c
@@ -34,8 +34,8 @@
 
 #include "pipe/p_shader_tokens.h"
 #include "util/u_string.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
 
 #include "draw/draw_vertex.h"
 
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c
index dbb33f2695..e8521b385e 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915simple/i915_state.c
@@ -33,7 +33,7 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
diff --git a/src/gallium/drivers/i965simple/brw_context.h b/src/gallium/drivers/i965simple/brw_context.h
index 2cae7665f7..f00eb34f92 100644
--- a/src/gallium/drivers/i965simple/brw_context.h
+++ b/src/gallium/drivers/i965simple/brw_context.h
@@ -38,7 +38,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 
-#include "tgsi/util/tgsi_scan.h"
+#include "tgsi/tgsi_scan.h"
 
 #include "brw_structs.h"
 #include "brw_winsys.h"
diff --git a/src/gallium/drivers/i965simple/brw_sf.c b/src/gallium/drivers/i965simple/brw_sf.c
index 96f8fb87a3..b82a2e143b 100644
--- a/src/gallium/drivers/i965simple/brw_sf.c
+++ b/src/gallium/drivers/i965simple/brw_sf.c
@@ -36,7 +36,7 @@
 #include "brw_util.h"
 #include "brw_sf.h"
 #include "brw_state.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 
 
 static void compile_sf_prog( struct brw_context *brw,
diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c
index fb3da92421..30f37a99d4 100644
--- a/src/gallium/drivers/i965simple/brw_shader_info.c
+++ b/src/gallium/drivers/i965simple/brw_shader_info.c
@@ -3,7 +3,7 @@
 #include "brw_state.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 
 
 /**
diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c
index caeeba4630..27ca32843d 100644
--- a/src/gallium/drivers/i965simple/brw_state.c
+++ b/src/gallium/drivers/i965simple/brw_state.c
@@ -34,8 +34,8 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_dump.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
 
 #include "brw_context.h"
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965simple/brw_vs_emit.c b/src/gallium/drivers/i965simple/brw_vs_emit.c
index 81423e2d7d..34dbc0624d 100644
--- a/src/gallium/drivers/i965simple/brw_vs_emit.c
+++ b/src/gallium/drivers/i965simple/brw_vs_emit.c
@@ -33,7 +33,7 @@
 #include "brw_vs.h"
 
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 
 struct brw_prog_info {
    unsigned num_temps;
diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c
index bf1b4d961a..e6f1a44817 100644
--- a/src/gallium/drivers/i965simple/brw_wm_decl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_decl.c
@@ -4,7 +4,7 @@
 #include "brw_wm.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c
index 5c90583824..6a4a5aef09 100644
--- a/src/gallium/drivers/i965simple/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c
@@ -4,7 +4,7 @@
 #include "brw_wm.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_parse.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 0b199a2193..cc171bbc39 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -36,8 +36,8 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "tgsi/exec/tgsi_exec.h"
-#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_parse.h"
 
 struct sp_exec_fragment_shader {
    struct sp_fragment_shader base;
diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c
index 6e1d9280bb..20226da78c 100644
--- a/src/gallium/drivers/softpipe/sp_fs_llvm.c
+++ b/src/gallium/drivers/softpipe/sp_fs_llvm.c
@@ -38,7 +38,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "tgsi/exec/tgsi_sse2.h"
+#include "tgsi/tgsi_sse2.h"
 
 #if 0
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 69f7f960aa..8b7da7c747 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -36,8 +36,8 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "tgsi/exec/tgsi_exec.h"
-#include "tgsi/exec/tgsi_sse2.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_sse2.h"
 
 
 #ifdef PIPE_ARCH_X86
diff --git a/src/gallium/drivers/softpipe/sp_headers.h b/src/gallium/drivers/softpipe/sp_headers.h
index 3d9ede69bb..ae2ee210fc 100644
--- a/src/gallium/drivers/softpipe/sp_headers.h
+++ b/src/gallium/drivers/softpipe/sp_headers.h
@@ -32,7 +32,7 @@
 #define SP_HEADERS_H
 
 #include "pipe/p_state.h"
-#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/tgsi_exec.h"
 
 #define PRIM_POINT 1
 #define PRIM_LINE  2
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 701e02b295..476ef3dc8f 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -32,7 +32,7 @@
 #define SP_STATE_H
 
 #include "pipe/p_state.h"
-#include "tgsi/util/tgsi_scan.h"
+#include "tgsi/tgsi_scan.h"
 
 
 #define SP_NEW_VIEWPORT      0x1
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 901c8f83e7..76fe6bfef9 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -35,8 +35,8 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
-#include "tgsi/util/tgsi_dump.h"
-#include "tgsi/util/tgsi_scan.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
 
 
 void *
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index be0b57d9fa..63b3b91110 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -40,7 +40,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
-#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/tgsi_exec.h"
 
 
 /*
diff --git a/src/gallium/state_trackers/python/gallium.i b/src/gallium/state_trackers/python/gallium.i
index 8d8b762ea5..284ecb827d 100644
--- a/src/gallium/state_trackers/python/gallium.i
+++ b/src/gallium/state_trackers/python/gallium.i
@@ -47,8 +47,8 @@
 #include "cso_cache/cso_context.h"
 #include "util/u_draw_quad.h" 
 #include "util/p_tile.h" 
-#include "tgsi/util/tgsi_text.h" 
-#include "tgsi/util/tgsi_dump.h" 
+#include "tgsi/tgsi_text.h" 
+#include "tgsi/tgsi_dump.h" 
 
 #include "st_device.h"
 #include "st_sample.h"
diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c
index 23ecfff0aa..c7d26ce33c 100644
--- a/src/mesa/state_tracker/st_debug.c
+++ b/src/mesa/state_tracker/st_debug.c
@@ -31,7 +31,7 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_dump.h"
 
 #include "cso_cache/cso_cache.h"
 
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 9029f12056..6565107b10 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -32,9 +32,9 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_build.h"
-#include "tgsi/util/tgsi_util.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_build.h"
+#include "tgsi/tgsi_util.h"
 #include "st_mesa_to_tgsi.h"
 #include "shader/prog_instruction.h"
 #include "shader/prog_parameter.h"
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 5966bbadae..c25c668329 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -40,7 +40,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/tgsi_dump.h"
 
 #include "st_context.h"
 #include "st_atom.h"
-- 
cgit v1.2.3


From e8e75c8c865bb5bbff9db2682b130c8d147f3a38 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Wed, 13 Aug 2008 11:10:58 +0200
Subject: cell: KILP is a predicated discard, KIL is a conditional discard.

---
 src/gallium/drivers/cell/spu/spu_exec.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 96393732ed..42e5022f30 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -603,8 +603,8 @@ store_dest(
  * Kill fragment if any of the four values is less than zero.
  */
 static void
-exec_kilp(struct spu_exec_machine *mach,
-          const struct tgsi_full_instruction *inst)
+exec_kil(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
 {
    uint uniquemask;
    uint chan_index;
@@ -640,6 +640,20 @@ exec_kilp(struct spu_exec_machine *mach,
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
 }
 
+/**
+ * Execute NVIDIA-style KIL which is predicated by a condition code.
+ * Kill fragment if the condition code is TRUE.
+ */
+static void
+exec_kilp(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+
+   /* TODO: build kilmask from CC mask */
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
 
 /*
  * Fetch a texel using STR texture coordinates.
@@ -1336,8 +1350,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_KIL:
-      /* for enabled ExecMask bits, set the killed bit */
-      mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask;
+      exec_kil (mach, inst);
       break;
 
    case TGSI_OPCODE_PK2H:
-- 
cgit v1.2.3


From 4f25420bdd834e81a3e22733304efc5261c2998a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Sun, 24 Aug 2008 17:48:55 -0600
Subject: gallium: refactor/replace p_util.h with util/u_memory.h and
 util/u_math.h

Also, rename p_tile.[ch] to u_tile.[ch]
---
 src/gallium/README.portability                     |    4 +-
 src/gallium/auxiliary/cso_cache/cso_cache.c        |    3 +-
 src/gallium/auxiliary/cso_cache/cso_context.c      |    2 +-
 src/gallium/auxiliary/cso_cache/cso_hash.c         |    2 +-
 src/gallium/auxiliary/draw/draw_context.c          |    3 +-
 src/gallium/auxiliary/draw/draw_pipe.c             |    1 -
 src/gallium/auxiliary/draw/draw_pipe_aaline.c      |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_aapoint.c     |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_clip.c        |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_cull.c        |    2 +-
 src/gallium/auxiliary/draw/draw_pipe_flatshade.c   |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_offset.c      |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_pstipple.c    |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_stipple.c     |    6 +-
 src/gallium/auxiliary/draw/draw_pipe_twoside.c     |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_unfilled.c    |    2 +-
 src/gallium/auxiliary/draw/draw_pipe_util.c        |    2 +-
 src/gallium/auxiliary/draw/draw_pipe_validate.c    |    2 +-
 src/gallium/auxiliary/draw/draw_pipe_vbuf.c        |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_wide_line.c   |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_wide_point.c  |    3 +-
 src/gallium/auxiliary/draw/draw_pt.c               |    1 -
 src/gallium/auxiliary/draw/draw_pt_emit.c          |    2 +-
 src/gallium/auxiliary/draw/draw_pt_fetch.c         |    2 +-
 src/gallium/auxiliary/draw/draw_pt_fetch_emit.c    |    2 +-
 .../auxiliary/draw/draw_pt_fetch_shade_emit.c      |    3 +-
 .../auxiliary/draw/draw_pt_fetch_shade_pipeline.c  |    3 +-
 src/gallium/auxiliary/draw/draw_pt_post_vs.c       |    2 +-
 src/gallium/auxiliary/draw/draw_pt_util.c          |    1 -
 src/gallium/auxiliary/draw/draw_pt_varray.c        |    4 +-
 src/gallium/auxiliary/draw/draw_pt_vcache.c        |    2 +-
 src/gallium/auxiliary/draw/draw_vbuf.h             |    2 -
 src/gallium/auxiliary/draw/draw_vs.c               |    6 +-
 src/gallium/auxiliary/draw/draw_vs_aos.c           |    4 +-
 src/gallium/auxiliary/draw/draw_vs_aos_io.c        |    2 +-
 src/gallium/auxiliary/draw/draw_vs_aos_machine.c   |    3 +-
 src/gallium/auxiliary/draw/draw_vs_exec.c          |    3 +-
 src/gallium/auxiliary/draw/draw_vs_llvm.c          |    1 -
 src/gallium/auxiliary/draw/draw_vs_sse.c           |    3 +-
 src/gallium/auxiliary/draw/draw_vs_varient.c       |    3 +-
 src/gallium/auxiliary/gallivm/gallivm_cpu.cpp      |    3 +-
 src/gallium/auxiliary/gallivm/instructions.cpp     |    2 +-
 src/gallium/auxiliary/gallivm/instructionssoa.cpp  |    2 +-
 .../auxiliary/pipebuffer/pb_buffer_fenced.c        |    2 +-
 .../auxiliary/pipebuffer/pb_buffer_malloc.c        |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c   |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c |    2 +-
 .../auxiliary/pipebuffer/pb_bufmgr_fenced.c        |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c    |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c  |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c  |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_validate.c     |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_winsys.c       |    2 +-
 src/gallium/auxiliary/rtasm/rtasm_execmem.c        |    2 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        |    2 +-
 src/gallium/auxiliary/sct/sct.c                    |    2 +-
 src/gallium/auxiliary/tgsi/tgsi_build.c            |    1 -
 src/gallium/auxiliary/tgsi/tgsi_build.h            |    4 +
 src/gallium/auxiliary/tgsi/tgsi_dump_c.c           |    1 -
 src/gallium/auxiliary/tgsi/tgsi_exec.c             |    2 +-
 src/gallium/auxiliary/tgsi/tgsi_parse.c            |    2 +-
 src/gallium/auxiliary/tgsi/tgsi_scan.c             |    6 +-
 src/gallium/auxiliary/tgsi/tgsi_sse2.c             |    2 +-
 src/gallium/auxiliary/tgsi/tgsi_transform.c        |    1 +
 src/gallium/auxiliary/tgsi/tgsi_transform.h        |    1 -
 src/gallium/auxiliary/tgsi/tgsi_util.c             |    1 -
 src/gallium/auxiliary/translate/translate.c        |    1 -
 src/gallium/auxiliary/translate/translate_cache.c  |    2 +-
 .../auxiliary/translate/translate_generic.c        |    2 +-
 src/gallium/auxiliary/translate/translate_sse.c    |    2 +-
 src/gallium/auxiliary/util/Makefile                |    2 +-
 src/gallium/auxiliary/util/SConscript              |    2 +-
 src/gallium/auxiliary/util/p_debug.c               |    1 -
 src/gallium/auxiliary/util/u_blit.c                |    5 +-
 src/gallium/auxiliary/util/u_gen_mipmap.c          |    2 +-
 src/gallium/auxiliary/util/u_handle_table.c        |    4 +-
 src/gallium/auxiliary/util/u_hash_table.c          |    5 +-
 src/gallium/auxiliary/util/u_math.h                |  240 +++-
 src/gallium/auxiliary/util/u_memory.h              |  222 ++++
 src/gallium/auxiliary/util/u_mm.c                  |    2 +-
 src/gallium/auxiliary/util/u_pack_color.h          |   36 +-
 src/gallium/auxiliary/util/u_pointer.h             |  107 ++
 src/gallium/auxiliary/util/u_rect.c                |    1 -
 src/gallium/auxiliary/util/u_simple_shaders.c      |    2 +-
 src/gallium/auxiliary/util/u_tile.c                | 1169 ++++++++++++++++++++
 src/gallium/auxiliary/util/u_tile.h                |  101 ++
 src/gallium/drivers/cell/common.h                  |    1 -
 src/gallium/drivers/cell/ppu/cell_clear.c          |    2 +-
 src/gallium/drivers/cell/ppu/cell_context.c        |    2 +-
 src/gallium/drivers/cell/ppu/cell_pipe_state.c     |    2 +-
 src/gallium/drivers/cell/ppu/cell_render.c         |    2 +-
 src/gallium/drivers/cell/ppu/cell_screen.c         |    2 +-
 src/gallium/drivers/cell/ppu/cell_state_derived.c  |    2 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |    2 +-
 src/gallium/drivers/cell/ppu/cell_state_shader.c   |    2 +-
 src/gallium/drivers/cell/ppu/cell_surface.c        |    2 +-
 src/gallium/drivers/cell/ppu/cell_texture.c        |    2 +-
 src/gallium/drivers/cell/ppu/cell_winsys.c         |    2 +-
 src/gallium/drivers/cell/spu/spu_exec.c            |    1 -
 src/gallium/drivers/cell/spu/spu_tri.c             |    1 -
 src/gallium/drivers/cell/spu/spu_util.c            |    1 -
 src/gallium/drivers/cell/spu/spu_vertex_fetch.c    |    1 -
 src/gallium/drivers/cell/spu/spu_vertex_shader.c   |    1 -
 src/gallium/drivers/failover/fo_context.c          |    2 +-
 src/gallium/drivers/i915simple/i915_context.c      |    2 +-
 src/gallium/drivers/i915simple/i915_debug_fp.c     |    2 +-
 src/gallium/drivers/i915simple/i915_fpc.h          |    1 -
 .../drivers/i915simple/i915_fpc_translate.c        |    2 +
 src/gallium/drivers/i915simple/i915_prim_emit.c    |    4 +-
 src/gallium/drivers/i915simple/i915_prim_vbuf.c    |    3 +-
 src/gallium/drivers/i915simple/i915_screen.c       |    2 +-
 src/gallium/drivers/i915simple/i915_state.c        |    3 +-
 .../drivers/i915simple/i915_state_derived.c        |    2 +-
 .../drivers/i915simple/i915_state_dynamic.c        |    4 +-
 .../drivers/i915simple/i915_state_immediate.c      |    2 +-
 .../drivers/i915simple/i915_state_sampler.c        |    2 +-
 src/gallium/drivers/i915simple/i915_surface.c      |    3 +-
 src/gallium/drivers/i915simple/i915_texture.c      |    3 +-
 src/gallium/drivers/i965simple/brw_cc.c            |    6 +-
 src/gallium/drivers/i965simple/brw_clip_state.c    |    3 +-
 src/gallium/drivers/i965simple/brw_context.c       |    2 +-
 src/gallium/drivers/i965simple/brw_curbe.c         |    3 +-
 src/gallium/drivers/i965simple/brw_draw_upload.c   |    1 +
 src/gallium/drivers/i965simple/brw_gs_state.c      |    3 +-
 src/gallium/drivers/i965simple/brw_screen.c        |    2 +-
 src/gallium/drivers/i965simple/brw_sf_state.c      |    5 +-
 src/gallium/drivers/i965simple/brw_shader_info.c   |    2 +-
 src/gallium/drivers/i965simple/brw_state.c         |    2 +-
 src/gallium/drivers/i965simple/brw_state_batch.c   |    2 +-
 src/gallium/drivers/i965simple/brw_state_cache.c   |    2 +-
 src/gallium/drivers/i965simple/brw_state_pool.c    |    3 +-
 src/gallium/drivers/i965simple/brw_state_upload.c  |    2 +-
 src/gallium/drivers/i965simple/brw_surface.c       |    3 +-
 src/gallium/drivers/i965simple/brw_tex_layout.c    |    8 +-
 src/gallium/drivers/i965simple/brw_vs_state.c      |    3 +-
 src/gallium/drivers/i965simple/brw_wm.c            |    2 +-
 src/gallium/drivers/i965simple/brw_wm_decl.c       |    3 +-
 src/gallium/drivers/i965simple/brw_wm_glsl.c       |    3 +-
 .../drivers/i965simple/brw_wm_sampler_state.c      |    3 +-
 src/gallium/drivers/i965simple/brw_wm_state.c      |    3 +-
 src/gallium/drivers/softpipe/sp_context.c          |    2 +-
 src/gallium/drivers/softpipe/sp_fs_exec.c          |    2 +-
 src/gallium/drivers/softpipe/sp_fs_llvm.c          |    2 +-
 src/gallium/drivers/softpipe/sp_fs_sse.c           |    2 +-
 src/gallium/drivers/softpipe/sp_prim_setup.c       |    2 +-
 src/gallium/drivers/softpipe/sp_prim_vbuf.c        |    1 +
 src/gallium/drivers/softpipe/sp_quad_alpha_test.c  |    2 +-
 src/gallium/drivers/softpipe/sp_quad_blend.c       |   29 +-
 src/gallium/drivers/softpipe/sp_quad_bufloop.c     |    2 +-
 src/gallium/drivers/softpipe/sp_quad_colormask.c   |    3 +-
 src/gallium/drivers/softpipe/sp_quad_coverage.c    |    2 +-
 src/gallium/drivers/softpipe/sp_quad_depth_test.c  |    2 +-
 src/gallium/drivers/softpipe/sp_quad_earlyz.c      |    2 +-
 src/gallium/drivers/softpipe/sp_quad_fs.c          |    3 +-
 src/gallium/drivers/softpipe/sp_quad_occlusion.c   |    2 +-
 src/gallium/drivers/softpipe/sp_quad_output.c      |    2 +-
 src/gallium/drivers/softpipe/sp_quad_stencil.c     |    2 +-
 src/gallium/drivers/softpipe/sp_quad_stipple.c     |    2 +-
 src/gallium/drivers/softpipe/sp_query.c            |    2 +-
 src/gallium/drivers/softpipe/sp_screen.c           |    2 +-
 src/gallium/drivers/softpipe/sp_setup.c            |    2 +-
 src/gallium/drivers/softpipe/sp_state_blend.c      |    2 +-
 src/gallium/drivers/softpipe/sp_state_derived.c    |    3 +-
 src/gallium/drivers/softpipe/sp_state_fs.c         |    2 +-
 src/gallium/drivers/softpipe/sp_state_rasterizer.c |    2 +-
 src/gallium/drivers/softpipe/sp_state_sampler.c    |    2 +-
 src/gallium/drivers/softpipe/sp_surface.c          |    3 +-
 src/gallium/drivers/softpipe/sp_tex_sample.c       |    2 +-
 src/gallium/drivers/softpipe/sp_texture.c          |    3 +-
 src/gallium/drivers/softpipe/sp_tile_cache.c       |    4 +-
 src/gallium/drivers/trace/tr_context.c             |    2 +-
 src/gallium/drivers/trace/tr_dump.c                |    2 +
 src/gallium/drivers/trace/tr_dump.h                |    1 -
 src/gallium/drivers/trace/tr_screen.c              |    2 +-
 src/gallium/drivers/trace/tr_state.c               |    1 +
 src/gallium/drivers/trace/tr_stream_stdc.c         |    2 +-
 src/gallium/drivers/trace/tr_stream_wd.c           |    2 +-
 src/gallium/drivers/trace/tr_texture.c             |    2 +-
 src/gallium/drivers/trace/tr_winsys.c              |    3 +-
 src/gallium/include/pipe/p_util.h                  |  460 --------
 src/gallium/state_trackers/python/gallium.i        |    2 +-
 src/gallium/state_trackers/python/st_device.c      |    3 +-
 src/gallium/state_trackers/python/st_sample.c      |    5 +-
 .../state_trackers/python/st_softpipe_winsys.c     |    3 +-
 .../winsys/drm/intel/common/intel_be_device.c      |    2 +-
 .../winsys/drm/intel/dri/intel_winsys_softpipe.c   |    2 +-
 src/gallium/winsys/egl_xlib/egl_xlib.c             |    2 +-
 src/gallium/winsys/egl_xlib/sw_winsys.c            |    3 +-
 src/gallium/winsys/gdi/wmesa.c                     |    2 +-
 src/gallium/winsys/xlib/brw_aub.c                  |    1 -
 src/gallium/winsys/xlib/xm_winsys.c                |    3 +-
 src/gallium/winsys/xlib/xm_winsys_aub.c            |    2 +-
 src/mesa/state_tracker/acc2.c                      |  319 ++++++
 src/mesa/state_tracker/st_cb_accum.c               |    2 +-
 src/mesa/state_tracker/st_cb_bitmap.c              |    2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c          |    2 +-
 src/mesa/state_tracker/st_cb_readpixels.c          |    2 +-
 src/mesa/state_tracker/st_cb_texture.c             |    2 +-
 src/mesa/state_tracker/st_program.c                |    2 +-
 src/mesa/state_tracker/st_texture.c                |    1 -
 201 files changed, 2453 insertions(+), 686 deletions(-)
 create mode 100644 src/gallium/auxiliary/util/u_memory.h
 create mode 100644 src/gallium/auxiliary/util/u_pointer.h
 create mode 100644 src/gallium/auxiliary/util/u_tile.c
 create mode 100644 src/gallium/auxiliary/util/u_tile.h
 delete mode 100644 src/gallium/include/pipe/p_util.h
 create mode 100644 src/mesa/state_tracker/acc2.c

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/README.portability b/src/gallium/README.portability
index d5d5987a7f..adecf4bb79 100644
--- a/src/gallium/README.portability
+++ b/src/gallium/README.portability
@@ -35,8 +35,8 @@ not available in Windows Kernel Mode. Use the appropriate p_*.h include.
 
 * Use MALLOC, CALLOC, FREE instead of the malloc, calloc, free functions.
 
-* Use align_pointer() function defined in p_util.h for aligning pointers in a
-portable way.
+* Use align_pointer() function defined in u_memory.h for aligning pointers
+ in a portable way.
 
 == Debugging ==
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
index 36dc46ff80..6b1754ea00 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -28,9 +28,10 @@
 /* Authors:  Zack Rusin <zack@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
 #include "pipe/p_debug.h"
 
+#include "util/u_memory.h"
+
 #include "cso_cache.h"
 #include "cso_hash.h"
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 86e4d46a20..f22ba40824 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -36,7 +36,7 @@
   */
 
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "tgsi/tgsi_parse.h"
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index 0646efd952..7f0044c5a7 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -31,7 +31,7 @@
   */
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "cso_hash.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 2f263cf06a..1c26cb31a3 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -31,7 +31,8 @@
   */
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
 #include "draw_context.h"
 #include "draw_vbuf.h"
 #include "draw_vs.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 1db43876ef..3cde9d36d3 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -30,7 +30,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pipe.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 991304b2c8..20841bb5d6 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -32,11 +32,12 @@
  */
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index c7f4349cb3..2c1cacbdb4 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -38,7 +38,6 @@
  */
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
@@ -47,6 +46,9 @@
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
 
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "draw_context.h"
 #include "draw_vs.h"
 #include "draw_pipe.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index fa10f8efca..3265dcd154 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -32,7 +32,9 @@
  */
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
 #include "pipe/p_shader_tokens.h"
 
 #include "draw_vs.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c
index d0d22a38e0..053be5f050 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -33,7 +33,7 @@
  */
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "draw_pipe.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
index 4741b22d02..43d1fecc4d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
@@ -28,7 +28,9 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "pipe/p_shader_tokens.h"
 #include "draw_vs.h"
 #include "draw_pipe.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c
index 2f5865741c..1fea5e6dcb 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_offset.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -32,7 +32,8 @@
  * \author  Brian Paul
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "draw_pipe.h"
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index e97136fa1f..b764d9c518 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -34,12 +34,14 @@
  */
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index bf0db18a68..b65e2aa102 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -36,10 +36,12 @@
  */
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
-#include "draw_pipe.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "draw/draw_pipe.h"
 
 
 /** Subclass of draw_stage */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
index 3ac825f565..c329d92339 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
@@ -28,7 +28,8 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_vs.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index 8f97fdedaa..68835fd1a5 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -33,7 +33,7 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "draw_private.h"
 #include "draw_pipe.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_util.c b/src/gallium/auxiliary/draw/draw_pipe_util.c
index 04438f4dd0..e22e5fed0c 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_util.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_util.c
@@ -30,7 +30,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pipe.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c
index 6be1d369c3..f34c68728e 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_validate.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c
@@ -28,7 +28,7 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "draw_private.h"
 #include "draw_pipe.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index a6fde77a0e..c0cf4269db 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -35,7 +35,8 @@
 
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "draw_vbuf.h"
 #include "draw_private.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
index 48ec2f1239..184e363594 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
@@ -28,9 +28,10 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "draw_private.h"
 #include "draw_pipe.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index 54590984c6..4f1326053d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -28,7 +28,8 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_vs.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 85a75525c8..669c11c993 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -30,7 +30,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 40f05cb9e0..d4eca80588 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index 07f4c99164..6377f896fb 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 4a1f3b0953..0684c93d10 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -30,7 +30,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index fdf9b6fe6a..87094f3092 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -31,7 +31,8 @@
   */
 
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index be3535ed9e..f617aac9f7 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -25,7 +25,8 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index af6306b1c6..96dc706b99 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_context.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c
index 32c8a9632c..3bc7939c55 100644
--- a/src/gallium/auxiliary/draw/draw_pt_util.c
+++ b/src/gallium/auxiliary/draw/draw_pt_util.c
@@ -30,7 +30,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c
index 46e722a154..c15afe65f1 100644
--- a/src/gallium/auxiliary/draw/draw_pt_varray.c
+++ b/src/gallium/auxiliary/draw/draw_pt_varray.c
@@ -25,7 +25,9 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
index cda2987c9e..b8b5de729d 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c
@@ -30,7 +30,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h
index e90f37872a..62247ccd9f 100644
--- a/src/gallium/auxiliary/draw/draw_vbuf.h
+++ b/src/gallium/auxiliary/draw/draw_vbuf.h
@@ -37,8 +37,6 @@
 #define DRAW_VBUF_H_
 
 
-#include "pipe/p_util.h"
-
 
 struct draw_context;
 struct vertex_info;
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index f798b20492..34adbd49b0 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -31,11 +31,15 @@
   *   Brian Paul
   */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "pipe/p_shader_tokens.h"
+
 #include "draw_private.h"
 #include "draw_context.h"
 #include "draw_vs.h"
+
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 41bdd012d5..760fcb389f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -29,9 +29,9 @@
  */
 
 
-#include "pipe/p_util.h"
-#include "pipe/p_shader_tokens.h"
+#include "util/u_memory.h"
 #include "util/u_math.h"
+#include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_exec.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index eda677cc62..ab3c5b94a5 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
index e029b7b4bb..b358bd2df4 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
@@ -29,8 +29,9 @@
 #include "pipe/p_config.h"
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_exec.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index e26903d8cc..44563803f9 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -31,7 +31,8 @@
   *   Brian Paul
   */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 
 #include "draw_private.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index fc03473b91..2ce30b9a02 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -32,7 +32,6 @@
   *   Brian Paul
   */
 
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_private.h"
 #include "draw_context.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 61f0c084c3..0efabd9de8 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -31,13 +31,14 @@
   *   Brian Paul
   */
 
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_config.h"
 
 #include "draw_vs.h"
 
 #if defined(PIPE_ARCH_X86)
 
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 
 #include "draw_private.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index 994ce3e889..4daf05dae7 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -30,7 +30,8 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index cf5b978837..e64bfb1c6c 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -41,11 +41,12 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/p_util.h"
 
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_dump.h"
 
+#include "util/u_memory.h"
+
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
 #include <llvm/Constants.h>
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index 035224e8f3..a82dc30306 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -35,7 +35,7 @@
 
 #include "storage.h"
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include <llvm/CallingConv.h>
 #include <llvm/Constants.h>
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index 76049ade7c..efddc04e81 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -29,7 +29,7 @@
 #include "storagesoa.h"
 
 #include "pipe/p_shader_tokens.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include <llvm/CallingConv.h>
 #include <llvm/Constants.h>
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index ce41418a0f..8ae052e875 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -45,7 +45,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 
 #include "pb_buffer.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
index e90d2e5623..20fc87b39d 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
@@ -35,7 +35,7 @@
 
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pb_buffer.h"
 #include "pb_bufmgr.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
index 0d2d6c0c1b..2afaeafa1a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
@@ -35,7 +35,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "pb_buffer.h"
 #include "pb_bufmgr.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index bed4bec4fe..b914c2d0fe 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -38,7 +38,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_time.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index d02e3500ff..5e518370d0 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -37,7 +37,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_time.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index 05efd8ce41..8fc63ce648 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -35,7 +35,7 @@
 
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "pb_buffer.h"
 #include "pb_buffer_fenced.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index c51e582611..b40eb6cc90 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -36,7 +36,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_mm.h"
 #include "pb_buffer.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index 95af08929a..93d2cc9635 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -39,7 +39,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 
 #include "pb_buffer.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index 598d9ce310..af307e265a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -39,7 +39,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_time.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.c b/src/gallium/auxiliary/pipebuffer/pb_validate.c
index 362fd896f3..1e54fc39d4 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_validate.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_validate.c
@@ -35,7 +35,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_error.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_debug.h"
 
 #include "pb_buffer.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_winsys.c b/src/gallium/auxiliary/pipebuffer/pb_winsys.c
index 978944091f..28d137dbc4 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_winsys.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_winsys.c
@@ -35,7 +35,7 @@
 
 
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "pb_buffer.h"
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 300c1c2d9d..dfa5c35ab6 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -33,7 +33,7 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "rtasm_execmem.h"
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 7f6bf577b2..285ddc0e3f 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -30,7 +30,7 @@
  */
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "rtasm_ppc_spe.h"
 
 #ifdef GALLIUM_CELL
diff --git a/src/gallium/auxiliary/sct/sct.c b/src/gallium/auxiliary/sct/sct.c
index 5e4126e014..49bb7ea92e 100644
--- a/src/gallium/auxiliary/sct/sct.c
+++ b/src/gallium/auxiliary/sct/sct.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_state.h"
 #include "pipe/p_inlines.h"
 #include "sct.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 050b448fe7..74614d3688 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_build.h"
 #include "tgsi_parse.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h
index 6ae7f324f8..7d6234746a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.h
@@ -28,6 +28,10 @@
 #ifndef TGSI_BUILD_H
 #define TGSI_BUILD_H
 
+
+struct tgsi_token;
+
+
 #if defined __cplusplus
 extern "C" {
 #endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
index 1025866a25..be25cb45a0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "util/u_string.h"
 #include "tgsi_dump_c.h"
 #include "tgsi_build.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index e28b56c842..fb573fe1f0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -52,11 +52,11 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
+#include "util/u_memory.h"
 #include "util/u_math.h"
 
 #define FAST_MATH 1
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index d16f0cdcad..3757486ba9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -26,10 +26,10 @@
  **************************************************************************/
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
 #include "tgsi_build.h"
+#include "util/u_memory.h"
 
 void
 tgsi_full_token_init(
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 59bcf10b53..be4870a498 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -33,11 +33,11 @@
  */
 
 
-#include "tgsi_scan.h"
-#include "tgsi/tgsi_parse.h"
+#include "util/u_math.h"
 #include "tgsi/tgsi_build.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
 
-#include "pipe/p_util.h"
 
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 00ed4da450..626724ad4e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,7 +25,7 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "tgsi/tgsi_parse.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c b/src/gallium/auxiliary/tgsi/tgsi_transform.c
index 357f77b05a..ea87da31e5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c
@@ -31,6 +31,7 @@
  * Authors:  Brian Paul
  */
 
+#include "pipe/p_debug.h"
 
 #include "tgsi_transform.h"
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index 3da0b38271..a121adbaef 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -29,7 +29,6 @@
 #define TGSI_TRANSFORM_H
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_build.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 09486e649e..50101a9bb0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
 #include "tgsi_build.h"
diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c
index b93fbf9033..7678903f75 100644
--- a/src/gallium/auxiliary/translate/translate.c
+++ b/src/gallium/auxiliary/translate/translate.c
@@ -31,7 +31,6 @@
   */
 
 #include "pipe/p_config.h"
-#include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "translate.h"
 
diff --git a/src/gallium/auxiliary/translate/translate_cache.c b/src/gallium/auxiliary/translate/translate_cache.c
index 115dc9287e..d8069a149c 100644
--- a/src/gallium/auxiliary/translate/translate_cache.c
+++ b/src/gallium/auxiliary/translate/translate_cache.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_state.h"
 #include "translate.h"
 #include "translate_cache.h"
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 4c8179ffa8..4d336f47ea 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -30,7 +30,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_state.h"
 #include "translate.h"
 
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index 18a212ac1c..7955186e16 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -28,7 +28,7 @@
 
 #include "pipe/p_config.h"
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_simple_list.h"
 
 #include "translate.h"
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index 6eebf6d29b..6e5fd26c05 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -5,7 +5,6 @@ LIBNAME = util
 
 C_SOURCES = \
 	p_debug.c \
-	p_tile.c \
 	u_blit.c \
 	u_draw_quad.c \
 	u_gen_mipmap.c \
@@ -16,6 +15,7 @@ C_SOURCES = \
 	u_rect.c \
 	u_simple_shaders.c \
 	u_snprintf.c \
+	u_tile.c \
 	u_time.c
 
 include ../../Makefile.template
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index 94382fe1f9..ce3fad7068 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -6,7 +6,6 @@ util = env.ConvenienceLibrary(
 		'p_debug.c',
 		'p_debug_mem.c',
 		'p_debug_prof.c',
-		'p_tile.c',
 		'u_blit.c',
 		'u_draw_quad.c',
 		'u_gen_mipmap.c',
@@ -17,6 +16,7 @@ util = env.ConvenienceLibrary(
 		'u_rect.c',
 		'u_simple_shaders.c',
 		'u_snprintf.c',
+		'u_tile.c',
 		'u_time.c',
 	])
 
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index 2c2f2f8931..7d1dba5a24 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -51,7 +51,6 @@
 #endif
 
 #include "pipe/p_compiler.h" 
-#include "pipe/p_util.h" 
 #include "pipe/p_debug.h" 
 #include "pipe/p_format.h" 
 #include "pipe/p_state.h" 
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index ae087df4cf..05399f9885 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -37,12 +37,13 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "util/u_draw_quad.h"
 #include "util/u_blit.h"
+#include "util/u_draw_quad.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 
 #include "cso_cache/cso_context.h"
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index 8713ff5d58..c1e2c19f87 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -37,10 +37,10 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_shader_tokens.h"
 
+#include "util/u_memory.h"
 #include "util/u_draw_quad.h"
 #include "util/u_gen_mipmap.h"
 #include "util/u_simple_shaders.h"
diff --git a/src/gallium/auxiliary/util/u_handle_table.c b/src/gallium/auxiliary/util/u_handle_table.c
index 2176a00959..2c40011923 100644
--- a/src/gallium/auxiliary/util/u_handle_table.c
+++ b/src/gallium/auxiliary/util/u_handle_table.c
@@ -35,9 +35,9 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 
-#include "u_handle_table.h"
+#include "util/u_memory.h"
+#include "util/u_handle_table.h"
 
 
 #define HANDLE_TABLE_INITIAL_SIZE 16  
diff --git a/src/gallium/auxiliary/util/u_hash_table.c b/src/gallium/auxiliary/util/u_hash_table.c
index dd5eca7fca..0bc8de9632 100644
--- a/src/gallium/auxiliary/util/u_hash_table.c
+++ b/src/gallium/auxiliary/util/u_hash_table.c
@@ -40,10 +40,11 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 
 #include "cso_cache/cso_hash.h"
-#include "u_hash_table.h"
+
+#include "util/u_memory.h"
+#include "util/u_hash_table.h"
 
 
 struct hash_table
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index a541d30a5d..9b4ca39371 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -40,8 +40,6 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
-#include "util/u_math.h"
 
 
 #ifdef __cplusplus
@@ -49,6 +47,132 @@ extern "C" {
 #endif
 
 
+#if defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+__inline double ceil(double val)
+{
+   double ceil_val;
+
+   if((val - (long) val) == 0) {
+      ceil_val = val;
+   }
+   else {
+      if(val > 0) {
+         ceil_val = (long) val + 1;
+      }
+      else {
+         ceil_val = (long) val;
+      }
+   }
+
+   return ceil_val;
+}
+
+#ifndef PIPE_SUBSYSTEM_WINDOWS_CE
+__inline double floor(double val)
+{
+   double floor_val;
+
+   if((val - (long) val) == 0) {
+      floor_val = val;
+   }
+   else {
+      if(val > 0) {
+         floor_val = (long) val;
+      }
+      else {
+         floor_val = (long) val - 1;
+      }
+   }
+
+   return floor_val;
+}
+#endif
+
+#pragma function(pow)
+__inline double __cdecl pow(double val, double exponent)
+{
+   /* XXX */
+   assert(0);
+   return 0;
+}
+
+#pragma function(log)
+__inline double __cdecl log(double val)
+{
+   /* XXX */
+   assert(0);
+   return 0;
+}
+
+#pragma function(atan2)
+__inline double __cdecl atan2(double val)
+{
+   /* XXX */
+   assert(0);
+   return 0;
+}
+#else
+#include <math.h>
+#include <stdarg.h>
+#endif
+
+
+#if defined(_MSC_VER) 
+#if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+ 
+static INLINE float cosf( float f ) 
+{
+   return (float) cos( (double) f );
+}
+
+static INLINE float sinf( float f ) 
+{
+   return (float) sin( (double) f );
+}
+
+static INLINE float ceilf( float f ) 
+{
+   return (float) ceil( (double) f );
+}
+
+static INLINE float floorf( float f ) 
+{
+   return (float) floor( (double) f );
+}
+
+static INLINE float powf( float f, float g ) 
+{
+   return (float) pow( (double) f, (double) g );
+}
+
+static INLINE float sqrtf( float f ) 
+{
+   return (float) sqrt( (double) f );
+}
+
+static INLINE float fabsf( float f ) 
+{
+   return (float) fabs( (double) f );
+}
+
+static INLINE float logf( float f ) 
+{
+   return (float) log( (double) f );
+}
+
+#else
+/* Work-around an extra semi-colon in VS 2005 logf definition */
+#ifdef logf
+#undef logf
+#define logf(x) ((float)log((double)(x)))
+#endif /* logf */
+#endif
+#endif /* _MSC_VER */
+
+
+
+
+
 #define POW2_TABLE_SIZE 256
 #define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1))
 extern float pow2_table[POW2_TABLE_SIZE];
@@ -59,6 +183,11 @@ extern void
 util_init_math(void);
 
 
+union fi {
+   float f;
+   int i;
+   unsigned ui;
+};
 
 
 /**
@@ -195,6 +324,113 @@ util_iround(float f)
 
 
+#if defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
+/**
+ * Find first bit set in word.  Least significant bit is 1.
+ * Return 0 if no bits set.
+ */
+static INLINE
+unsigned ffs( unsigned u )
+{
+   unsigned i;
+
+   if( u == 0 ) {
+      return 0;
+   }
+
+   __asm bsf eax, [u]
+   __asm inc eax
+   __asm mov [i], eax
+
+   return i;
+}
+#endif
+
+
+/**
+ * Return float bits.
+ */
+static INLINE unsigned
+fui( float f )
+{
+   union fi fi;
+   fi.f = f;
+   return fi.ui;
+}
+
+
+
+static INLINE float
+ubyte_to_float(ubyte ub)
+{
+   return (float) ub * (1.0f / 255.0f);
+}
+
+
+/**
+ * Convert float in [0,1] to ubyte in [0,255] with clamping.
+ */
+static INLINE ubyte
+float_to_ubyte(float f)
+{
+   const int ieee_0996 = 0x3f7f0000;   /* 0.996 or so */
+   union fi tmp;
+
+   tmp.f = f;
+   if (tmp.i < 0) {
+      return (ubyte) 0;
+   }
+   else if (tmp.i >= ieee_0996) {
+      return (ubyte) 255;
+   }
+   else {
+      tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f;
+      return (ubyte) tmp.i;
+   }
+}
+
+
+
+#define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )
+
+#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
+#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
+
+
+static INLINE int
+align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1);
+}
+
+
+#ifndef COPY_4V
+#define COPY_4V( DST, SRC )         \
+do {                                \
+   (DST)[0] = (SRC)[0];             \
+   (DST)[1] = (SRC)[1];             \
+   (DST)[2] = (SRC)[2];             \
+   (DST)[3] = (SRC)[3];             \
+} while (0)
+#endif
+
+
+#ifndef COPY_4FV
+#define COPY_4FV( DST, SRC )  COPY_4V(DST, SRC)
+#endif
+
+
+#ifndef ASSIGN_4V
+#define ASSIGN_4V( DST, V0, V1, V2, V3 ) \
+do {                                     \
+   (DST)[0] = (V0);                      \
+   (DST)[1] = (V1);                      \
+   (DST)[2] = (V2);                      \
+   (DST)[3] = (V3);                      \
+} while (0)
+#endif
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
new file mode 100644
index 0000000000..148a5cb997
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -0,0 +1,222 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Memory functions
+ */
+
+
+#ifndef U_MEMORY_H
+#define U_MEMORY_H
+
+
+#include "util/u_pointer.h"
+
+
+ /* Define ENOMEM for WINCE */ 
+#if (_WIN32_WCE < 600)
+#ifndef ENOMEM
+#define ENOMEM 12
+#endif
+#endif
+
+
+
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && defined(DEBUG) 
+
+/* memory debugging */
+
+#include "p_debug.h"
+
+#define MALLOC( _size ) \
+   debug_malloc( __FILE__, __LINE__, __FUNCTION__, _size )
+#define CALLOC( _count, _size ) \
+   debug_calloc(__FILE__, __LINE__, __FUNCTION__, _count, _size )
+#define FREE( _ptr ) \
+   debug_free( __FILE__, __LINE__, __FUNCTION__,  _ptr )
+#define REALLOC( _ptr, _old_size, _size ) \
+   debug_realloc( __FILE__, __LINE__, __FUNCTION__,  _ptr, _old_size, _size )
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
+
+void * __stdcall
+EngAllocMem(
+    unsigned long Flags,
+    unsigned long MemSize,
+    unsigned long Tag );
+
+void __stdcall
+EngFreeMem(
+    void *Mem );
+
+#define MALLOC( _size ) EngAllocMem( 0, _size, 'D3AG' )
+#define _FREE( _ptr ) EngFreeMem( _ptr )
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+
+void *
+ExAllocatePool(
+    unsigned long PoolType, 
+    size_t NumberOfBytes);
+
+void 
+ExFreePool(void *P);
+
+#define MALLOC(_size) ExAllocatePool(0, _size)
+#define _FREE(_ptr) ExFreePool(_ptr)
+
+#else
+
+#define MALLOC( SIZE )  malloc( SIZE )
+#define CALLOC( COUNT, SIZE )   calloc( COUNT, SIZE )
+#define FREE( PTR )  free( PTR )
+#define REALLOC( OLDPTR, OLDSIZE, NEWSIZE )  realloc( OLDPTR, NEWSIZE )
+
+#endif
+
+
+#ifndef CALLOC
+static INLINE void *
+CALLOC( unsigned count, unsigned size )
+{
+   void *ptr = MALLOC( count * size );
+   if( ptr ) {
+      memset( ptr, 0, count * size );
+   }
+   return ptr;
+}
+#endif /* !CALLOC */
+
+#ifndef FREE
+static INLINE void
+FREE( void *ptr )
+{
+   if( ptr ) {
+      _FREE( ptr );
+   }
+}
+#endif /* !FREE */
+
+#ifndef REALLOC
+static INLINE void *
+REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
+{
+   void *new_ptr = NULL;
+
+   if (new_size != 0) {
+      unsigned copy_size = old_size < new_size ? old_size : new_size;
+      new_ptr = MALLOC( new_size );
+      if (new_ptr && old_ptr && copy_size) {
+         memcpy( new_ptr, old_ptr, copy_size );
+      }
+   }
+
+   FREE( old_ptr );
+   return new_ptr;
+}
+#endif /* !REALLOC */
+
+
+#define MALLOC_STRUCT(T)   (struct T *) MALLOC(sizeof(struct T))
+
+#define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
+
+
+/**
+ * Return memory on given byte alignment
+ */
+static INLINE void *
+align_malloc(size_t bytes, uint alignment)
+{
+#if defined(HAVE_POSIX_MEMALIGN)
+   void *mem;
+   alignment = (alignment + (uint)sizeof(void*) - 1) & ~((uint)sizeof(void*) - 1);
+   if(posix_memalign(& mem, alignment, bytes) != 0)
+      return NULL;
+   return mem;
+#else
+   char *ptr, *buf;
+
+   assert( alignment > 0 );
+
+   ptr = (char *) MALLOC(bytes + alignment + sizeof(void *));
+   if (!ptr)
+      return NULL;
+
+   buf = (char *) align_pointer( ptr + sizeof(void *), alignment );
+   *(char **)(buf - sizeof(void *)) = ptr;
+
+   return buf;
+#endif /* defined(HAVE_POSIX_MEMALIGN) */
+}
+
+/**
+ * Free memory returned by align_malloc().
+ */
+static INLINE void
+align_free(void *ptr)
+{
+#if defined(HAVE_POSIX_MEMALIGN)
+   FREE(ptr);
+#else
+   void **cubbyHole = (void **) ((char *) ptr - sizeof(void *));
+   void *realAddr = *cubbyHole;
+   FREE(realAddr);
+#endif /* defined(HAVE_POSIX_MEMALIGN) */
+}
+
+
+/**
+ * Duplicate a block of memory.
+ */
+static INLINE void *
+mem_dup(const void *src, uint size)
+{
+   void *dup = MALLOC(size);
+   if (dup)
+      memcpy(dup, src, size);
+   return dup;
+}
+
+
+/**
+ * Number of elements in an array.
+ */
+#ifndef Elements
+#define Elements(x) (sizeof(x)/sizeof((x)[0]))
+#endif
+
+
+/**
+ * Offset of a field in a struct, in bytes.
+ */
+#define Offset(TYPE, MEMBER) ((unsigned)&(((TYPE *)NULL)->MEMBER))
+
+
+
+#endif /* U_MEMORY_H */
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index b49ae074e0..0f51dd5977 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -24,9 +24,9 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
 #include "pipe/p_debug.h"
 
+#include "util/u_memory.h"
 #include "util/u_mm.h"
 
 
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index 06abb34d5a..39e4ae9d07 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -37,6 +37,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
+#include "util/u_math.h"
 
 
 /**
@@ -150,10 +151,10 @@ util_pack_color(const float rgba[4], enum pipe_format format, void *dest)
 
    if (pf_size_x(format) <= 8) {
       /* format uses 8-bit components or less */
-      UNCLAMPED_FLOAT_TO_UBYTE(r, rgba[0]);
-      UNCLAMPED_FLOAT_TO_UBYTE(g, rgba[1]);
-      UNCLAMPED_FLOAT_TO_UBYTE(b, rgba[2]);
-      UNCLAMPED_FLOAT_TO_UBYTE(a, rgba[3]);
+      r = float_to_ubyte(rgba[0]);
+      g = float_to_ubyte(rgba[1]);
+      b = float_to_ubyte(rgba[2]);
+      a = float_to_ubyte(rgba[3]);
    }
 
    switch (format) {
@@ -286,4 +287,31 @@ util_pack_z(enum pipe_format format, double z)
 }
 
 
+/**
+ * Pack 4 ubytes into a 4-byte word
+ */
+static INLINE unsigned
+pack_ub4(ubyte b0, ubyte b1, ubyte b2, ubyte b3)
+{
+   return ((((unsigned int)b0) << 0) |
+	   (((unsigned int)b1) << 8) |
+	   (((unsigned int)b2) << 16) |
+	   (((unsigned int)b3) << 24));
+}
+
+
+/**
+ * Pack/convert 4 floats into one 4-byte word.
+ */
+static INLINE unsigned
+pack_ui32_float4(float a, float b, float c, float d)
+{
+   return pack_ub4( float_to_ubyte(a),
+		    float_to_ubyte(b),
+		    float_to_ubyte(c),
+		    float_to_ubyte(d) );
+}
+
+
+
 #endif /* U_PACK_COLOR_H */
diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h
new file mode 100644
index 0000000000..e1af9f11cb
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_pointer.h
@@ -0,0 +1,107 @@
+/**************************************************************************
+ * 
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef U_POINTER_H
+#define U_POINTER_H
+
+#include "pipe/p_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE intptr_t
+pointer_to_intptr( const void *p )
+{
+   union {
+      const void *p;
+      intptr_t i;
+   } pi;
+   pi.p = p;
+   return pi.i;
+}
+
+static INLINE void *
+intptr_to_pointer( intptr_t i )
+{
+   union {
+      void *p;
+      intptr_t i;
+   } pi;
+   pi.i = i;
+   return pi.p;
+}
+
+static INLINE uintptr_t
+pointer_to_uintptr( const void *ptr )
+{
+   union {
+      const void *p;
+      uintptr_t u;
+   } pu;
+   pu.p = ptr;
+   return pu.u;
+}
+
+static INLINE void *
+uintptr_to_pointer( uintptr_t u )
+{
+   union {
+      void *p;
+      uintptr_t u;
+   } pu;
+   pu.u = u;
+   return pu.p;
+}
+
+/**
+ * Return a pointer aligned to next multiple of N bytes.
+ */
+static INLINE void *
+align_pointer( const void *unaligned, uintptr_t alignment )
+{
+   uintptr_t aligned = (pointer_to_uintptr( unaligned ) + alignment - 1) & ~(alignment - 1);
+   return uintptr_to_pointer( aligned );
+}
+
+
+/**
+ * Return a pointer aligned to next multiple of 16 bytes.
+ */
+static INLINE void *
+align16( void *unaligned )
+{
+   return align_pointer( unaligned, 16 );
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_POINTER_H */
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index 94e447b9d5..b31ab5415f 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -31,7 +31,6 @@
 
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_format.h"
 #include "util/u_rect.h"
 
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index c34fb6ee33..f06d13c2c4 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -37,10 +37,10 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_shader_tokens.h"
 
+#include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 
 #include "tgsi/tgsi_build.h"
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
new file mode 100644
index 0000000000..853c503f4f
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -0,0 +1,1169 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * RGBA/float tile get/put functions.
+ * Usable both by drivers and state trackers.
+ * Surfaces should already be in a mapped state.
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_rect.h"
+#include "util/u_tile.h"
+
+
+/**
+ * Move raw block of pixels from surface to user memory.
+ * This should be usable by any hw driver that has mappable surfaces.
+ */
+void
+pipe_get_tile_raw(struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  void *dst, int dst_stride)
+{
+   const void *src;
+
+   if (dst_stride == 0)
+      dst_stride = pf_get_nblocksx(&ps->block, w) * ps->block.size;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   src = pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_READ);
+   assert(src);
+   if(!src)
+      return;
+
+   pipe_copy_rect(dst, &ps->block, dst_stride, 0, 0, w, h, src, ps->stride, x, y);
+
+   pipe_surface_unmap(ps);
+}
+
+
+/**
+ * Move raw block of pixels from user memory to surface.
+ * This should be usable by any hw driver that has mappable surfaces.
+ */
+void
+pipe_put_tile_raw(struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  const void *src, int src_stride)
+{
+   void *dst;
+
+   if (src_stride == 0)
+      src_stride = pf_get_nblocksx(&ps->block, w) * ps->block.size;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   dst = pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_WRITE);
+   assert(dst);
+   if(!dst)
+      return;
+
+   pipe_copy_rect(dst, &ps->block, ps->stride, x, y, w, h, src, src_stride, 0, 0);
+
+   pipe_surface_unmap(ps);
+}
+
+
+
+
+/** Convert short in [-32768,32767] to GLfloat in [-1.0,1.0] */
+#define SHORT_TO_FLOAT(S)   ((2.0F * (S) + 1.0F) * (1.0F/65535.0F))
+
+#define UNCLAMPED_FLOAT_TO_SHORT(us, f)  \
+   us = ( (short) ( CLAMP((f), -1.0, 1.0) * 32767.0F) )
+
+
+
+/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
+
+static void
+a8r8g8b8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const unsigned pixel = *src++;
+         pRow[0] = ubyte_to_float((pixel >> 16) & 0xff);
+         pRow[1] = ubyte_to_float((pixel >>  8) & 0xff);
+         pRow[2] = ubyte_to_float((pixel >>  0) & 0xff);
+         pRow[3] = ubyte_to_float((pixel >> 24) & 0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a8r8g8b8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b, a;
+         r = float_to_ubyte(pRow[0]);
+         g = float_to_ubyte(pRow[1]);
+         b = float_to_ubyte(pRow[2]);
+         a = float_to_ubyte(pRow[3]);
+         *dst++ = (a << 24) | (r << 16) | (g << 8) | b;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
+
+static void
+x8r8g8b8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const unsigned pixel = *src++;
+         pRow[0] = ubyte_to_float((pixel >> 16) & 0xff);
+         pRow[1] = ubyte_to_float((pixel >>  8) & 0xff);
+         pRow[2] = ubyte_to_float((pixel >>  0) & 0xff);
+         pRow[3] = ubyte_to_float(0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+x8r8g8b8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b;
+         r = float_to_ubyte(pRow[0]);
+         g = float_to_ubyte(pRow[1]);
+         b = float_to_ubyte(pRow[2]);
+         *dst++ = (0xff << 24) | (r << 16) | (g << 8) | b;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/
+
+static void
+b8g8r8a8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const unsigned pixel = *src++;
+         pRow[0] = ubyte_to_float((pixel >>  8) & 0xff);
+         pRow[1] = ubyte_to_float((pixel >> 16) & 0xff);
+         pRow[2] = ubyte_to_float((pixel >> 24) & 0xff);
+         pRow[3] = ubyte_to_float((pixel >>  0) & 0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+b8g8r8a8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b, a;
+         r = float_to_ubyte(pRow[0]);
+         g = float_to_ubyte(pRow[1]);
+         b = float_to_ubyte(pRow[2]);
+         a = float_to_ubyte(pRow[3]);
+         *dst++ = (b << 24) | (g << 16) | (r << 8) | a;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_A1R5G5B5_UNORM ***/
+
+static void
+a1r5g5b5_get_tile_rgba(const ushort *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;
+         pRow[0] = ((pixel >> 10) & 0x1f) * (1.0f / 31.0f);
+         pRow[1] = ((pixel >>  5) & 0x1f) * (1.0f / 31.0f);
+         pRow[2] = ((pixel      ) & 0x1f) * (1.0f / 31.0f);
+         pRow[3] = ((pixel >> 15)       ) * 1.0f;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a1r5g5b5_put_tile_rgba(ushort *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b, a;
+         r = float_to_ubyte(pRow[0]);
+         g = float_to_ubyte(pRow[1]);
+         b = float_to_ubyte(pRow[2]);
+         a = float_to_ubyte(pRow[3]);
+         r = r >> 3;  /* 5 bits */
+         g = g >> 3;  /* 5 bits */
+         b = b >> 3;  /* 5 bits */
+         a = a >> 7;  /* 1 bit */
+         *dst++ = (a << 15) | (r << 10) | (g << 5) | b;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_A4R4G4B4_UNORM ***/
+
+static void
+a4r4g4b4_get_tile_rgba(const ushort *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;
+         pRow[0] = ((pixel >>  8) & 0xf) * (1.0f / 15.0f);
+         pRow[1] = ((pixel >>  4) & 0xf) * (1.0f / 15.0f);
+         pRow[2] = ((pixel      ) & 0xf) * (1.0f / 15.0f);
+         pRow[3] = ((pixel >> 12)      ) * (1.0f / 15.0f);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a4r4g4b4_put_tile_rgba(ushort *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b, a;
+         r = float_to_ubyte(pRow[0]);
+         g = float_to_ubyte(pRow[1]);
+         b = float_to_ubyte(pRow[2]);
+         a = float_to_ubyte(pRow[3]);
+         r >>= 4;
+         g >>= 4;
+         b >>= 4;
+         a >>= 4;
+         *dst++ = (a << 12) | (r << 16) | (g << 4) | b;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_R5G6B5_UNORM ***/
+
+static void
+r5g6b5_get_tile_rgba(const ushort *src,
+                     unsigned w, unsigned h,
+                     float *p,
+                     unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;
+         pRow[0] = ((pixel >> 11) & 0x1f) * (1.0f / 31.0f);
+         pRow[1] = ((pixel >>  5) & 0x3f) * (1.0f / 63.0f);
+         pRow[2] = ((pixel      ) & 0x1f) * (1.0f / 31.0f);
+         pRow[3] = 1.0f;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+r5g6b5_put_tile_rgba(ushort *dst,
+                     unsigned w, unsigned h,
+                     const float *p,
+                     unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         uint r = (uint) (CLAMP(pRow[0], 0.0, 1.0) * 31.0);
+         uint g = (uint) (CLAMP(pRow[1], 0.0, 1.0) * 63.0);
+         uint b = (uint) (CLAMP(pRow[2], 0.0, 1.0) * 31.0);
+         *dst++ = (r << 11) | (g << 5) | (b);
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_Z16_UNORM ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z16_get_tile_rgba(const ushort *src,
+                  unsigned w, unsigned h,
+                  float *p,
+                  unsigned dst_stride)
+{
+   const float scale = 1.0f / 65535.0f;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = *src++ * scale;
+      }
+      p += dst_stride;
+   }
+}
+
+
+
+
+/*** PIPE_FORMAT_L8_UNORM ***/
+
+static void
+l8_get_tile_rgba(const ubyte *src,
+                 unsigned w, unsigned h,
+                 float *p,
+                 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] = ubyte_to_float(*src);
+         pRow[3] = 1.0;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+l8_put_tile_rgba(ubyte *dst,
+                 unsigned w, unsigned h,
+                 const float *p,
+                 unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r;
+         r = float_to_ubyte(pRow[0]);
+         *dst++ = r;
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_A8_UNORM ***/
+
+static void
+a8_get_tile_rgba(const ubyte *src,
+                 unsigned w, unsigned h,
+                 float *p,
+                 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] = 0.0;
+         pRow[3] = ubyte_to_float(*src);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a8_put_tile_rgba(ubyte *dst,
+                 unsigned w, unsigned h,
+                 const float *p,
+                 unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned a;
+         a = float_to_ubyte(pRow[3]);
+         *dst++ = a;
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_R16_SNORM ***/
+
+static void
+r16_get_tile_rgba(const short *src,
+                  unsigned w, unsigned h,
+                  float *p,
+                  unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] = SHORT_TO_FLOAT(src[0]);
+         pRow[1] =
+         pRow[2] = 0.0;
+         pRow[3] = 1.0;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+r16_put_tile_rgba(short *dst,
+                  unsigned w, unsigned h,
+                  const float *p,
+                  unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, dst++, pRow += 4) {
+         UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]);
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_R16G16B16A16_SNORM ***/
+
+static void
+r16g16b16a16_get_tile_rgba(const short *src,
+                           unsigned w, unsigned h,
+                           float *p,
+                           unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src += 4, pRow += 4) {
+         pRow[0] = SHORT_TO_FLOAT(src[0]);
+         pRow[1] = SHORT_TO_FLOAT(src[1]);
+         pRow[2] = SHORT_TO_FLOAT(src[2]);
+         pRow[3] = SHORT_TO_FLOAT(src[3]);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+r16g16b16a16_put_tile_rgba(short *dst,
+                           unsigned w, unsigned h,
+                           const float *p,
+                           unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, dst += 4, pRow += 4) {
+         UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]);
+         UNCLAMPED_FLOAT_TO_SHORT(dst[1], pRow[1]);
+         UNCLAMPED_FLOAT_TO_SHORT(dst[2], pRow[2]);
+         UNCLAMPED_FLOAT_TO_SHORT(dst[3], pRow[3]);
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_I8_UNORM ***/
+
+static void
+i8_get_tile_rgba(const ubyte *src,
+                 unsigned w, unsigned h,
+                 float *p,
+                 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = ubyte_to_float(*src);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+i8_put_tile_rgba(ubyte *dst,
+                 unsigned w, unsigned h,
+                 const float *p,
+                 unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r;
+         r = float_to_ubyte(pRow[0]);
+         *dst++ = r;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_A8L8_UNORM ***/
+
+static void
+a8l8_get_tile_rgba(const ushort *src,
+                   unsigned w, unsigned h,
+                   float *p,
+                   unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         ushort p = *src++;
+         pRow[0] =
+         pRow[1] =
+         pRow[2] = ubyte_to_float(p & 0xff);
+         pRow[3] = ubyte_to_float(p >> 8);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a8l8_put_tile_rgba(ushort *dst,
+                   unsigned w, unsigned h,
+                   const float *p,
+                   unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, a;
+         r = float_to_ubyte(pRow[0]);
+         a = float_to_ubyte(pRow[3]);
+         *dst++ = (a << 8) | r;
+      }
+      p += src_stride;
+   }
+}
+
+
+
+
+/*** PIPE_FORMAT_Z32_UNORM ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z32_get_tile_rgba(const unsigned *src,
+                  unsigned w, unsigned h,
+                  float *p,
+                  unsigned dst_stride)
+{
+   const double scale = 1.0 / (double) 0xffffffff;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float) (*src++ * scale);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_S8Z24_UNORM ***/
+
+/**
+ * Return Z component as four float in [0,1].  Stencil part ignored.
+ */
+static void
+s8z24_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   const double scale = 1.0 / ((1 << 24) - 1);
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float) (scale * (*src++ & 0xffffff));
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_Z24S8_UNORM ***/
+
+/**
+ * Return Z component as four float in [0,1].  Stencil part ignored.
+ */
+static void
+z24s8_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   const double scale = 1.0 / ((1 << 24) - 1);
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float) (scale * (*src++ >> 8));
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
+
+/**
+ * Convert YCbCr (or YCrCb) to RGBA.
+ */
+static void
+ycbcr_get_tile_rgba(const ushort *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride,
+                    boolean rev)
+{
+   const float scale = 1.0f / 255.0f;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      /* do two texels at a time */
+      for (j = 0; j < (w & ~1); j += 2, src += 2) {
+         const ushort t0 = src[0];
+         const ushort t1 = src[1];
+         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
+         const ubyte y1 = (t1 >> 8) & 0xff;  /* luminance */
+         ubyte cb, cr;
+         float r, g, b;
+
+         if (rev) {
+            cb = t1 & 0xff;         /* chroma U */
+            cr = t0 & 0xff;         /* chroma V */
+         }
+         else {
+            cb = t0 & 0xff;         /* chroma U */
+            cr = t1 & 0xff;         /* chroma V */
+         }
+
+         /* even pixel: y0,cr,cb */
+         r = 1.164f * (y0-16) + 1.596f * (cr-128);
+         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
+         b = 1.164f * (y0-16) + 2.018f * (cb-128);
+         pRow[0] = r * scale;
+         pRow[1] = g * scale;
+         pRow[2] = b * scale;
+         pRow[3] = 1.0f;
+         pRow += 4;
+
+         /* odd pixel: use y1,cr,cb */
+         r = 1.164f * (y1-16) + 1.596f * (cr-128);
+         g = 1.164f * (y1-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
+         b = 1.164f * (y1-16) + 2.018f * (cb-128);
+         pRow[0] = r * scale;
+         pRow[1] = g * scale;
+         pRow[2] = b * scale;
+         pRow[3] = 1.0f;
+         pRow += 4;
+
+      }
+      /* do the last texel */
+      if (w & 1) {
+         const ushort t0 = src[0];
+         const ushort t1 = src[1];
+         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
+         ubyte cb, cr;
+         float r, g, b;
+
+         if (rev) {
+            cb = t1 & 0xff;         /* chroma U */
+            cr = t0 & 0xff;         /* chroma V */
+         }
+         else {
+            cb = t0 & 0xff;         /* chroma U */
+            cr = t1 & 0xff;         /* chroma V */
+         }
+
+         /* even pixel: y0,cr,cb */
+         r = 1.164f * (y0-16) + 1.596f * (cr-128);
+         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
+         b = 1.164f * (y0-16) + 2.018f * (cb-128);
+         pRow[0] = r * scale;
+         pRow[1] = g * scale;
+         pRow[2] = b * scale;
+         pRow[3] = 1.0f;
+         pRow += 4;
+      }
+      p += dst_stride;
+   }
+}
+
+
+void
+pipe_tile_raw_to_rgba(enum pipe_format format,
+                      void *src,
+                      uint w, uint h,
+                      float *dst, unsigned dst_stride)
+{
+   switch (format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      a8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      x8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      b8g8r8a8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      a1r5g5b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      a4r4g4b4_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      r5g6b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_L8_UNORM:
+      l8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      a8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      i8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_A8L8_UNORM:
+      a8l8_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_R16_SNORM:
+      r16_get_tile_rgba((short *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      r16g16b16a16_get_tile_rgba((short *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      z16_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      z32_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_YCBCR:
+      ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE);
+      break;
+   case PIPE_FORMAT_YCBCR_REV:
+      ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, TRUE);
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+void
+pipe_get_tile_rgba(struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   float *p)
+{
+   unsigned dst_stride = w * 4;
+   void *packed;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   packed = MALLOC(pf_get_nblocks(&ps->block, w, h) * ps->block.size);
+
+   if (!packed)
+      return;
+
+   if(ps->format == PIPE_FORMAT_YCBCR || ps->format == PIPE_FORMAT_YCBCR_REV)
+      assert((x & 1) == 0);
+
+   pipe_get_tile_raw(ps, x, y, w, h, packed, 0);
+
+   pipe_tile_raw_to_rgba(ps->format, packed, w, h, p, dst_stride);
+
+   FREE(packed);
+}
+
+
+void
+pipe_put_tile_rgba(struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   const float *p)
+{
+   unsigned src_stride = w * 4;
+   void *packed;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   packed = MALLOC(pf_get_nblocks(&ps->block, w, h) * ps->block.size);
+
+   if (!packed)
+      return;
+
+   switch (ps->format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      x8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      r5g6b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      assert(0);
+      break;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      a4r4g4b4_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_L8_UNORM:
+      l8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      a8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      i8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_A8L8_UNORM:
+      a8l8_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_R16_SNORM:
+      r16_put_tile_rgba((short *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      r16g16b16a16_put_tile_rgba((short *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      /*z16_put_tile_rgba((ushort *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      /*z32_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
+      break;
+   default:
+      assert(0);
+   }
+
+   pipe_put_tile_raw(ps, x, y, w, h, packed, 0);
+
+   FREE(packed);
+}
+
+
+/**
+ * Get a block of Z values, converted to 32-bit range.
+ */
+void
+pipe_get_tile_z(struct pipe_surface *ps,
+                uint x, uint y, uint w, uint h,
+                uint *z)
+{
+   const uint dstStride = w;
+   ubyte *map;
+   uint *pDest = z;
+   uint i, j;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   map = (ubyte *)pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_READ);
+   if (!map) {
+      assert(0);
+      return;
+   }
+
+   switch (ps->format) {
+   case PIPE_FORMAT_Z32_UNORM:
+      {
+         const uint *pSrc
+            = (const uint *)(map  + y * ps->stride + x*4);
+         for (i = 0; i < h; i++) {
+            memcpy(pDest, pSrc, 4 * w);
+            pDest += dstStride;
+            pSrc += ps->stride/4;
+         }
+      }
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      {
+         const uint *pSrc
+            = (const uint *)(map + y * ps->stride + x*4);
+         for (i = 0; i < h; i++) {
+            for (j = 0; j < w; j++) {
+               /* convert 24-bit Z to 32-bit Z */
+               pDest[j] = (pSrc[j] << 8) | (pSrc[j] & 0xff);
+            }
+            pDest += dstStride;
+            pSrc += ps->stride/4;
+         }
+      }
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      {
+         const ushort *pSrc
+            = (const ushort *)(map + y * ps->stride + x*2);
+         for (i = 0; i < h; i++) {
+            for (j = 0; j < w; j++) {
+               /* convert 16-bit Z to 32-bit Z */
+               pDest[j] = (pSrc[j] << 16) | pSrc[j];
+            }
+            pDest += dstStride;
+            pSrc += ps->stride/2;
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+
+   pipe_surface_unmap(ps);
+}
+
+
+void
+pipe_put_tile_z(struct pipe_surface *ps,
+                uint x, uint y, uint w, uint h,
+                const uint *zSrc)
+{
+   const uint srcStride = w;
+   const uint *pSrc = zSrc;
+   ubyte *map;
+   uint i, j;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   map = (ubyte *)pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_WRITE);
+   if (!map) {
+      assert(0);
+      return;
+   }
+
+   switch (ps->format) {
+   case PIPE_FORMAT_Z32_UNORM:
+      {
+         uint *pDest = (uint *) (map + y * ps->stride + x*4);
+         for (i = 0; i < h; i++) {
+            memcpy(pDest, pSrc, 4 * w);
+            pDest += ps->stride/4;
+            pSrc += srcStride;
+         }
+      }
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      {
+         uint *pDest = (uint *) (map + y * ps->stride + x*4);
+         for (i = 0; i < h; i++) {
+            for (j = 0; j < w; j++) {
+               /* convert 32-bit Z to 24-bit Z (0 stencil) */
+               pDest[j] = pSrc[j] >> 8;
+            }
+            pDest += ps->stride/4;
+            pSrc += srcStride;
+         }
+      }
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      {
+         ushort *pDest = (ushort *) (map + y * ps->stride + x*2);
+         for (i = 0; i < h; i++) {
+            for (j = 0; j < w; j++) {
+               /* convert 32-bit Z to 16-bit Z */
+               pDest[j] = pSrc[j] >> 16;
+            }
+            pDest += ps->stride/2;
+            pSrc += srcStride;
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+
+   pipe_surface_unmap(ps);
+}
+
+
diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h
new file mode 100644
index 0000000000..a8ac805308
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_tile.h
@@ -0,0 +1,101 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef P_TILE_H
+#define P_TILE_H
+
+#include "pipe/p_compiler.h"
+
+struct pipe_surface;
+
+
+/**
+ * Clip tile against surface dims.
+ * \return TRUE if tile is totally clipped, FALSE otherwise
+ */
+static INLINE boolean
+pipe_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_surface *ps)
+{
+   if (x >= ps->width)
+      return TRUE;
+   if (y >= ps->height)
+      return TRUE;
+   if (x + *w > ps->width)
+      *w = ps->width - x;
+   if (y + *h > ps->height)
+      *h = ps->height - y;
+   return FALSE;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void
+pipe_get_tile_raw(struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  void *p, int dst_stride);
+
+void
+pipe_put_tile_raw(struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  const void *p, int src_stride);
+
+
+void
+pipe_get_tile_rgba(struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   float *p);
+
+void
+pipe_put_tile_rgba(struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   const float *p);
+
+
+void
+pipe_get_tile_z(struct pipe_surface *ps,
+                uint x, uint y, uint w, uint h,
+                uint *z);
+
+void
+pipe_put_tile_z(struct pipe_surface *ps,
+                uint x, uint y, uint w, uint h,
+                const uint *z);
+
+void
+pipe_tile_raw_to_rgba(enum pipe_format format,
+                      void *src,
+                      uint w, uint h,
+                      float *dst, unsigned dst_stride);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index f430e88b9c..6bace0bb11 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -34,7 +34,6 @@
 #define CELL_COMMON_H
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index 3ffe09add6..cee0917b63 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -34,7 +34,7 @@
 #include <assert.h>
 #include <stdint.h>
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "cell/common.h"
 #include "cell_clear.h"
 #include "cell_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 12eb5aa254..5af95a3c10 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -35,7 +35,7 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_format.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_screen.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 67b87f16d7..971d65d09e 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -30,7 +30,7 @@
  *  Brian Paul
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "draw/draw_context.h"
 #include "cell_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index b663b37622..dd25ae880e 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -33,7 +33,7 @@
 #include "cell_context.h"
 #include "cell_render.h"
 #include "cell_spu.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_private.h"
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 2bf441a0c5..139b3719b6 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
index 5480534ad9..8ab938a02a 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_derived.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -25,7 +25,7 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vertex.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 9cae67f091..3646a0ee4f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,7 +25,7 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "cell_context.h"
 #include "cell_state.h"
 #include "cell_state_emit.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index f5707f2bb8..cd96b317fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
 #include "draw/draw_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
index 01ffa31c2c..2d31ad89a6 100644
--- a/src/gallium/drivers/cell/ppu/cell_surface.c
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
 #include "util/p_tile.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 533b64227d..1add81373d 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -33,7 +33,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 
 #include "cell_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.c b/src/gallium/drivers/cell/ppu/cell_winsys.c
index ebabce3c8f..d570bbd2f9 100644
--- a/src/gallium/drivers/cell/ppu/cell_winsys.c
+++ b/src/gallium/drivers/cell/ppu/cell_winsys.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "cell_winsys.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 42e5022f30..89c61136a4 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -63,7 +63,6 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index ab4ff8160a..8944ef171e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -32,7 +32,6 @@
 #include <transpose_matrix4x4.h>
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
-#include "pipe/p_util.h"
 #include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index 74ab2bbd1f..dbcf4b0eb9 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -1,4 +1,3 @@
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 //#include "tgsi_build.h"
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 219fd90cc0..26f2363749 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -32,7 +32,6 @@
   *   Ian Romanick <idr@us.ibm.com>
   */
 
-#include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 #include "spu_exec.h"
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index 3119a78c06..a1e81975e6 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -34,7 +34,6 @@
 
 #include <spu_mfcio.h>
 
-#include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 #include "spu_vertex_shader.h"
diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c
index 014a3e31d5..10c4ffc209 100644
--- a/src/gallium/drivers/failover/fo_context.c
+++ b/src/gallium/drivers/failover/fo_context.c
@@ -28,7 +28,7 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_context.h"
 
 #include "fo_context.h"
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index e2bf5ab678..c6776716a2 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -35,7 +35,7 @@
 #include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_screen.h"
 
 
diff --git a/src/gallium/drivers/i915simple/i915_debug_fp.c b/src/gallium/drivers/i915simple/i915_debug_fp.c
index c024a051a5..48be3e1472 100644
--- a/src/gallium/drivers/i915simple/i915_debug_fp.c
+++ b/src/gallium/drivers/i915simple/i915_debug_fp.c
@@ -29,7 +29,7 @@
 #include "i915_reg.h"
 #include "i915_debug.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 static void
diff --git a/src/gallium/drivers/i915simple/i915_fpc.h b/src/gallium/drivers/i915simple/i915_fpc.h
index 80a9576304..2f0f99d046 100644
--- a/src/gallium/drivers/i915simple/i915_fpc.h
+++ b/src/gallium/drivers/i915simple/i915_fpc.h
@@ -29,7 +29,6 @@
 #ifndef I915_FPC_H
 #define I915_FPC_H
 
-#include "pipe/p_util.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c
index 64432982c4..34b4a846c1 100644
--- a/src/gallium/drivers/i915simple/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915simple/i915_fpc_translate.c
@@ -33,6 +33,8 @@
 #include "i915_fpc.h"
 
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_string.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_dump.h"
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c
index 9ffa460138..d194c2fb15 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915simple/i915_prim_emit.c
@@ -27,7 +27,9 @@
 
 
 #include "draw/draw_pipe.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
 
 #include "i915_context.h"
 #include "i915_winsys.h"
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
index aef3682bbf..e4ece55098 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -41,9 +41,10 @@
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c
index 0afa17bed8..e9e40c3f0b 100644
--- a/src/gallium/drivers/i915simple/i915_screen.c
+++ b/src/gallium/drivers/i915simple/i915_screen.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "util/u_string.h"
 
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c
index e8521b385e..d2487d8277 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915simple/i915_state.c
@@ -31,8 +31,9 @@
 
 #include "draw/draw_context.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "tgsi/tgsi_parse.h"
 
 #include "i915_context.h"
diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c
index 4daccec6e0..488615067c 100644
--- a/src/gallium/drivers/i915simple/i915_state_derived.c
+++ b/src/gallium/drivers/i915simple/i915_state_derived.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vertex.h"
diff --git a/src/gallium/drivers/i915simple/i915_state_dynamic.c b/src/gallium/drivers/i915simple/i915_state_dynamic.c
index 8cfbdddd19..86126a5a15 100644
--- a/src/gallium/drivers/i915simple/i915_state_dynamic.c
+++ b/src/gallium/drivers/i915simple/i915_state_dynamic.c
@@ -30,7 +30,9 @@
 #include "i915_context.h"
 #include "i915_reg.h"
 #include "i915_state.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
 
 #define FILE_DEBUG_FLAG DEBUG_STATE
 
diff --git a/src/gallium/drivers/i915simple/i915_state_immediate.c b/src/gallium/drivers/i915simple/i915_state_immediate.c
index 2501f2d7cb..8c16bb4e27 100644
--- a/src/gallium/drivers/i915simple/i915_state_immediate.c
+++ b/src/gallium/drivers/i915simple/i915_state_immediate.c
@@ -33,7 +33,7 @@
 #include "i915_context.h"
 #include "i915_state.h"
 #include "i915_reg.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 /* All state expressable with the LOAD_STATE_IMMEDIATE_1 packet.
diff --git a/src/gallium/drivers/i915simple/i915_state_sampler.c b/src/gallium/drivers/i915simple/i915_state_sampler.c
index 7868f21ca6..c09c10601b 100644
--- a/src/gallium/drivers/i915simple/i915_state_sampler.c
+++ b/src/gallium/drivers/i915simple/i915_state_sampler.c
@@ -27,7 +27,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "i915_state_inlines.h"
 #include "i915_context.h"
diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915simple/i915_surface.c
index 17b5125e56..62f1926644 100644
--- a/src/gallium/drivers/i915simple/i915_surface.c
+++ b/src/gallium/drivers/i915simple/i915_surface.c
@@ -30,10 +30,9 @@
 #include "i915_state.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_rect.h"
 
 
diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c
index ca0fb8761b..32344da4d5 100644
--- a/src/gallium/drivers/i915simple/i915_texture.c
+++ b/src/gallium/drivers/i915simple/i915_texture.c
@@ -34,8 +34,9 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "i915_context.h"
 #include "i915_texture.h"
diff --git a/src/gallium/drivers/i965simple/brw_cc.c b/src/gallium/drivers/i965simple/brw_cc.c
index 337e4f95f6..79d4150383 100644
--- a/src/gallium/drivers/i965simple/brw_cc.c
+++ b/src/gallium/drivers/i965simple/brw_cc.c
@@ -29,7 +29,8 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
@@ -232,8 +233,7 @@ static void upload_cc_unit( struct brw_context *brw )
       cc.cc3.alpha_test_func = 
 	 brw_translate_compare_func(brw->attribs.DepthStencil->alpha.func);
 
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], 
-			       brw->attribs.DepthStencil->alpha.ref);
+      cc.cc7.alpha_ref.ub[0] = float_to_ubyte(brw->attribs.DepthStencil->alpha.ref);
 
       cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
    }
diff --git a/src/gallium/drivers/i965simple/brw_clip_state.c b/src/gallium/drivers/i965simple/brw_clip_state.c
index ea5c05a279..8e78dd51be 100644
--- a/src/gallium/drivers/i965simple/brw_clip_state.c
+++ b/src/gallium/drivers/i965simple/brw_clip_state.c
@@ -32,7 +32,8 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 
 static void upload_clip_unit( struct brw_context *brw )
diff --git a/src/gallium/drivers/i965simple/brw_context.c b/src/gallium/drivers/i965simple/brw_context.c
index 8326f7b9c4..96920df008 100644
--- a/src/gallium/drivers/i965simple/brw_context.c
+++ b/src/gallium/drivers/i965simple/brw_context.c
@@ -39,7 +39,7 @@
 
 #include "pipe/p_winsys.h"
 #include "pipe/p_context.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_screen.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_curbe.c b/src/gallium/drivers/i965simple/brw_curbe.c
index 52bbd525c1..824ee7fd6d 100644
--- a/src/gallium/drivers/i965simple/brw_curbe.c
+++ b/src/gallium/drivers/i965simple/brw_curbe.c
@@ -39,7 +39,8 @@
 #include "brw_wm.h"
 #include "pipe/p_state.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #define FILE_DEBUG_FLAG DEBUG_FALLBACKS
 
diff --git a/src/gallium/drivers/i965simple/brw_draw_upload.c b/src/gallium/drivers/i965simple/brw_draw_upload.c
index 9c0c78c236..7c20ea52af 100644
--- a/src/gallium/drivers/i965simple/brw_draw_upload.c
+++ b/src/gallium/drivers/i965simple/brw_draw_upload.c
@@ -33,6 +33,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 
+
 struct brw_array_state {
    union header_union header;
 
diff --git a/src/gallium/drivers/i965simple/brw_gs_state.c b/src/gallium/drivers/i965simple/brw_gs_state.c
index 3932e9e939..5b8016b2e9 100644
--- a/src/gallium/drivers/i965simple/brw_gs_state.c
+++ b/src/gallium/drivers/i965simple/brw_gs_state.c
@@ -34,7 +34,8 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c
index fadfbf94ab..ab7cd624b2 100644
--- a/src/gallium/drivers/i965simple/brw_screen.c
+++ b/src/gallium/drivers/i965simple/brw_screen.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "util/u_string.h"
 
diff --git a/src/gallium/drivers/i965simple/brw_sf_state.c b/src/gallium/drivers/i965simple/brw_sf_state.c
index 9acd3ea61b..2a5de61c21 100644
--- a/src/gallium/drivers/i965simple/brw_sf_state.c
+++ b/src/gallium/drivers/i965simple/brw_sf_state.c
@@ -30,11 +30,12 @@
   */
 
 
-
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 
 static void upload_sf_vp(struct brw_context *brw)
 {
diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c
index 30f37a99d4..86d877d7ef 100644
--- a/src/gallium/drivers/i965simple/brw_shader_info.c
+++ b/src/gallium/drivers/i965simple/brw_shader_info.c
@@ -1,7 +1,7 @@
 
 #include "brw_context.h"
 #include "brw_state.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 
diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c
index 27ca32843d..af46cb546f 100644
--- a/src/gallium/drivers/i965simple/brw_state.c
+++ b/src/gallium/drivers/i965simple/brw_state.c
@@ -31,7 +31,7 @@
 
 
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_dump.h"
diff --git a/src/gallium/drivers/i965simple/brw_state_batch.c b/src/gallium/drivers/i965simple/brw_state_batch.c
index 35db76b594..43a1c89fc4 100644
--- a/src/gallium/drivers/i965simple/brw_state_batch.c
+++ b/src/gallium/drivers/i965simple/brw_state_batch.c
@@ -32,7 +32,7 @@
 #include "brw_state.h"
 #include "brw_winsys.h"
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 /* A facility similar to the data caching code above, which aims to
  * prevent identical commands being issued repeatedly.
diff --git a/src/gallium/drivers/i965simple/brw_state_cache.c b/src/gallium/drivers/i965simple/brw_state_cache.c
index b3a5124461..094248fa69 100644
--- a/src/gallium/drivers/i965simple/brw_state_cache.c
+++ b/src/gallium/drivers/i965simple/brw_state_cache.c
@@ -38,7 +38,7 @@
 #include "brw_sf.h"
 #include "brw_gs.h"
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_state_pool.c b/src/gallium/drivers/i965simple/brw_state_pool.c
index f3174bfe0a..78d4c0e411 100644
--- a/src/gallium/drivers/i965simple/brw_state_pool.c
+++ b/src/gallium/drivers/i965simple/brw_state_pool.c
@@ -43,7 +43,8 @@
  */
 
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "brw_context.h"
 #include "brw_state.h"
diff --git a/src/gallium/drivers/i965simple/brw_state_upload.c b/src/gallium/drivers/i965simple/brw_state_upload.c
index e727601e1e..bac9161b5f 100644
--- a/src/gallium/drivers/i965simple/brw_state_upload.c
+++ b/src/gallium/drivers/i965simple/brw_state_upload.c
@@ -33,7 +33,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 /* This is used to initialize brw->state.atoms[].  We could use this
  * list directly except for a single atom, brw_constant_buffer, which
diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c
index 69da252285..b89756c47b 100644
--- a/src/gallium/drivers/i965simple/brw_surface.c
+++ b/src/gallium/drivers/i965simple/brw_surface.c
@@ -29,10 +29,9 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_rect.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c
index 9b6cf81723..05eda9d1f2 100644
--- a/src/gallium/drivers/i965simple/brw_tex_layout.c
+++ b/src/gallium/drivers/i965simple/brw_tex_layout.c
@@ -33,16 +33,16 @@
 /* Code to layout images in a mipmap tree for i965.
  */
 
-#include "brw_tex_layout.h"
-
 #include "pipe/p_state.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "brw_context.h"
+#include "brw_tex_layout.h"
+
 
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
 
diff --git a/src/gallium/drivers/i965simple/brw_vs_state.c b/src/gallium/drivers/i965simple/brw_vs_state.c
index c73469929c..1eaff87892 100644
--- a/src/gallium/drivers/i965simple/brw_vs_state.c
+++ b/src/gallium/drivers/i965simple/brw_vs_state.c
@@ -34,7 +34,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 static void upload_vs_unit( struct brw_context *brw )
 {
diff --git a/src/gallium/drivers/i965simple/brw_wm.c b/src/gallium/drivers/i965simple/brw_wm.c
index 7fc5f59a98..8de565b96c 100644
--- a/src/gallium/drivers/i965simple/brw_wm.c
+++ b/src/gallium/drivers/i965simple/brw_wm.c
@@ -35,7 +35,7 @@
 #include "brw_wm.h"
 #include "brw_eu.h"
 #include "brw_state.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c
index e6f1a44817..d50e66f613 100644
--- a/src/gallium/drivers/i965simple/brw_wm_decl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_decl.c
@@ -2,7 +2,8 @@
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 
diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c
index 6a4a5aef09..ab6410aa60 100644
--- a/src/gallium/drivers/i965simple/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c
@@ -2,7 +2,8 @@
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 
diff --git a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
index b9eaee56ee..52b2909a65 100644
--- a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
+++ b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
@@ -34,7 +34,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 
 #define COMPAREFUNC_ALWAYS		0
diff --git a/src/gallium/drivers/i965simple/brw_wm_state.c b/src/gallium/drivers/i965simple/brw_wm_state.c
index f3aa36b07f..37a9bf919c 100644
--- a/src/gallium/drivers/i965simple/brw_wm_state.c
+++ b/src/gallium/drivers/i965simple/brw_wm_state.c
@@ -34,7 +34,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_wm.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 /***********************************************************************
  * WM unit - fragment programs and rasterization
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 9b1313bc83..dda90f760a 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -32,8 +32,8 @@
 #include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 #include "sp_clear.h"
 #include "sp_context.h"
 #include "sp_flush.h"
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index cc171bbc39..d0456731be 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -34,7 +34,7 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_parse.h"
diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c
index 20226da78c..34adac5226 100644
--- a/src/gallium/drivers/softpipe/sp_fs_llvm.c
+++ b/src/gallium/drivers/softpipe/sp_fs_llvm.c
@@ -36,7 +36,7 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "tgsi/tgsi_sse2.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 8b7da7c747..35653a8e48 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -34,7 +34,7 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_sse2.h"
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c
index 941ab62e00..038ff04d4f 100644
--- a/src/gallium/drivers/softpipe/sp_prim_setup.c
+++ b/src/gallium/drivers/softpipe/sp_prim_setup.c
@@ -41,7 +41,7 @@
 #include "sp_prim_setup.h"
 #include "draw/draw_pipe.h"
 #include "draw/draw_vertex.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 /**
  * Triangle setup info (derived from draw_stage).
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index e9fae951e0..425e13cd28 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -43,6 +43,7 @@
 #include "sp_setup.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
+#include "util/u_memory.h"
 
 
 #define SP_MAX_VBUF_INDEXES 1024
diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
index 7a42b08ef5..7d3580fb4f 100644
--- a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
@@ -7,7 +7,7 @@
 #include "sp_headers.h"
 #include "sp_quad.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 static void
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index 74c6bff84a..a834accb86 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -31,7 +31,8 @@
  */
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
@@ -128,15 +129,15 @@ logicop_quad(struct quad_stage *qs, struct quad_header *quad)
 
       /* convert to ubyte */
       for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
-         UNCLAMPED_FLOAT_TO_UBYTE(dst[j][0], dest[j][0]); /* P0 */
-         UNCLAMPED_FLOAT_TO_UBYTE(dst[j][1], dest[j][1]); /* P1 */
-         UNCLAMPED_FLOAT_TO_UBYTE(dst[j][2], dest[j][2]); /* P2 */
-         UNCLAMPED_FLOAT_TO_UBYTE(dst[j][3], dest[j][3]); /* P3 */
-
-         UNCLAMPED_FLOAT_TO_UBYTE(src[j][0], quadColor[j][0]); /* P0 */
-         UNCLAMPED_FLOAT_TO_UBYTE(src[j][1], quadColor[j][1]); /* P1 */
-         UNCLAMPED_FLOAT_TO_UBYTE(src[j][2], quadColor[j][2]); /* P2 */
-         UNCLAMPED_FLOAT_TO_UBYTE(src[j][3], quadColor[j][3]); /* P3 */
+         dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
+         dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
+         dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
+         dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
+
+         src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
+         src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
+         src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
+         src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
       }
 
       switch (softpipe->blend->logicop_func) {
@@ -209,10 +210,10 @@ logicop_quad(struct quad_stage *qs, struct quad_header *quad)
       }
 
       for (j = 0; j < 4; j++) {
-         quadColor[j][0] = UBYTE_TO_FLOAT(res[j][0]);
-         quadColor[j][1] = UBYTE_TO_FLOAT(res[j][1]);
-         quadColor[j][2] = UBYTE_TO_FLOAT(res[j][2]);
-         quadColor[j][3] = UBYTE_TO_FLOAT(res[j][3]);
+         quadColor[j][0] = ubyte_to_float(res[j][0]);
+         quadColor[j][1] = ubyte_to_float(res[j][1]);
+         quadColor[j][2] = ubyte_to_float(res[j][2]);
+         quadColor[j][3] = ubyte_to_float(res[j][3]);
       }
    }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c
index b3db428ef1..92e9af09c1 100644
--- a/src/gallium/drivers/softpipe/sp_quad_bufloop.c
+++ b/src/gallium/drivers/softpipe/sp_quad_bufloop.c
@@ -1,5 +1,5 @@
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c
index 7fe080990b..f72f31db97 100644
--- a/src/gallium/drivers/softpipe/sp_quad_colormask.c
+++ b/src/gallium/drivers/softpipe/sp_quad_colormask.c
@@ -31,7 +31,8 @@
  */
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c
index dd5ebb2296..ad907ec25f 100644
--- a/src/gallium/drivers/softpipe/sp_quad_coverage.c
+++ b/src/gallium/drivers/softpipe/sp_quad_coverage.c
@@ -33,7 +33,7 @@
 
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_quad.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index 0c82692c6e..227cb2014e 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -30,7 +30,7 @@
  */
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
index 22ea99049f..5a66a86699 100644
--- a/src/gallium/drivers/softpipe/sp_quad_earlyz.c
+++ b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
@@ -30,7 +30,7 @@
  */
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_headers.h"
 #include "sp_quad.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 8c88c192f8..5499ba5361 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -35,7 +35,8 @@
  * all the enabled attributes run contiguously.
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_quad_occlusion.c
index 54254df1f1..db13e73ae3 100644
--- a/src/gallium/drivers/softpipe/sp_quad_occlusion.c
+++ b/src/gallium/drivers/softpipe/sp_quad_occlusion.c
@@ -33,7 +33,7 @@
 
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
index 40083138a4..b64646a449 100644
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ b/src/gallium/drivers/softpipe/sp_quad_output.c
@@ -25,7 +25,7 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c
index b4c7e942fa..ce9562e07c 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stencil.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stencil.c
@@ -10,7 +10,7 @@
 #include "sp_tile_cache.h"
 #include "sp_quad.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 /** Only 8-bit stencil supported */
diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c
index f1e9b80e09..a39ecc2e9d 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stipple.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c
@@ -7,7 +7,7 @@
 #include "sp_headers.h"
 #include "sp_quad.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 /**
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index adf9ccf64c..2106ee1d23 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -32,7 +32,7 @@
 #include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_query.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index f6b3d7ac24..9644dbd168 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index c8c55fa6e8..87336ab6e3 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -42,9 +42,9 @@
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vertex.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 
 
 #define DEBUG_VERTS 0
diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c
index 2d40d6bd8f..384fe559af 100644
--- a/src/gallium/drivers/softpipe/sp_state_blend.c
+++ b/src/gallium/drivers/softpipe/sp_state_blend.c
@@ -28,7 +28,7 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_state.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index f10a1fa471..6b6a4c3ff3 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -25,7 +25,8 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vertex.h"
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 76fe6bfef9..1be461b3a4 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -30,7 +30,7 @@
 #include "sp_fs.h"
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_shader_tokens.h"
diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
index 98e04352db..87b7219683 100644
--- a/src/gallium/drivers/softpipe/sp_state_rasterizer.c
+++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_state.h"
 #include "draw/draw_context.h"
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index 033288a0aa..99a28c0d7e 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -29,7 +29,7 @@
  *  Brian Paul
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 
 #include "draw/draw_context.h"
diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c
index bfbae234f1..389aceb27c 100644
--- a/src/gallium/drivers/softpipe/sp_surface.c
+++ b/src/gallium/drivers/softpipe/sp_surface.c
@@ -26,10 +26,9 @@
  **************************************************************************/
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_rect.h"
 #include "sp_context.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 58a95d13e1..49250ec084 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -39,9 +39,9 @@
 #include "sp_tile_cache.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "tgsi/tgsi_exec.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 
 
 /*
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index f775591352..3a737d6f72 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -33,8 +33,9 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "sp_context.h"
 #include "sp_state.h"
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 57c12ffe33..b50c984513 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -32,9 +32,9 @@
  *    Brian Paul
  */
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_memory.h"
+#include "util/u_tile.h"
 #include "sp_context.h"
 #include "sp_surface.h"
 #include "sp_texture.h"
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index f16359e8ad..1dd7719379 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_screen.h"
 
 #include "tr_dump.h"
diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c
index 1613a626df..48032c1617 100644
--- a/src/gallium/drivers/trace/tr_dump.c
+++ b/src/gallium/drivers/trace/tr_dump.c
@@ -45,6 +45,8 @@
 #endif
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "util/u_memory.h"
 #include "util/u_string.h"
 
 #include "tr_stream.h"
diff --git a/src/gallium/drivers/trace/tr_dump.h b/src/gallium/drivers/trace/tr_dump.h
index 6ddc8fc15c..76a53731b3 100644
--- a/src/gallium/drivers/trace/tr_dump.h
+++ b/src/gallium/drivers/trace/tr_dump.h
@@ -35,7 +35,6 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
 
 
 boolean trace_dump_trace_begin(void);
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index a6467ec35f..8789f86b1a 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "tr_dump.h"
 #include "tr_state.h"
diff --git a/src/gallium/drivers/trace/tr_state.c b/src/gallium/drivers/trace/tr_state.c
index 30ab5a8fdc..986d939e0c 100644
--- a/src/gallium/drivers/trace/tr_state.c
+++ b/src/gallium/drivers/trace/tr_state.c
@@ -27,6 +27,7 @@
 
 
 #include "pipe/p_compiler.h"
+#include "util/u_memory.h"
 #include "tgsi/tgsi_dump.h"
 
 #include "tr_dump.h"
diff --git a/src/gallium/drivers/trace/tr_stream_stdc.c b/src/gallium/drivers/trace/tr_stream_stdc.c
index 4c77e1c995..4c19ec0b24 100644
--- a/src/gallium/drivers/trace/tr_stream_stdc.c
+++ b/src/gallium/drivers/trace/tr_stream_stdc.c
@@ -36,7 +36,7 @@
 
 #include <stdio.h>
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "tr_stream.h"
 
diff --git a/src/gallium/drivers/trace/tr_stream_wd.c b/src/gallium/drivers/trace/tr_stream_wd.c
index b3b65f0971..704eb15bd7 100644
--- a/src/gallium/drivers/trace/tr_stream_wd.c
+++ b/src/gallium/drivers/trace/tr_stream_wd.c
@@ -37,7 +37,7 @@
 #include <windows.h>
 #include <winddi.h>
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_string.h"
 
 #include "tr_stream.h"
diff --git a/src/gallium/drivers/trace/tr_texture.c b/src/gallium/drivers/trace/tr_texture.c
index 99ba74d366..440a78704a 100644
--- a/src/gallium/drivers/trace/tr_texture.c
+++ b/src/gallium/drivers/trace/tr_texture.c
@@ -25,9 +25,9 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "util/u_hash_table.h"
+#include "util/u_memory.h"
 
 #include "tr_screen.h"
 #include "tr_texture.h"
diff --git a/src/gallium/drivers/trace/tr_winsys.c b/src/gallium/drivers/trace/tr_winsys.c
index 2c7a6f893b..177835854e 100644
--- a/src/gallium/drivers/trace/tr_winsys.c
+++ b/src/gallium/drivers/trace/tr_winsys.c
@@ -25,8 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
-#include "pipe/p_state.h"
+#include "util/u_memory.h"
 #include "util/u_hash_table.h"
 
 #include "tr_dump.h"
diff --git a/src/gallium/include/pipe/p_util.h b/src/gallium/include/pipe/p_util.h
deleted file mode 100644
index 4a3fca5962..0000000000
--- a/src/gallium/include/pipe/p_util.h
+++ /dev/null
@@ -1,460 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef P_UTIL_H
-#define P_UTIL_H
-
-#include "p_config.h"
-#include "p_compiler.h"
-#include "p_debug.h"
-#include "p_pointer.h"
-
-#if defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-__inline double ceil(double val)
-{
-	double ceil_val;
-
-	if((val - (long) val) == 0) {
-		ceil_val = val;
-	} else {
-		if(val > 0) {
-			ceil_val = (long) val + 1;
-		} else {
-			ceil_val = (long) val;
-		}
-	}
-
-	return ceil_val;
-}
-
-#ifndef PIPE_SUBSYSTEM_WINDOWS_CE
-__inline double floor(double val)
-{
-	double floor_val;
-
-	if((val - (long) val) == 0) {
-		floor_val = val;
-	} else {
-		if(val > 0) {
-			floor_val = (long) val;
-		} else {
-			floor_val = (long) val - 1;
-		}
-	}
-
-	return floor_val;
-}
-#endif
-
-#pragma function(pow)
-__inline double __cdecl pow(double val, double exponent)
-{
-	/* XXX */
-	assert(0);
-	return 0;
-}
-
-#pragma function(log)
-__inline double __cdecl log(double val)
-{
-	/* XXX */
-	assert(0);
-	return 0;
-}
-
-#pragma function(atan2)
-__inline double __cdecl atan2(double val)
-{
-	/* XXX */
-	assert(0);
-	return 0;
-}
-#else
-#include <math.h>
-#include <stdarg.h>
-#endif
-
- /* Define ENOMEM for WINCE */ 
-#if (_WIN32_WCE < 600)
-#ifndef ENOMEM
-#define ENOMEM 12
-#endif
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && defined(DEBUG) 
-
-/* memory debugging */
-
-#include "p_debug.h"
-
-#define MALLOC( _size ) \
-   debug_malloc( __FILE__, __LINE__, __FUNCTION__, _size )
-#define CALLOC( _count, _size ) \
-   debug_calloc(__FILE__, __LINE__, __FUNCTION__, _count, _size )
-#define FREE( _ptr ) \
-   debug_free( __FILE__, __LINE__, __FUNCTION__,  _ptr )
-#define REALLOC( _ptr, _old_size, _size ) \
-   debug_realloc( __FILE__, __LINE__, __FUNCTION__,  _ptr, _old_size, _size )
-
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-
-void * __stdcall
-EngAllocMem(
-    unsigned long Flags,
-    unsigned long MemSize,
-    unsigned long Tag );
-
-void __stdcall
-EngFreeMem(
-    void *Mem );
-
-#define MALLOC( _size ) EngAllocMem( 0, _size, 'D3AG' )
-#define _FREE( _ptr ) EngFreeMem( _ptr )
-
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-
-void *
-ExAllocatePool(
-    unsigned long PoolType, 
-    size_t NumberOfBytes);
-
-void 
-ExFreePool(void *P);
-
-#define MALLOC(_size) ExAllocatePool(0, _size)
-#define _FREE(_ptr) ExFreePool(_ptr)
-
-#else
-
-#define MALLOC( SIZE )  malloc( SIZE )
-#define CALLOC( COUNT, SIZE )   calloc( COUNT, SIZE )
-#define FREE( PTR )  free( PTR )
-#define REALLOC( OLDPTR, OLDSIZE, NEWSIZE )  realloc( OLDPTR, NEWSIZE )
-
-#endif
-
-
-#ifndef CALLOC
-static INLINE void *
-CALLOC( unsigned count, unsigned size )
-{
-   void *ptr = MALLOC( count * size );
-   if( ptr ) {
-      memset( ptr, 0, count * size );
-   }
-   return ptr;
-}
-#endif /* !CALLOC */
-
-#ifndef FREE
-static INLINE void
-FREE( void *ptr )
-{
-   if( ptr ) {
-      _FREE( ptr );
-   }
-}
-#endif /* !FREE */
-
-#ifndef REALLOC
-static INLINE void *
-REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
-{
-   void *new_ptr = NULL;
-
-   if (new_size != 0) {
-      unsigned copy_size = old_size < new_size ? old_size : new_size;
-      new_ptr = MALLOC( new_size );
-      if (new_ptr && old_ptr && copy_size) {
-         memcpy( new_ptr, old_ptr, copy_size );
-      }
-   }
-
-   FREE( old_ptr );
-   return new_ptr;
-}
-#endif /* !REALLOC */
-
-
-#define MALLOC_STRUCT(T)   (struct T *) MALLOC(sizeof(struct T))
-
-#define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
-
-
-/**
- * Return memory on given byte alignment
- */
-static INLINE void *
-align_malloc(size_t bytes, uint alignment)
-{
-#if defined(HAVE_POSIX_MEMALIGN)
-   void *mem;
-   alignment = (alignment + (uint)sizeof(void*) - 1) & ~((uint)sizeof(void*) - 1);
-   if(posix_memalign(& mem, alignment, bytes) != 0)
-      return NULL;
-   return mem;
-#else
-   char *ptr, *buf;
-
-   assert( alignment > 0 );
-
-   ptr = (char *) MALLOC(bytes + alignment + sizeof(void *));
-   if (!ptr)
-      return NULL;
-
-   buf = (char *) align_pointer( ptr + sizeof(void *), alignment );
-   *(char **)(buf - sizeof(void *)) = ptr;
-
-   return buf;
-#endif /* defined(HAVE_POSIX_MEMALIGN) */
-}
-
-/**
- * Free memory returned by align_malloc().
- */
-static INLINE void
-align_free(void *ptr)
-{
-#if defined(HAVE_POSIX_MEMALIGN)
-   FREE(ptr);
-#else
-   void **cubbyHole = (void **) ((char *) ptr - sizeof(void *));
-   void *realAddr = *cubbyHole;
-   FREE(realAddr);
-#endif /* defined(HAVE_POSIX_MEMALIGN) */
-}
-
-
-
-/**
- * Duplicate a block of memory.
- */
-static INLINE void *
-mem_dup(const void *src, uint size)
-{
-   void *dup = MALLOC(size);
-   if (dup)
-      memcpy(dup, src, size);
-   return dup;
-}
-
-
-
-#define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )
-#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
-#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
-
-#ifndef Elements
-#define Elements(x) (sizeof(x)/sizeof((x)[0]))
-#endif
-#define Offset(TYPE, MEMBER) ((unsigned)&(((TYPE *)NULL)->MEMBER))
-
-/**
- * Return a pointer aligned to next multiple of 16 bytes.
- */
-static INLINE void *
-align16( void *unaligned )
-{
-   return align_pointer( unaligned, 16 );
-}
-
-
-static INLINE int align(int value, int alignment)
-{
-   return (value + alignment - 1) & ~(alignment - 1);
-}
-
-
-
-
-#if defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
-static INLINE unsigned ffs( unsigned u )
-{
-   unsigned i;
-
-   if( u == 0 ) {
-      return 0;
-   }
-
-   __asm bsf eax, [u]
-   __asm inc eax
-   __asm mov [i], eax
-
-   return i;
-}
-#endif
-
-union fi {
-   float f;
-   int i;
-   unsigned ui;
-};
-
-#define UBYTE_TO_FLOAT( ub ) ((float)(ub) / 255.0F)
-
-#define IEEE_0996 0x3f7f0000	/* 0.996 or so */
-
-/* This function/macro is sensitive to precision.  Test very carefully
- * if you change it!
- */
-#define UNCLAMPED_FLOAT_TO_UBYTE(UB, F)					\
-        do {								\
-           union fi __tmp;						\
-           __tmp.f = (F);						\
-           if (__tmp.i < 0)						\
-              UB = (ubyte) 0;						\
-           else if (__tmp.i >= IEEE_0996)				\
-              UB = (ubyte) 255;					\
-           else {							\
-              __tmp.f = __tmp.f * (255.0f/256.0f) + 32768.0f;		\
-              UB = (ubyte) __tmp.i;					\
-           }								\
-        } while (0)
-
-
-
-static INLINE unsigned pack_ub4( unsigned char b0,
-				 unsigned char b1,
-				 unsigned char b2,
-				 unsigned char b3 )
-{
-   return ((((unsigned int)b0) << 0) |
-	   (((unsigned int)b1) << 8) |
-	   (((unsigned int)b2) << 16) |
-	   (((unsigned int)b3) << 24));
-}
-
-static INLINE unsigned fui( float f )
-{
-   union fi fi;
-   fi.f = f;
-   return fi.ui;
-}
-
-static INLINE unsigned char float_to_ubyte( float f )
-{
-   unsigned char ub;
-   UNCLAMPED_FLOAT_TO_UBYTE(ub, f);
-   return ub;
-}
-
-static INLINE unsigned pack_ui32_float4( float a,
-					 float b, 
-					 float c, 
-					 float d )
-{
-   return pack_ub4( float_to_ubyte(a),
-		    float_to_ubyte(b),
-		    float_to_ubyte(c),
-		    float_to_ubyte(d) );
-}
-
-#define COPY_4V( DST, SRC )         \
-do {                                \
-   (DST)[0] = (SRC)[0];             \
-   (DST)[1] = (SRC)[1];             \
-   (DST)[2] = (SRC)[2];             \
-   (DST)[3] = (SRC)[3];             \
-} while (0)
-
-
-#define COPY_4FV( DST, SRC )  COPY_4V(DST, SRC)
-
-
-#define ASSIGN_4V( DST, V0, V1, V2, V3 ) \
-do {                                     \
-   (DST)[0] = (V0);                      \
-   (DST)[1] = (V1);                      \
-   (DST)[2] = (V2);                      \
-   (DST)[3] = (V3);                      \
-} while (0)
-
-
-
-#if defined(_MSC_VER) 
-#if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
- 
-static INLINE float cosf( float f ) 
-{
-   return (float) cos( (double) f );
-}
-
-static INLINE float sinf( float f ) 
-{
-   return (float) sin( (double) f );
-}
-
-static INLINE float ceilf( float f ) 
-{
-   return (float) ceil( (double) f );
-}
-
-static INLINE float floorf( float f ) 
-{
-   return (float) floor( (double) f );
-}
-
-static INLINE float powf( float f, float g ) 
-{
-   return (float) pow( (double) f, (double) g );
-}
-
-static INLINE float sqrtf( float f ) 
-{
-   return (float) sqrt( (double) f );
-}
-
-static INLINE float fabsf( float f ) 
-{
-   return (float) fabs( (double) f );
-}
-
-static INLINE float logf( float f ) 
-{
-   return (float) log( (double) f );
-}
-
-#else
-/* Work-around an extra semi-colon in VS 2005 logf definition */
-#ifdef logf
-#undef logf
-#define logf(x) ((float)log((double)(x)))
-#endif /* logf */
-#endif
-#endif /* _MSC_VER */
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/gallium/state_trackers/python/gallium.i b/src/gallium/state_trackers/python/gallium.i
index 641b19e940..a67372c623 100644
--- a/src/gallium/state_trackers/python/gallium.i
+++ b/src/gallium/state_trackers/python/gallium.i
@@ -42,7 +42,7 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h" 
 #include "cso_cache/cso_context.h"
 #include "util/u_draw_quad.h" 
diff --git a/src/gallium/state_trackers/python/st_device.c b/src/gallium/state_trackers/python/st_device.c
index a1889539dc..f71d85dd9b 100644
--- a/src/gallium/state_trackers/python/st_device.c
+++ b/src/gallium/state_trackers/python/st_device.c
@@ -26,12 +26,13 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 #include "pipe/p_inlines.h"
 #include "cso_cache/cso_context.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 #include "trace/tr_screen.h"
 #include "trace/tr_context.h"
diff --git a/src/gallium/state_trackers/python/st_sample.c b/src/gallium/state_trackers/python/st_sample.c
index b47c7be293..7765df3c4a 100644
--- a/src/gallium/state_trackers/python/st_sample.c
+++ b/src/gallium/state_trackers/python/st_sample.c
@@ -29,9 +29,10 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "st_sample.h"
 
diff --git a/src/gallium/state_trackers/python/st_softpipe_winsys.c b/src/gallium/state_trackers/python/st_softpipe_winsys.c
index 6ea3c9a5cf..2d4f5434b3 100644
--- a/src/gallium/state_trackers/python/st_softpipe_winsys.c
+++ b/src/gallium/state_trackers/python/st_softpipe_winsys.c
@@ -39,8 +39,9 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_format.h"
 #include "pipe/p_context.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 #include "st_winsys.h"
 
diff --git a/src/gallium/winsys/drm/intel/common/intel_be_device.c b/src/gallium/winsys/drm/intel/common/intel_be_device.c
index 8db0329615..019ee5cbd2 100644
--- a/src/gallium/winsys/drm/intel/common/intel_be_device.c
+++ b/src/gallium/winsys/drm/intel/common/intel_be_device.c
@@ -13,8 +13,8 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_memory.h"
 
 #include "i915simple/i915_screen.h"
 
diff --git a/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c
index 0d98d16cf1..20920a2052 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c
@@ -32,8 +32,8 @@
 #include "intel_context.h"
 #include "intel_winsys_softpipe.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_format.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 
 
diff --git a/src/gallium/winsys/egl_xlib/egl_xlib.c b/src/gallium/winsys/egl_xlib/egl_xlib.c
index 829732eea8..e9f821d276 100644
--- a/src/gallium/winsys/egl_xlib/egl_xlib.c
+++ b/src/gallium/winsys/egl_xlib/egl_xlib.c
@@ -38,8 +38,8 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 
 #include "eglconfig.h"
diff --git a/src/gallium/winsys/egl_xlib/sw_winsys.c b/src/gallium/winsys/egl_xlib/sw_winsys.c
index f4199e6f89..ae81d7f801 100644
--- a/src/gallium/winsys/egl_xlib/sw_winsys.c
+++ b/src/gallium/winsys/egl_xlib/sw_winsys.c
@@ -37,8 +37,9 @@
 
 #include "pipe/p_winsys.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "sw_winsys.h"
 
diff --git a/src/gallium/winsys/gdi/wmesa.c b/src/gallium/winsys/gdi/wmesa.c
index ff52ceb8c4..730fb1b541 100644
--- a/src/gallium/winsys/gdi/wmesa.c
+++ b/src/gallium/winsys/gdi/wmesa.c
@@ -12,8 +12,8 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_format.h"
 #include "pipe/p_context.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 #include "glapi/glapi.h"
 #include "colors.h"
diff --git a/src/gallium/winsys/xlib/brw_aub.c b/src/gallium/winsys/xlib/brw_aub.c
index 6e814ce5d1..f319802962 100644
--- a/src/gallium/winsys/xlib/brw_aub.c
+++ b/src/gallium/winsys/xlib/brw_aub.c
@@ -34,7 +34,6 @@
 #include "brw_aub.h"
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_debug.h"
 
 
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 4b4dc56e84..68ead7f528 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -42,8 +42,9 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_format.h"
 #include "pipe/p_context.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 
 #ifdef GALLIUM_CELL
diff --git a/src/gallium/winsys/xlib/xm_winsys_aub.c b/src/gallium/winsys/xlib/xm_winsys_aub.c
index 7fc9debdd5..3439367636 100644
--- a/src/gallium/winsys/xlib/xm_winsys_aub.c
+++ b/src/gallium/winsys/xlib/xm_winsys_aub.c
@@ -37,7 +37,7 @@
 #include "xmesaP.h"
 
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "i965simple/brw_winsys.h"
 #include "i965simple/brw_screen.h"
diff --git a/src/mesa/state_tracker/acc2.c b/src/mesa/state_tracker/acc2.c
new file mode 100644
index 0000000000..fa5de2b764
--- /dev/null
+++ b/src/mesa/state_tracker/acc2.c
@@ -0,0 +1,319 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Brian Paul
+  */
+
+#include "main/imports.h"
+#include "main/image.h"
+#include "main/macros.h"
+
+#include "st_context.h"
+#include "st_cb_accum.h"
+#include "st_cb_fbo.h"
+#include "st_draw.h"
+#include "st_format.h"
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "util/p_tile.h"
+
+
+#define UNCLAMPED_FLOAT_TO_SHORT(us, f)  \
+   us = ( (short) ( CLAMP((f), -1.0, 1.0) * 32767.0F) )
+
+
+/**
+ * For hardware that supports deep color buffers, we could accelerate
+ * most/all the accum operations with blending/texturing.
+ * For now, just use the get/put_tile() functions and do things in software.
+ */
+
+
+static void
+acc_get_tile_rgba(struct pipe_context *pipe, struct pipe_surface *acc_ps,
+                  uint x, uint y, uint w, uint h, float *p)
+{
+   const enum pipe_format f = acc_ps->format;
+   const int cpp = acc_ps->cpp;
+
+   acc_ps->format = PIPE_FORMAT_R16G16B16A16_SNORM;
+   acc_ps->cpp = 8;
+
+   pipe_get_tile_rgba(pipe, acc_ps, x, y, w, h, p);
+
+   acc_ps->format = f;
+   acc_ps->cpp = cpp;
+}
+
+
+static void
+acc_put_tile_rgba(struct pipe_context *pipe, struct pipe_surface *acc_ps,
+                  uint x, uint y, uint w, uint h, const float *p)
+{
+   enum pipe_format f = acc_ps->format;
+   const int cpp = acc_ps->cpp;
+
+   acc_ps->format = PIPE_FORMAT_R16G16B16A16_SNORM;
+   acc_ps->cpp = 8;
+
+   pipe_put_tile_rgba(pipe, acc_ps, x, y, w, h, p);
+
+   acc_ps->format = f;
+   acc_ps->cpp = cpp;
+}
+
+
+
+void
+st_clear_accum_buffer(GLcontext *ctx, struct gl_renderbuffer *rb)
+{
+   struct pipe_context *pipe = ctx->st->pipe;
+   struct st_renderbuffer *acc_strb = st_renderbuffer(rb);
+   struct pipe_surface *acc_ps = acc_strb->surface;
+   const GLint xpos = ctx->DrawBuffer->_Xmin;
+   const GLint ypos = ctx->DrawBuffer->_Ymin;
+   const GLint width = ctx->DrawBuffer->_Xmax - xpos;
+   const GLint height = ctx->DrawBuffer->_Ymax - ypos;
+   const GLfloat r = ctx->Accum.ClearColor[0];
+   const GLfloat g = ctx->Accum.ClearColor[1];
+   const GLfloat b = ctx->Accum.ClearColor[2];
+   const GLfloat a = ctx->Accum.ClearColor[3];
+   GLfloat *accBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+   int i;
+
+#if 1
+   GLvoid *map;
+
+   map = pipe_surface_map(acc_ps);
+   switch (acc_strb->format) {
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      {
+         GLshort r = FLOAT_TO_SHORT(ctx->Accum.ClearColor[0]);
+         GLshort g = FLOAT_TO_SHORT(ctx->Accum.ClearColor[1]);
+         GLshort b = FLOAT_TO_SHORT(ctx->Accum.ClearColor[2]);
+         GLshort a = FLOAT_TO_SHORT(ctx->Accum.ClearColor[3]);
+         int i, j;
+         for (i = 0; i < height; i++) {
+            GLshort *dst = ((GLshort *) map
+                            + ((ypos + i) * acc_ps->pitch + xpos) * 4);
+            for (j = 0; j < width; j++) {
+               dst[0] = r;
+               dst[1] = g;
+               dst[2] = b;
+               dst[3] = a;
+               dst += 4;
+            }
+         }
+      }
+      break;
+   default:
+      _mesa_problem(ctx, "unexpected format in st_clear_accum_buffer()");
+   }
+
+   pipe_surface_unmap(acc_ps);
+
+#else
+   for (i = 0; i < width * height; i++) {
+      accBuf[i*4+0] = r;
+      accBuf[i*4+1] = g;
+      accBuf[i*4+2] = b;
+      accBuf[i*4+3] = a;
+   }
+
+   acc_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf);
+#endif
+}
+
+
+/** For ADD/MULT */
+static void
+accum_mad(struct pipe_context *pipe, GLfloat scale, GLfloat bias,
+          GLint xpos, GLint ypos, GLint width, GLint height,
+          struct pipe_surface *acc_ps)
+{
+   GLfloat *accBuf;
+   GLint i;
+
+   accBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+
+   pipe_get_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf);
+
+   for (i = 0; i < 4 * width * height; i++) {
+      accBuf[i] = accBuf[i] * scale + bias;
+   }
+
+   pipe_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf);
+
+   free(accBuf);
+}
+
+
+static void
+accum_accum(struct pipe_context *pipe, GLfloat value,
+            GLint xpos, GLint ypos, GLint width, GLint height,
+            struct pipe_surface *acc_ps,
+            struct pipe_surface *color_ps)
+{
+   GLfloat *colorBuf, *accBuf;
+   GLint i;
+
+   colorBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+   accBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+
+   pipe_get_tile_rgba(pipe, color_ps, xpos, ypos, width, height, colorBuf);
+   acc_get_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf);
+
+   for (i = 0; i < 4 * width * height; i++) {
+      accBuf[i] = accBuf[i] + colorBuf[i] * value;
+   }
+
+   acc_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf);
+
+   free(colorBuf);
+   free(accBuf);
+}
+
+
+static void
+accum_load(struct pipe_context *pipe, GLfloat value,
+           GLint xpos, GLint ypos, GLint width, GLint height,
+           struct pipe_surface *acc_ps,
+           struct pipe_surface *color_ps)
+{
+   GLfloat *buf;
+   GLint i;
+
+   buf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+
+   pipe_get_tile_rgba(pipe, color_ps, xpos, ypos, width, height, buf);
+
+   for (i = 0; i < 4 * width * height; i++) {
+      buf[i] = buf[i] * value;
+   }
+
+   acc_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, buf);
+
+   free(buf);
+}
+
+
+static void
+accum_return(GLcontext *ctx, GLfloat value,
+             GLint xpos, GLint ypos, GLint width, GLint height,
+             struct pipe_surface *acc_ps,
+             struct pipe_surface *color_ps)
+{
+   struct pipe_context *pipe = ctx->st->pipe;
+   const GLubyte *colormask = ctx->Color.ColorMask;
+   GLfloat *abuf, *cbuf = NULL;
+   GLint i, ch;
+
+   abuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+
+   acc_get_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, abuf);
+
+   if (!colormask[0] || !colormask[1] || !colormask[2] || !colormask[3]) {
+      cbuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+      pipe_get_tile_rgba(pipe, color_ps, xpos, ypos, width, height, cbuf);
+   }
+
+   for (i = 0; i < width * height; i++) {
+      for (ch = 0; ch < 4; ch++) {
+         if (colormask[ch]) {
+            GLfloat val = abuf[i * 4 + ch] * value;
+            abuf[i * 4 + ch] = CLAMP(val, 0.0, 1.0);
+         }
+         else {
+            abuf[i * 4 + ch] = cbuf[i * 4 + ch];
+         }
+      }
+   }
+
+   pipe_put_tile_rgba(pipe, color_ps, xpos, ypos, width, height, abuf);
+
+   free(abuf);
+   if (cbuf)
+      free(cbuf);
+}
+
+
+static void
+st_Accum(GLcontext *ctx, GLenum op, GLfloat value)
+{
+   struct st_context *st = ctx->st;
+   struct pipe_context *pipe = st->pipe;
+   struct st_renderbuffer *acc_strb
+     = st_renderbuffer(ctx->DrawBuffer->Attachment[BUFFER_ACCUM].Renderbuffer);
+   struct st_renderbuffer *color_strb
+      = st_renderbuffer(ctx->ReadBuffer->_ColorReadBuffer);
+   struct pipe_surface *acc_ps = acc_strb->surface;
+   struct pipe_surface *color_ps = color_strb->surface;
+
+   const GLint xpos = ctx->DrawBuffer->_Xmin;
+   const GLint ypos = ctx->DrawBuffer->_Ymin;
+   const GLint width = ctx->DrawBuffer->_Xmax - xpos;
+   const GLint height = ctx->DrawBuffer->_Ymax - ypos;
+
+   /* make sure color bufs aren't cached */
+   pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, NULL);
+
+   switch (op) {
+   case GL_ADD:
+      if (value != 0.0F) {
+         accum_mad(pipe, 1.0, value, xpos, ypos, width, height, acc_ps);
+      }
+      break;
+   case GL_MULT:
+      if (value != 1.0F) {
+         accum_mad(pipe, value, 0.0, xpos, ypos, width, height, acc_ps);
+      }
+      break;
+   case GL_ACCUM:
+      if (value != 0.0F) {
+         accum_accum(pipe, value, xpos, ypos, width, height, acc_ps, color_ps);
+      }
+      break;
+   case GL_LOAD:
+      accum_load(pipe, value, xpos, ypos, width, height, acc_ps, color_ps);
+      break;
+   case GL_RETURN:
+      accum_return(ctx, value, xpos, ypos, width, height, acc_ps, color_ps);
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+
+void st_init_accum_functions(struct dd_function_table *functions)
+{
+   functions->Accum = st_Accum;
+}
diff --git a/src/mesa/state_tracker/st_cb_accum.c b/src/mesa/state_tracker/st_cb_accum.c
index a992e08ff6..cf3a99e7e9 100644
--- a/src/mesa/state_tracker/st_cb_accum.c
+++ b/src/mesa/state_tracker/st_cb_accum.c
@@ -42,7 +42,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 
 
 #define UNCLAMPED_FLOAT_TO_SHORT(us, f)  \
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index d5696a909f..a0c305d66f 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -50,7 +50,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_draw_quad.h"
 #include "util/u_simple_shaders.h"
 #include "shader/prog_instruction.h"
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 0c5e21d4ff..4ec7c752df 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -55,7 +55,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_draw_quad.h"
 #include "shader/prog_instruction.h"
 #include "cso_cache/cso_context.h"
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 39f5856f94..c801532788 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -41,7 +41,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "st_context.h"
 #include "st_cb_bitmap.h"
 #include "st_cb_readpixels.h"
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 6177ac63f0..16bbf3d80f 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -51,7 +51,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_blit.h"
 
 
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 325d95e865..936a6e32ea 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -55,7 +55,7 @@
 #define TGSI_DEBUG 0
 
 
-/** XXX we should use the version of this from p_util.h but including
+/** XXX we should use the version of this from u_memory.h but including
  * that header causes symbol collisions.
  */
 static INLINE void *
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 63046a0ecc..73cebff33f 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -36,7 +36,6 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "util/u_rect.h"
 
-- 
cgit v1.2.3


From 6ba9fb9b6693904054ad4e1506ba42e324334b0a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 25 Aug 2008 11:31:59 -0600
Subject: cell: asst fixes to get driver building/running again.

Note that SPU vertex transformation is disabled at this time.
---
 src/gallium/drivers/cell/ppu/cell_clear.c         |  4 +-
 src/gallium/drivers/cell/ppu/cell_context.c       |  4 ++
 src/gallium/drivers/cell/ppu/cell_draw_arrays.c   | 35 ++++++++++--
 src/gallium/drivers/cell/ppu/cell_draw_arrays.h   | 24 ++++++--
 src/gallium/drivers/cell/ppu/cell_pipe_state.c    |  6 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c    | 14 ++---
 src/gallium/drivers/cell/ppu/cell_state_shader.c  |  4 +-
 src/gallium/drivers/cell/ppu/cell_surface.c       | 11 ++--
 src/gallium/drivers/cell/ppu/cell_texture.c       | 69 ++++++++++++++++++++---
 src/gallium/drivers/cell/ppu/cell_texture.h       |  1 +
 src/gallium/drivers/cell/ppu/cell_vbuf.c          |  1 +
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c  |  4 ++
 src/gallium/drivers/cell/ppu/cell_vertex_shader.c |  5 ++
 src/gallium/drivers/cell/spu/spu_exec.h           |  2 +-
 src/gallium/drivers/cell/spu/spu_render.c         |  2 +-
 src/gallium/drivers/cell/spu/spu_tri.c            |  1 +
 src/gallium/drivers/cell/spu/spu_util.c           |  2 +
 src/gallium/drivers/cell/spu/spu_vertex_shader.c  | 26 ++++++++-
 18 files changed, 176 insertions(+), 39 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index cee0917b63..a421c95c8e 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -48,6 +48,7 @@ void
 cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
                    unsigned clearValue)
 {
+   struct pipe_screen *screen = pipe->screen;
    struct cell_context *cell = cell_context(pipe);
    uint surfIndex;
 
@@ -56,7 +57,8 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
 
 
    if (!cell->cbuf_map[0])
-      cell->cbuf_map[0] = pipe_surface_map(ps);
+      cell->cbuf_map[0] = screen->surface_map(screen, ps,
+                                              PIPE_BUFFER_USAGE_GPU_WRITE);
 
    if (ps == cell->framebuffer.zsbuf) {
       surfIndex = 1;
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 5af95a3c10..9ff4e86943 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -73,11 +73,13 @@ cell_draw_create(struct cell_context *cell)
 {
    struct draw_context *draw = draw_create();
 
+#if 0 /* broken */
    if (getenv("GALLIUM_CELL_VS")) {
       /* plug in SPU-based vertex transformation code */
       draw->shader_queue_flush = cell_vertex_shader_queue_flush;
       draw->driver_private = cell;
    }
+#endif
 
    return draw;
 }
@@ -108,6 +110,8 @@ cell_create_context(struct pipe_screen *screen,
 
    cell->pipe.draw_arrays = cell_draw_arrays;
    cell->pipe.draw_elements = cell_draw_elements;
+   cell->pipe.draw_range_elements = cell_draw_range_elements;
+   cell->pipe.set_edgeflags = cell_set_edgeflags;
 
    cell->pipe.clear = cell_clear_surface;
    cell->pipe.flush = cell_flush;
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 6e08cf6fe8..f02dffe124 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -59,7 +59,8 @@ cell_map_constant_buffers(struct cell_context *sp)
    }
 
    draw_set_mapped_constant_buffer(sp->draw,
-                                   sp->mapped_constants[PIPE_SHADER_VERTEX]);
+                                   sp->mapped_constants[PIPE_SHADER_VERTEX],
+                                   sp->constants[PIPE_SHADER_VERTEX].size);
 }
 
 static void
@@ -92,10 +93,12 @@ cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
  * XXX should the element buffer be specified/bound with a separate function?
  */
 boolean
-cell_draw_elements(struct pipe_context *pipe,
-                       struct pipe_buffer *indexBuffer,
-                       unsigned indexSize,
-                       unsigned mode, unsigned start, unsigned count)
+cell_draw_range_elements(struct pipe_context *pipe,
+                         struct pipe_buffer *indexBuffer,
+                         unsigned indexSize,
+                         unsigned min_index,
+                         unsigned max_index,
+                         unsigned mode, unsigned start, unsigned count)
 {
    struct cell_context *sp = cell_context(pipe);
    struct draw_context *draw = sp->draw;
@@ -152,3 +155,25 @@ cell_draw_elements(struct pipe_context *pipe,
 
    return TRUE;
 }
+
+
+boolean
+cell_draw_elements(struct pipe_context *pipe,
+                   struct pipe_buffer *indexBuffer,
+                   unsigned indexSize,
+                   unsigned mode, unsigned start, unsigned count)
+{
+   return cell_draw_range_elements( pipe, indexBuffer,
+                                    indexSize,
+                                    0, 0xffffffff,
+                                    mode, start, count );
+}
+
+
+
+void
+cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags)
+{
+   struct cell_context *cell = cell_context(pipe);
+   draw_set_edgeflags(cell->draw, edgeflags);
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
index d5df4aa05f..cd35ec17b4 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
@@ -29,14 +29,26 @@
 #define CELL_DRAW_ARRAYS_H
 
 
-boolean cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
-                         unsigned start, unsigned count);
+extern boolean
+cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                 unsigned start, unsigned count);
 
-boolean cell_draw_elements(struct pipe_context *pipe,
-                           struct pipe_buffer *indexBuffer,
-                           unsigned indexSize,
-                           unsigned mode, unsigned start, unsigned count);
+extern boolean
+cell_draw_elements(struct pipe_context *pipe,
+                   struct pipe_buffer *indexBuffer,
+                   unsigned indexSize,
+                   unsigned mode, unsigned start, unsigned count);
 
+extern boolean
+cell_draw_range_elements(struct pipe_context *pipe,
+                         struct pipe_buffer *indexBuffer,
+                         unsigned indexSize,
+                         unsigned min_index,
+                         unsigned max_index,
+                         unsigned mode, unsigned start, unsigned count);
+
+extern void
+cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags);
 
 
 #endif /* CELL_DRAW_ARRAYS_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 971d65d09e..fe5437023b 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -294,6 +294,8 @@ cell_set_framebuffer_state(struct pipe_context *pipe,
       struct pipe_surface *csurf = fb->cbufs[0];
       struct pipe_surface *zsurf = fb->zsbuf;
       uint i;
+      uint flags = (PIPE_BUFFER_USAGE_GPU_WRITE |
+                    PIPE_BUFFER_USAGE_GPU_READ);
 
       /* unmap old surfaces */
       for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
@@ -313,10 +315,10 @@ cell_set_framebuffer_state(struct pipe_context *pipe,
 
       /* map new surfaces */
       if (csurf)
-         cell->cbuf_map[0] = pipe_surface_map(csurf);
+         cell->cbuf_map[0] = pipe_surface_map(csurf, flags);
 
       if (zsurf)
-         cell->zsbuf_map = pipe_surface_map(zsurf);
+         cell->zsbuf_map = pipe_surface_map(zsurf, flags);
 
       cell->dirty |= CELL_NEW_FRAMEBUFFER;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 3646a0ee4f..9d88c1cf3d 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -162,13 +162,13 @@ cell_emit_state(struct cell_context *cell)
       const struct draw_context *const draw = cell->draw;
       struct cell_shader_info info;
 
-      info.num_outputs = draw->num_vs_outputs;
-      info.declarations = (uintptr_t) draw->machine.Declarations;
-      info.num_declarations = draw->machine.NumDeclarations;
-      info.instructions = (uintptr_t) draw->machine.Instructions;
-      info.num_instructions = draw->machine.NumInstructions;
-      info.immediates = (uintptr_t) draw->machine.Imms;
-      info.num_immediates = draw->machine.ImmLimit / 4;
+      info.num_outputs = draw_num_vs_outputs(draw);
+      info.declarations = (uintptr_t) draw->vs.machine.Declarations;
+      info.num_declarations = draw->vs.machine.NumDeclarations;
+      info.instructions = (uintptr_t) draw->vs.machine.Instructions;
+      info.num_instructions = draw->vs.machine.NumInstructions;
+      info.immediates = (uintptr_t) draw->vs.machine.Imms;
+      info.num_immediates = draw->vs.machine.ImmLimit / 4;
 
       emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS,
 		     & info, sizeof(info));
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index cd96b317fa..86bcad05e9 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -41,7 +41,7 @@
 static INLINE struct cell_fragment_shader_state *
 cell_fragment_shader_state(void *shader)
 {
-   return (struct pipe_shader_state *) shader;
+   return (struct cell_fragment_shader_state *) shader;
 }
 
 
@@ -49,7 +49,7 @@ cell_fragment_shader_state(void *shader)
 static INLINE struct cell_vertex_shader_state *
 cell_vertex_shader_state(void *shader)
 {
-   return (struct pipe_shader_state *) shader;
+   return (struct cell_vertex_shader_state *) shader;
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
index 2d31ad89a6..d9e3b510dc 100644
--- a/src/gallium/drivers/cell/ppu/cell_surface.c
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -26,11 +26,12 @@
  **************************************************************************/
 
 #include "pipe/p_defines.h"
-#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "util/p_tile.h"
+#include "util/u_memory.h"
 #include "util/u_rect.h"
+#include "util/u_tile.h"
+
 #include "cell_context.h"
 #include "cell_surface.h"
 
@@ -46,12 +47,12 @@ cell_surface_copy(struct pipe_context *pipe,
 {
    assert( dst->cpp == src->cpp );
 
-   pipe_copy_rect(pipe_surface_map(dst),
+   pipe_copy_rect(pipe_surface_map(dst, PIPE_BUFFER_USAGE_CPU_WRITE),
                   &dst->block,
                   dst->stride,
                   dstx, dsty,
                   width, height,
-                  pipe_surface_map(src),
+                  pipe_surface_map(src, PIPE_BUFFER_USAGE_CPU_READ),
                   do_flip ? -src->stride : src->stride,
                   srcx, do_flip ? height - 1 - srcy : srcy);
 
@@ -81,7 +82,7 @@ cell_surface_fill(struct pipe_context *pipe,
                   unsigned width, unsigned height, unsigned value)
 {
    unsigned i, j;
-   void *dst_map = pipe_surface_map(dst);
+   void *dst_map = pipe_surface_map(dst, PIPE_BUFFER_USAGE_CPU_WRITE);
 
    assert(dst->stride > 0);
 
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 1add81373d..5a0942bbd6 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -33,8 +33,9 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "cell_context.h"
 #include "cell_state.h"
@@ -68,11 +69,13 @@ cell_texture_layout(struct cell_texture * spt)
       pt->nblocksx[level] = pf_get_nblocksx(&pt->block, width);  
       pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height);  
 
+      spt->stride[level] = pt->nblocksx[level] * pt->block.size;
+
       spt->level_offset[level] = spt->buffer_size;
 
       spt->buffer_size += (pt->nblocksy[level] *
 			  ((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) *
-			  pt->nblocksx[level] * pt->block.size;
+                           pt->nblocksx[level] * pt->block.size);
 
       width  = minify(width);
       height = minify(height);
@@ -147,7 +150,8 @@ cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
 static struct pipe_surface *
 cell_get_tex_surface_screen(struct pipe_screen *screen,
                             struct pipe_texture *pt,
-                            unsigned face, unsigned level, unsigned zslice)
+                            unsigned face, unsigned level, unsigned zslice,
+                            unsigned usage)
 {
    struct pipe_winsys *ws = screen->winsys;
    struct cell_texture *spt = cell_texture(pt);
@@ -166,6 +170,10 @@ cell_get_tex_surface_screen(struct pipe_screen *screen,
       ps->nblocksy = pt->nblocksy[level];
       ps->stride = spt->stride[level];
       ps->offset = spt->level_offset[level];
+      ps->usage = usage;
+
+      /* XXX may need to override usage flags (see sp_texture.c) */
+
 
       if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
 	 ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
@@ -228,10 +236,11 @@ cell_tile_texture(struct cell_context *cell,
    assert(w % TILE_SIZE == 0);
    assert(h % TILE_SIZE == 0);
 
-   surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice);
+   surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
+                                  PIPE_BUFFER_USAGE_CPU_WRITE);
    ASSERT(surf);
 
-   src = (const uint *) pipe_surface_map(surf);
+   src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
 
    if (texture->tiled_data) {
       align_free(texture->tiled_data);
@@ -246,11 +255,12 @@ cell_tile_texture(struct cell_context *cell,
 }
 
 
-
 void
 cell_update_texture_mapping(struct cell_context *cell)
 {
+#if 0
    uint face = 0, level = 0, zslice = 0;
+#endif
    uint i;
 
    for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
@@ -275,10 +285,52 @@ cell_update_texture_mapping(struct cell_context *cell)
 }
 
 
+static void *
+cell_surface_map( struct pipe_screen *screen,
+                  struct pipe_surface *surface,
+                  unsigned flags )
+{
+   ubyte *map;
+
+   if (flags & ~surface->usage) {
+      assert(0);
+      return NULL;
+   }
+
+   map = screen->winsys->buffer_map( screen->winsys, surface->buffer, flags );
+   if (map == NULL)
+      return NULL;
+
+   /* May want to different things here depending on read/write nature
+    * of the map:
+    */
+   if (surface->texture &&
+       (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) 
+   {
+      /* Do something to notify sharing contexts of a texture change.
+       * In softpipe, that would mean flushing the texture cache.
+       */
+#if 0
+      cell_screen(screen)->timestamp++;
+#endif
+   }
+   
+   return map + surface->offset;
+}
+
+
+static void
+cell_surface_unmap(struct pipe_screen *screen,
+                   struct pipe_surface *surface)
+{
+   screen->winsys->buffer_unmap( screen->winsys, surface->buffer );
+}
+
+
 void
 cell_init_texture_functions(struct cell_context *cell)
 {
-   cell->pipe.texture_update = cell_texture_update;
+   /*cell->pipe.texture_update = cell_texture_update;*/
 }
 
 void
@@ -287,4 +339,7 @@ cell_init_screen_texture_funcs(struct pipe_screen *screen)
    screen->texture_create = cell_texture_create_screen;
    screen->texture_release = cell_texture_release_screen;
    screen->get_tex_surface = cell_get_tex_surface_screen;
+
+   screen->surface_map = cell_surface_map;
+   screen->surface_unmap = cell_surface_unmap;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index fcee069d05..6d37e95ebc 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -41,6 +41,7 @@ struct cell_texture
    struct pipe_texture base;
 
    unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
+   unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
 
    /* The data is held here:
     */
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index 3a181b585c..e4230c7a5f 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -37,6 +37,7 @@
 #include "cell_spu.h"
 #include "cell_vbuf.h"
 #include "draw/draw_vbuf.h"
+#include "util/u_memory.h"
 
 
 /** Allow vertex data to be inlined after RENDER command */
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 49d5443cde..2ece0250f6 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -260,6 +260,7 @@ emit_fetch(struct spe_function *p,
 
 void cell_update_vertex_fetch(struct draw_context *draw)
 {
+#if 0
    struct cell_context *const cell =
        (struct cell_context *) draw->driver_private;
    struct spe_function *p = &cell->attrib_fetch;
@@ -337,4 +338,7 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 	     cell->attrib_fetch_offsets[function_index[i]];
       }
    }
+#else
+   assert(0);
+#endif
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
index f753960a0f..3658947715 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -32,6 +32,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_winsys.h"
+#include "util/u_math.h"
 
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
@@ -50,6 +51,7 @@
 void
 cell_vertex_shader_queue_flush(struct draw_context *draw)
 {
+#if 0
    struct cell_context *const cell =
        (struct cell_context *) draw->driver_private;
    struct cell_command_vs *const vs = &cell_global.command[0].vs;
@@ -138,4 +140,7 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
 
    draw->vs.post_nr = draw->vs.queue_nr;
    draw->vs.queue_nr = 0;
+#else
+   assert(0);
+#endif
 }
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
index c68f78f59b..8605679940 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.h
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -99,7 +99,7 @@ struct spu_exec_machine
     * 1  address
     */
    struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
-				      + TGSI_EXEC_NUM_ADDRS + 1]
+                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1]
        ALIGN16_ATTRIB;
 
    struct spu_exec_vector       *Addrs;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 6df59abd36..305dc98881 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -35,7 +35,7 @@
 #include "spu_tri.h"
 #include "spu_tile.h"
 #include "cell/common.h"
-
+#include "util/u_memory.h"
 
 
 /**
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 8944ef171e..2a4e0b423c 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -32,6 +32,7 @@
 #include <transpose_matrix4x4.h>
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
+#include "util/u_math.h"
 #include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index dbcf4b0eb9..b25ca4eafc 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -1,4 +1,6 @@
+
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_debug.h"
 #include "tgsi/tgsi_parse.h"
 //#include "tgsi_build.h"
 #include "tgsi/tgsi_util.h"
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index a1e81975e6..f81d19fea1 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -36,13 +36,35 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
-#include "spu_vertex_shader.h"
-#include "spu_exec.h"
+#include "util/u_math.h"
 #include "draw/draw_private.h"
 #include "draw/draw_context.h"
 #include "cell/common.h"
+#include "spu_vertex_shader.h"
+#include "spu_exec.h"
 #include "spu_main.h"
 
+
+#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float))
+
+
+#define CLIP_RIGHT_BIT 0x01
+#define CLIP_LEFT_BIT 0x02
+#define CLIP_TOP_BIT 0x04
+#define CLIP_BOTTOM_BIT 0x08
+#define CLIP_FAR_BIT 0x10
+#define CLIP_NEAR_BIT 0x20
+
+
+static INLINE float
+dot4(const float *a, const float *b)
+{
+   return (a[0]*b[0] +
+           a[1]*b[1] +
+           a[2]*b[2] +
+           a[3]*b[3]);
+}
+
 static INLINE unsigned
 compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
 {
-- 
cgit v1.2.3


From fafc36920eb79ddbe7049f6bbce18bcc279982d0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 3 Sep 2008 09:31:36 -0600
Subject: cell: add -DDEBUG flag, fixes to Cell Makefiles

---
 configs/linux-cell-debug              | 2 +-
 src/gallium/drivers/cell/ppu/Makefile | 2 +-
 src/gallium/drivers/cell/spu/Makefile | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/configs/linux-cell-debug b/configs/linux-cell-debug
index ba333bba68..42f3245edc 100644
--- a/configs/linux-cell-debug
+++ b/configs/linux-cell-debug
@@ -6,5 +6,5 @@ include $(TOP)/configs/linux-cell
 
 CONFIG_NAME = linux-cell-debug
 
-OPT_FLAGS = -g
+OPT_FLAGS = -g -DDEBUG
 
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 0389a9554c..25473e200c 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -5,7 +5,7 @@
 
 
 TOP = ../../../../..
-include $(TOP)/configs/linux-cell
+include $(TOP)/configs/current
 
 
 # This is the "top-level" cell PPU driver code, will get pulled into libGL.so
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 8e83610790..d49abb2e82 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -5,7 +5,7 @@
 
 
 TOP = ../../../../..
-include $(TOP)/configs/linux-cell
+include $(TOP)/configs/current
 
 
 PROG = g3d
-- 
cgit v1.2.3


From ba2812f23e05c63e0ea2a042dcb788c30fcbc37f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 3 Sep 2008 11:45:28 -0600
Subject: cell: comments

---
 src/gallium/drivers/cell/spu/spu_tile.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
index 12dc246328..216a33126b 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.c
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -31,6 +31,9 @@
 #include "spu_main.h"
 
 
+/**
+ * Get tile of color or Z values from main memory, put into SPU memory.
+ */
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
 {
@@ -56,6 +59,9 @@ get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
 }
 
 
+/**
+ * Move tile of color or Z values from SPU memory to main memory.
+ */
 void
 put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
 {
-- 
cgit v1.2.3


From 6c84652dc3877593b8b151366521289833707b40 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 3 Sep 2008 13:31:51 -0600
Subject: cell: replace assert() with special spu ASSERT() macro

---
 src/gallium/drivers/cell/spu/spu_exec.c            | 150 ++++++++++-----------
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |   4 +-
 src/gallium/drivers/cell/spu/spu_util.c            |  21 +--
 src/gallium/drivers/cell/spu/spu_vertex_fetch.c    |   2 +-
 src/gallium/drivers/cell/spu/spu_vertex_shader.c   |   2 +-
 5 files changed, 90 insertions(+), 89 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 89c61136a4..e27df2dfb3 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -382,10 +382,10 @@ fetch_src_file_channel(
          break;
 
       case TGSI_FILE_IMMEDIATE:
-         assert( index->i[0] < (int) mach->ImmLimit );
-         assert( index->i[1] < (int) mach->ImmLimit );
-         assert( index->i[2] < (int) mach->ImmLimit );
-         assert( index->i[3] < (int) mach->ImmLimit );
+         ASSERT( index->i[0] < (int) mach->ImmLimit );
+         ASSERT( index->i[1] < (int) mach->ImmLimit );
+         ASSERT( index->i[2] < (int) mach->ImmLimit );
+         ASSERT( index->i[3] < (int) mach->ImmLimit );
 
          chan->f[0] = mach->Imms[index->i[0]][swizzle];
          chan->f[1] = mach->Imms[index->i[1]][swizzle];
@@ -409,7 +409,7 @@ fetch_src_file_channel(
          break;
 
       default:
-         assert( 0 );
+         ASSERT( 0 );
       }
       break;
 
@@ -422,7 +422,7 @@ fetch_src_file_channel(
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
@@ -471,7 +471,7 @@ fetch_source(
          index.q = si_shli(index.q, 12);
          break;
       default:
-         assert( 0 );
+         ASSERT( 0 );
       }
 
       index.i[0] += reg->SrcRegisterDim.Index;
@@ -558,7 +558,7 @@ store_dest(
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
       return;
    }
 
@@ -582,11 +582,11 @@ store_dest(
       break;
 
    case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
@@ -769,7 +769,7 @@ exec_tex(struct spu_exec_machine *mach,
       break;
 
    default:
-      assert (0);
+      ASSERT (0);
    }
 
    FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -861,7 +861,7 @@ exec_declaration(struct spu_exec_machine *mach,
             break;
 
          default:
-            assert( 0 );
+            ASSERT( 0 );
          }
 
          if( mask == TGSI_WRITEMASK_XYZW ) {
@@ -971,11 +971,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_EXP:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_LOG:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_MUL:
@@ -1151,24 +1151,24 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_CND:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CND0:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_DOT2ADD:
       /* TGSI_OPCODE_DP2A */
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_INDEX:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_NEGATE:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_FRAC:
@@ -1181,7 +1181,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_CLAMP:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_FLOOR:
@@ -1276,7 +1276,7 @@ exec_instruction(
       break;
 
     case TGSI_OPCODE_MULTIPLYMATRIX:
-       assert (0);
+       ASSERT (0);
        break;
 
     case TGSI_OPCODE_ABS:
@@ -1290,7 +1290,7 @@ exec_instruction(
        break;
 
    case TGSI_OPCODE_RCC:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_DPH:
@@ -1353,23 +1353,23 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_PK2H:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_PK2US:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_PK4B:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_PK4UB:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_RFL:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_SEQ:
@@ -1384,7 +1384,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SFL:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_SGT:
@@ -1429,7 +1429,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_STR:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TEX:
@@ -1452,7 +1452,7 @@ exec_instruction(
       /* src[1] = d[strq]/dx */
       /* src[2] = d[strq]/dy */
       /* src[3] = sampler unit */
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TXL:
@@ -1470,35 +1470,35 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_UP2H:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_UP2US:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_UP4B:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_UP4UB:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_X2D:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_ARA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_ARR:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_BRA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CAL:
@@ -1507,14 +1507,14 @@ exec_instruction(
          /* do the call */
 
          /* push the Cond, Loop, Cont stacks */
-         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
-         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
-         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
 
-         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
 
          /* note that PC was already incremented above */
@@ -1538,13 +1538,13 @@ exec_instruction(
          *pc = mach->CallStack[--mach->CallStackTop];
 
          /* pop the Cond, Loop, Cont stacks */
-         assert(mach->CondStackTop > 0);
+         ASSERT(mach->CondStackTop > 0);
          mach->CondMask = mach->CondStack[--mach->CondStackTop];
-         assert(mach->LoopStackTop > 0);
+         ASSERT(mach->LoopStackTop > 0);
          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
-         assert(mach->ContStackTop > 0);
+         ASSERT(mach->ContStackTop > 0);
          mach->ContMask = mach->ContStack[--mach->ContStackTop];
-         assert(mach->FuncStackTop > 0);
+         ASSERT(mach->FuncStackTop > 0);
          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
 
          UPDATE_EXEC_MASK(mach);
@@ -1552,7 +1552,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CMP:
@@ -1592,11 +1592,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_NRM:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_DIV:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_DP2:
@@ -1615,7 +1615,7 @@ exec_instruction(
 
    case TGSI_OPCODE_IF:
       /* push CondMask */
-      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
       FETCH( &r[0], 0, CHAN_X );
       /* update CondMask */
@@ -1639,7 +1639,7 @@ exec_instruction(
       /* invert CondMask wrt previous mask */
       {
          uint prevMask;
-         assert(mach->CondStackTop > 0);
+         ASSERT(mach->CondStackTop > 0);
          prevMask = mach->CondStack[mach->CondStackTop - 1];
          mach->CondMask = ~mach->CondMask & prevMask;
          UPDATE_EXEC_MASK(mach);
@@ -1649,7 +1649,7 @@ exec_instruction(
 
    case TGSI_OPCODE_ENDIF:
       /* pop CondMask */
-      assert(mach->CondStackTop > 0);
+      ASSERT(mach->CondStackTop > 0);
       mach->CondMask = mach->CondStack[--mach->CondStackTop];
       UPDATE_EXEC_MASK(mach);
       break;
@@ -1660,19 +1660,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_REP:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_ENDREP:
-       assert (0);
+       ASSERT (0);
        break;
 
    case TGSI_OPCODE_PUSHA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_POPA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CEIL:
@@ -1746,7 +1746,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MOD:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_XOR:
@@ -1759,15 +1759,15 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SAD:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TXF:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TXQ:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_EMIT:
@@ -1784,9 +1784,9 @@ exec_instruction(
       /* fall-through (for now) */
    case TGSI_OPCODE_BGNLOOP2:
       /* push LoopMask and ContMasks */
-      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
-      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       break;
 
@@ -1794,7 +1794,7 @@ exec_instruction(
       /* fall-through (for now at least) */
    case TGSI_OPCODE_ENDLOOP2:
       /* Restore ContMask, but don't pop */
-      assert(mach->ContStackTop > 0);
+      ASSERT(mach->ContStackTop > 0);
       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
       if (mach->LoopMask) {
          /* repeat loop: jump to instruction just past BGNLOOP */
@@ -1802,10 +1802,10 @@ exec_instruction(
       }
       else {
          /* exit loop: pop LoopMask */
-         assert(mach->LoopStackTop > 0);
+         ASSERT(mach->LoopStackTop > 0);
          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
          /* pop ContMask */
-         assert(mach->ContStackTop > 0);
+         ASSERT(mach->ContStackTop > 0);
          mach->ContMask = mach->ContStack[--mach->ContStackTop];
       }
       UPDATE_EXEC_MASK(mach);
@@ -1834,26 +1834,26 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_NOISE1:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOISE2:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOISE3:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOISE4:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOP:
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
@@ -1874,11 +1874,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    mach->FuncMask = 0xf;
    mach->ExecMask = 0xf;
 
-   mach->CondStackTop = 0; /* temporarily subvert this assertion */
-   assert(mach->CondStackTop == 0);
-   assert(mach->LoopStackTop == 0);
-   assert(mach->ContStackTop == 0);
-   assert(mach->CallStackTop == 0);
+   mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */
+   ASSERT(mach->CondStackTop == 0);
+   ASSERT(mach->LoopStackTop == 0);
+   ASSERT(mach->ContStackTop == 0);
+   ASSERT(mach->CallStackTop == 0);
 
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index b4cffeeb32..c0a729b3d2 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -95,7 +95,7 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 
 
    default:
-      assert(0);
+      ASSERT(0);
       break;
    }
 }
@@ -153,7 +153,7 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 
 
    default:
-      assert(0);
+      ASSERT(0);
       break;
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index b25ca4eafc..b8a0d4a265 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -1,4 +1,5 @@
 
+#include "cell/common.h"
 #include "pipe/p_shader_tokens.h"
 #include "pipe/p_debug.h"
 #include "tgsi/tgsi_parse.h"
@@ -20,7 +21,7 @@ tgsi_util_get_src_register_swizzle(
    case 3:
       return reg->SwizzleW;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
    return 0;
 }
@@ -40,7 +41,7 @@ tgsi_util_get_src_register_extswizzle(
    case 3:
       return reg->ExtSwizzleW;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
    return 0;
 }
@@ -60,12 +61,12 @@ tgsi_util_get_full_src_register_extswizzle(
       &reg->SrcRegisterExtSwz,
       component );
 
-   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
-   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
-   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
-   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+   ASSERT (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   ASSERT (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   ASSERT (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   ASSERT (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   ASSERT (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   ASSERT (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
 
    /*
     * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
@@ -95,7 +96,7 @@ tgsi_util_get_src_register_extnegate(
    case 3:
       return reg->NegateW;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
    return 0;
 }
@@ -120,7 +121,7 @@ tgsi_util_set_src_register_extnegate(
       reg->NegateW = negate;
       break;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 26f2363749..03375d84a5 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -92,7 +92,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
 
-   assert(count <= 4);
+   ASSERT(count <= 4);
 
 #if DRAW_DBG
    printf("SPU: %s count = %u, nr_attrs = %u\n", 
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index f81d19fea1..fbe5b34d39 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -112,7 +112,7 @@ run_vertex_program(struct spu_vs_context *draw,
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
-   assert(count <= 4);
+   ASSERT(count <= 4);
 
    machine->Processor = TGSI_PROCESSOR_VERTEX;
 
-- 
cgit v1.2.3


From 5cf2e226548f08c4b79a4eb289fd636a00079fb3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 4 Sep 2008 18:36:22 -0600
Subject: cell: implement CELL_DEBUG env/options var

Options so far:
  "checker"  module tile clear color by SPU ID to see where the tiles are
  "sync"  to do synchronous DMA (only partially implemented)
---
 src/gallium/drivers/cell/common.h           |  4 ++
 src/gallium/drivers/cell/ppu/cell_context.c | 13 ++++++
 src/gallium/drivers/cell/ppu/cell_context.h |  2 +
 src/gallium/drivers/cell/ppu/cell_spu.c     |  1 +
 src/gallium/drivers/cell/spu/spu_main.c     | 71 +++++++++++++++++++----------
 5 files changed, 66 insertions(+), 25 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 6bace0bb11..c0ca201e1d 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -106,6 +106,9 @@
 #define CELL_BUFFER_STATUS_USED 20
 
 
+#define CELL_DEBUG_CHECKER  (1 << 0)
+#define CELL_DEBUG_SYNC     (1 << 1)
+
 
 /**
  */
@@ -263,6 +266,7 @@ struct cell_init_info
 {
    unsigned id;
    unsigned num_spus;
+   unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
    struct cell_command *cmd;
 
    /** Buffers for command batches, vertex/index data */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 9ff4e86943..c8828e644c 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -85,6 +85,15 @@ cell_draw_create(struct cell_context *cell)
 }
 
 
+#ifdef DEBUG
+static const struct debug_named_value cell_debug_flags[] = {
+   {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+   {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
+   {NULL, 0}
+};
+#endif
+
+
 struct pipe_context *
 cell_create_context(struct pipe_screen *screen,
                     struct cell_winsys *cws)
@@ -136,6 +145,10 @@ cell_create_context(struct pipe_screen *screen,
    draw_wide_point_threshold(cell->draw, 0.0);
    draw_wide_line_threshold(cell->draw, 0.0);
 
+   cell->debug_flags = debug_get_flags_option("CELL_DEBUG", 
+                                              cell_debug_flags, 
+                                              0 );
+
    /*
     * SPU stuff
     */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 9ca153d52f..8cec9f45b2 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -163,6 +163,8 @@ struct cell_context
 
    struct spe_function attrib_fetch;
    unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
+
+   unsigned debug_flags;
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 5e75f409a3..2df90fdcb7 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -131,6 +131,7 @@ cell_start_spus(struct cell_context *cell)
    for (i = 0; i < cell->num_spus; i++) {
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
+      cell_global.inits[i].debug_flags = cell->debug_flags;
       cell_global.inits[i].cmd = &cell_global.command[i];
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index e04ffeb9b1..0d4cdfae85 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -136,54 +136,75 @@ really_clear_tiles(uint surfaceIndex)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   uint i;
-
    if (Debug)
       printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
              clear->surface, clear->value);
 
-#define CLEAR_OPT 1
-#if CLEAR_OPT
-   /* set all tile's status to CLEAR */
    if (clear->surface == 0) {
-      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
       spu.fb.color_clear_value = clear->value;
+      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+            (spu.init.id << 20) | (spu.init.id << 28);
+         spu.fb.color_clear_value ^= x;
+      }
    }
    else {
-      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
       spu.fb.depth_clear_value = clear->value;
    }
-   return;
-#endif
 
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+   /* Simply set all tiles' status to CLEAR.
+    * When we actually begin rendering into a tile, we'll initialize it to
+    * the clear value.  If any tiles go untouched during the frame,
+    * really_clear_tiles() will set them to the clear value.
+    */
    if (clear->surface == 0) {
-      spu.fb.color_clear_value = clear->value;
-      clear_c_tile(&spu.ctile);
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
    }
    else {
-      spu.fb.depth_clear_value = clear->value;
-      clear_z_tile(&spu.ztile);
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
    }
 
+#else
+
+   /*
+    * This path clears the whole framebuffer to the clear color right now.
+    */
+
    /*
    printf("SPU: %s num=%d w=%d h=%d\n",
           __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
    */
 
-   for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-      uint tx = i % spu.fb.width_tiles;
-      uint ty = i / spu.fb.width_tiles;
-      if (clear->surface == 0)
-         put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
-      else
-         put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
-      /* XXX we don't want this here, but it fixes bad tile results */
+   /* init a single tile to the clear value */
+   if (clear->surface == 0) {
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      clear_z_tile(&spu.ztile);
    }
 
-#if 0
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
-#endif
+   /* walk over my tiles, writing the 'clear' tile's data */
+   {
+      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+      uint i;
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (clear->surface == 0)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         else
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+      wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   }
+
+#endif /* CLEAR_OPT */
 
    if (Debug)
       printf("SPU %u: CLEAR SURF done\n", spu.init.id);
-- 
cgit v1.2.3


From 8af4794afcfa04351d4131d826daeb1312634f82 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 5 Sep 2008 10:18:00 -0600
Subject: cell: code clean-up, comments

---
 src/gallium/drivers/cell/spu/spu_main.c | 67 +++++++++++++++++++--------------
 src/gallium/drivers/cell/spu/spu_main.h |  8 ++--
 2 files changed, 43 insertions(+), 32 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 0d4cdfae85..d223f32d94 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -55,6 +55,10 @@ struct spu_global spu;
 
 struct spu_vs_context draw;
 
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
 static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
     ALIGN16_ATTRIB;
 
@@ -332,6 +336,21 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
 }
 
 
+static void
+cmd_state_logicop(const struct cell_command_logicop * code)
+{
+   mfc_get(logicop_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   spu.logicop = (logicop_func) logicop_code_buffer;
+}
+
+
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
@@ -401,6 +420,21 @@ cmd_state_vs_array_info(const struct cell_array_info *vs_info)
 }
 
 
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
 static void
 cmd_finish(void)
 {
@@ -539,38 +573,15 @@ cmd_batch(uint opcode)
                                 (struct cell_shader_info *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
          break;
-      case CELL_CMD_STATE_ATTRIB_FETCH: {
-         struct cell_attribute_fetch_code *code =
-             (struct cell_attribute_fetch_code *) &buffer[pos+1];
-
-              mfc_get(attribute_fetch_code_buffer,
-                      (unsigned int) code->base,  /* src */
-                      code->size,
-                      TAG_BATCH_BUFFER,
-                      0, /* tid */
-                      0  /* rid */);
-         wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-         draw.vertex_fetch.code = attribute_fetch_code_buffer;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
-      }
-      case CELL_CMD_STATE_LOGICOP: {
-         struct cell_command_logicop *code =
-             (struct cell_command_logicop *) &buffer[pos+1];
-
-              mfc_get(logicop_code_buffer,
-                      (unsigned int) code->base,  /* src */
-                      code->size,
-                      TAG_BATCH_BUFFER,
-                      0, /* tid */
-                      0  /* rid */);
-         wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-	 spu.logicop = (logicop_func) logicop_code_buffer;
+      case CELL_CMD_STATE_LOGICOP:
+         cmd_state_logicop((struct cell_command_logicop *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
          break;
-      }
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e962e1426c..4879f8c9c8 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -124,13 +124,13 @@ struct spu_global
    struct spu_framebuffer fb;
    boolean read_depth;
    boolean read_stencil;
-   frag_test_func frag_test;
+   frag_test_func frag_test;  /**< Current depth/stencil test code */
    
-   boolean read_fb;
-   blend_func blend;
+   boolean read_fb;   /**< Does current blend mode require framebuffer read? */
+   blend_func blend;  /**< Current blend code */
    qword const_blend_color[4] ALIGN16_ATTRIB;
 
-   logicop_func logicop;
+   logicop_func logicop;  /**< Current logicop code **/
 
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-- 
cgit v1.2.3


From 0e79e474de164a765b9759398c83b6bfa16a0012 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 5 Sep 2008 13:55:02 -0600
Subject: cell: comments, etc.

---
 .../drivers/cell/ppu/cell_state_per_fragment.c     | 28 ++++++++++----
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c   |  5 +--
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 44 +++++++++++++++-------
 3 files changed, 52 insertions(+), 25 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index 53ae3aa50e..705867107b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -132,9 +132,9 @@ emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
 
 
 /**
+ * Generate code to perform Z testing.  Four Z values are tested at once.
  * \param dsa        Current depth-test state
  * \param f          Function to which code should be appended
- * \param m          Mask of allocated / free SPE registers
  * \param mask       Index of register to contain depth-pass mask
  * \param stored     Index of register containing values from depth buffer
  * \param calculated Index of register containing per-fragment depth values
@@ -198,6 +198,7 @@ emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
 
 
 /**
+ * Generate code to apply the stencil operation (after testing).
  * \note Emits a maximum of 5 instructions.
  *
  * \warning
@@ -222,9 +223,13 @@ emit_stencil_op(struct spe_function *f,
       spe_il(f, result, ref);
       break;
    case PIPE_STENCIL_OP_INCR:
+      /* clamp = [0xff, 0xff, 0xff, 0xff] */
       spe_il(f, clamp, 0x0ff);
+      /* result[i] = in[i] + 1 */
       spe_ai(f, result, in, 1);
+      /* clamp_mask[i] = (result[i] > 0xff) */
       spe_clgti(f, clamp_mask, result, 0x0ff);
+      /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */
       spe_selb(f, result, result, clamp, clamp_mask);
       break;
    case PIPE_STENCIL_OP_DECR:
@@ -259,10 +264,10 @@ emit_stencil_op(struct spe_function *f,
 
 
 /**
+ * Generate code to do stencil test.  Four pixels are tested at once.
  * \param dsa        Depth / stencil test state
  * \param face       0 for front face, 1 for back face
  * \param f          Function to append instructions to
- * \param reg_mask   Mask of allocated registers
  * \param mask       Register containing mask of fragments passing the
  *                   alpha test
  * \param depth_mask Register containing mask of fragments passing the
@@ -310,13 +315,14 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
 
    switch (dsa->stencil[face].func) {
    case PIPE_FUNC_NEVER:
-      spe_il(f, stencil_mask, 0);
+      spe_il(f, stencil_mask, 0);   /* stencil_mask[0..3] = [0,0,0,0] */
       break;
 
    case PIPE_FUNC_NOTEQUAL:
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_EQUAL:
+      /* stencil_mask[i] = (stored[i] == ref) */
       spe_ceqi(f, stencil_mask, stored, ref);
       break;
 
@@ -324,6 +330,8 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_GREATER:
+      complement = TRUE;
+      /* stencil_mask[i] = (stored[i] > ref) */
       spe_clgti(f, stencil_mask, stored, ref);
       break;
 
@@ -331,8 +339,11 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_GEQUAL:
+      /* stencil_mask[i] = (stored[i] > ref) */
       spe_clgti(f, stencil_mask, stored, ref);
+      /* tmp[i] = (stored[i] == ref) */
       spe_ceqi(f, tmp, stored, ref);
+      /* stencil_mask[i] = stencil_mask[i] | tmp[i] */
       spe_or(f, stencil_mask, stencil_mask, tmp);
       break;
 
@@ -461,7 +472,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
     * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round
     * up to 64 to make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    /* Allocate registers for the function's input parameters.  Cleverly (and
@@ -540,7 +551,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
          spe_selb(f, depth, depth, zvals, mask);
    }
 
-   spe_bi(f, 0, 0, 0);
+   spe_bi(f, 0, 0, 0);  /* return from function call */
 
 
 #if 0
@@ -956,7 +967,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
     * + 4 (fragment mask) + 1 (return) = 55 instlructions.  Round up to 64 to
     * make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    const int frag[4] = {
@@ -1144,7 +1155,8 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
 }
 
 
-int PC_OFFSET(const struct spe_function *f, const void *d)
+static int
+PC_OFFSET(const struct spe_function *f, const void *d)
 {
    const intptr_t pc = (intptr_t) f->csr;
    const intptr_t ea = ~0x0f & (intptr_t) d;
@@ -1178,7 +1190,7 @@ cell_generate_logic_op(struct spe_function *f,
     * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
     * to 64 to make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    /* Pixel colors in framebuffer format in AoS layout.
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 2ece0250f6..566df7f59e 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -297,10 +297,9 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 
 
    /* Each fetch function can be a maximum of 34 instructions (note: this is
-    * actually a slight over-estimate).  That means (34 * 4) = 136 bytes
-    * each maximum.
+    * actually a slight over-estimate).
     */
-   spe_init_func(p, 136 * unique_attr_formats);
+   spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats);
 
 
    /* Allocate registers for the function's input parameters.
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index c0a729b3d2..db88735226 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -35,8 +35,17 @@
 
 #define ZERO 0x80
 
+
+/**
+ * Get a "quad" of four fragment Z/stencil values from the given tile.
+ * \param tile  the tile of Z/stencil values
+ * \param x, y  location of the quad in the tile, in pixels
+ * \param depth_format  format of the tile's data
+ * \param detph  returns four depth values
+ * \param stencil  returns four stencil values
+ */
 static void
-read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
+read_ds_quad(tile_t *tile, unsigned x, unsigned y,
              enum pipe_format depth_format, qword *depth,
              qword *stencil)
 {
@@ -45,14 +54,13 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 
    switch (depth_format) {
    case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
+      qword *ptr = (qword *) &tile->us8[iy][ix / 2];
 
       const qword shuf_vec = (qword) {
          ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
          ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7
       };
 
-
       /* At even X values we want the first 4 shorts, and at odd X values we
        * want the second 4 shorts.
        */
@@ -65,18 +73,16 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword *ptr = (qword *) &tile->ui4[iy][ix];
 
       *depth = *ptr;
       *stencil = si_il(0);
       break;
    }
-      
 
    case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword *ptr = (qword *) &tile->ui4[iy][ix];
       qword mask = si_fsmbi(0xEEEE);
 
       *depth = si_rotmai(si_and(*ptr, mask), -8);
@@ -84,16 +90,14 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    case PIPE_FORMAT_S8Z24_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword *ptr = (qword *) &tile->ui4[iy][ix];
 
       *depth = si_and(*ptr, si_fsmbi(0x7777));
       *stencil = si_andi(si_roti(*ptr, 8), 0x0ff);
       break;
    }
 
-
    default:
       ASSERT(0);
       break;
@@ -101,6 +105,14 @@ read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 }
 
 
+/**
+ * Put a quad of Z/stencil values into a tile.
+ * \param tile  the tile of Z/stencil values to write into
+ * \param x, y  location of the quad in the tile, in pixels
+ * \param depth_format  format of the tile's data
+ * \param detph  depth values to store
+ * \param stencil  stencil values to store
+ */
 static void
 write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
               enum pipe_format depth_format,
@@ -124,14 +136,12 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    case PIPE_FORMAT_Z32_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       *ptr = depth;
       break;
    }
 
-
    case PIPE_FORMAT_Z24S8_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       qword mask = si_fsmbi(0xEEEE);
@@ -141,7 +151,6 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    case PIPE_FORMAT_S8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       qword mask = si_fsmbi(0x7777);
@@ -151,7 +160,6 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-
    default:
       ASSERT(0);
       break;
@@ -159,6 +167,14 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 }
 
 
+/**
+ * Do depth/stencil/alpha test for a "quad" of 4 fragments.
+ * \param x,y  location of quad within tile
+ * \param frag_mask  indicates which fragments are "alive"
+ * \param frag_depth  four fragment depth values
+ * \param frag_alpha  four fragment alpha values
+ * \param facing  front/back facing for four fragments (1=front, 0=back)
+ */
 qword
 spu_do_depth_stencil(int x, int y,
                      qword frag_mask, qword frag_depth, qword frag_alpha,
-- 
cgit v1.2.3


From cd9722dcddcb41af3196860280d23542dc673700 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 8 Sep 2008 11:50:13 -0600
Subject: cell: comments

---
 src/gallium/drivers/cell/spu/spu_tri.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 2a4e0b423c..a3ea0a3e69 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -209,7 +209,7 @@ clip_emit_quad(struct setup_stage *setup)
 /**
  * Evaluate attribute coefficients (plane equations) to compute
  * attribute values for the four fragments in a quad.
- * Eg: four colors will be compute.
+ * Eg: four colors will be computed (in AoS format).
  */
 static INLINE void
 eval_coeff(uint slot, float x, float y, vector float result[4])
@@ -356,6 +356,7 @@ emit_quad( int x, int y, mask_t mask )
 
 
       /* Convert fragment data from AoS to SoA format.
+       * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
        */
       qword soa_frag[4];
       _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
@@ -373,6 +374,7 @@ emit_quad( int x, int y, mask_t mask )
 
       if (spu.read_fb) {
          /* Convert pixel data from AoS to SoA format.
+          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
           */
          vec_float4 aos_pix[4] = {
             spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
@@ -393,6 +395,7 @@ emit_quad( int x, int y, mask_t mask )
 
 
       /* Convert final pixel data from SoA to AoS format.
+       * I.e. (RRRR,GGGG,BBBB,AAAA) -> (RGBA,RGBA,RGBA,RGBA)
        */
       result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
                               result.r, result.g, result.b, result.a,
-- 
cgit v1.2.3


From 04ae4fba3c0a656cf2747fc994b99f99576d0e2b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 8 Sep 2008 11:53:14 -0600
Subject: cell: minor change to Z float/int conversion code (avoid switch)

---
 src/gallium/drivers/cell/spu/spu_main.c            |  5 ++++
 src/gallium/drivers/cell/spu/spu_main.h            |  5 ++++
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 34 +++++++++-------------
 3 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index d223f32d94..c4236817a9 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -252,12 +252,17 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 
    switch (spu.fb.depth_format) {
    case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_S8Z24_UNORM:
       spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
       break;
    case PIPE_FORMAT_Z16_UNORM:
       spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
       break;
    default:
       spu.fb.zsize = 0;
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 4879f8c9c8..c2a53c9dcf 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,10 @@
 #define MAX_HEIGHT 1024
 
 
+/**
+ * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
+ * The data may be addressed through several different types.
+ */
 typedef union {
    ushort us[TILE_SIZE][TILE_SIZE];
    uint   ui[TILE_SIZE][TILE_SIZE];
@@ -99,6 +103,7 @@ struct spu_framebuffer {
    uint depth_clear_value;
 
    uint zsize;                     /**< 0, 2 or 4 bytes per Z */
+   float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index db88735226..29dc07a2e8 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -144,18 +144,22 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 
    case PIPE_FORMAT_Z24S8_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      /* form select mask = 1110,1110,1110,1110 */
       qword mask = si_fsmbi(0xEEEE);
-
+      /* depth[i] = depth[i] << 8 */
       depth = si_shli(depth, 8);
+      /* *ptr[i] = depth[i][31:8] | stencil[i][7:0] */
       *ptr = si_selb(stencil, depth, mask);
       break;
    }
 
    case PIPE_FORMAT_S8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      /* form select mask = 0111,0111,0111,0111 */
       qword mask = si_fsmbi(0x7777);
-
+      /* stencil[i] = stencil[i] << 24 */
       stencil = si_shli(stencil, 24);
+      /* *ptr[i] = stencil[i][31:24] | depth[i][23:0] */
       *ptr = si_selb(stencil, depth, mask);
       break;
    }
@@ -191,25 +195,13 @@ spu_do_depth_stencil(int x, int y,
       read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
                    &pixel_depth, &pixel_stencil);
    }
-   
-   switch (spu.fb.depth_format) {
-   case PIPE_FORMAT_Z16_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z32_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_S8Z24_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   default:
-      ASSERT(0);
-      break;
-   }
+
+   /* convert floating point Z values to 32-bit uint */
+
+   /* frag_depth *= spu.fb.zscale */
+   frag_depth = si_fm(frag_depth, (qword)spu_splats(spu.fb.zscale));
+   /* frag_depth = uint(frag_depth) */
+   frag_depth = si_cfltu(frag_depth, 0);
 
    result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
                              frag_depth, frag_alpha, facing);
-- 
cgit v1.2.3


From 284ab5a6127f8b452acaa0e10ac1d9ebc87fac3e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 10 Sep 2008 18:22:00 -0600
Subject: cell: checkpoint commit of new per-fragment processing

Do code generation for alpha test, z test, stencil, blend, colormask
and framebuffer/tile read/write as a single code block.
Ian's previous blend/z/stencil test code is still there but mostly disabled
and will be removed soon.
---
 src/gallium/drivers/cell/common.h                  |  20 +-
 src/gallium/drivers/cell/ppu/Makefile              |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 530 +++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_gen_fragment.h   |  38 ++
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  31 +-
 .../drivers/cell/ppu/cell_state_per_fragment.c     |   2 +-
 src/gallium/drivers/cell/spu/Makefile              |   2 +-
 src/gallium/drivers/cell/spu/spu_main.c            |  53 ++-
 src/gallium/drivers/cell/spu/spu_main.h            |  23 +
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 231 ++++++++-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |  11 +
 src/gallium/drivers/cell/spu/spu_tri.c             |  30 ++
 src/gallium/winsys/xlib/xm_api.c                   |   7 +-
 src/gallium/winsys/xlib/xm_winsys.c                |  35 ++
 14 files changed, 998 insertions(+), 16 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c0ca201e1d..a62530c64d 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -97,6 +97,7 @@
 #define CELL_CMD_STATE_LOGICOP       21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
+#define CELL_CMD_STATE_FRAGMENT_OPS  24
 
 
 #define CELL_NUM_BUFFERS 4
@@ -112,30 +113,43 @@
 
 /**
  */
-struct cell_command_depth_stencil_alpha_test {
+struct cell_command_depth_stencil_alpha_test
+{
    uint64_t base;               /**< Effective address of code start. */
    unsigned size;               /**< Size in bytes of SPE code. */
    unsigned read_depth;         /**< Flag: should depth be read? */
    unsigned read_stencil;       /**< Flag: should stencil be read? */
+   struct pipe_depth_stencil_alpha_state state;
 };
 
 
 /**
  * Upload code to perform framebuffer blend operation
  */
-struct cell_command_blend {
+struct cell_command_blend
+{
    uint64_t base;               /**< Effective address of code start. */
    unsigned size;               /**< Size in bytes of SPE code. */
    unsigned read_fb;            /**< Flag: should framebuffer be read? */
 };
 
 
-struct cell_command_logicop {
+struct cell_command_logicop
+{
    uint64_t base;               /**< Effective address of code start. */
    unsigned size;               /**< Size in bytes of SPE code. */
 };
 
 
+#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+
+struct cell_command_fragment_ops
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 25473e200c..b5a6fcb8de 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -25,6 +25,7 @@ SOURCES = \
 	cell_context.c \
 	cell_draw_arrays.c \
 	cell_flush.c \
+	cell_gen_fragment.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
 	cell_state_per_fragment.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
new file mode 100644
index 0000000000..df29476be6
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -0,0 +1,530 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU per-fragment code (actually per-quad code).
+ * \author Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "cell_context.h"
+#include "cell_gen_fragment.h"
+
+
+
+/** Do extra optimizations? */
+#define OPTIMIZATIONS 1
+
+
+/**
+ * Generate SPE code to perform Z/depth testing.
+ *
+ * \param dsa         Gallium depth/stencil/alpha state to gen code for
+ * \param f           SPE function to append instruction onto.
+ * \param mask_reg    register containing quad/pixel "alive" mask (in/out)
+ * \param ifragZ_reg  register containing integer fragment Z values (in)
+ * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
+ * \param zmask_reg   register containing result of Z test/comparison (out)
+ */
+static void
+gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f,
+               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
+{
+   ASSERT(dsa->depth.enabled);
+
+   switch (dsa->depth.func) {
+   case PIPE_FUNC_EQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* zmask = (ifragZ > ref) */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* zmask = (ref > ifragZ) */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* zmask = (ifragZ > ref) */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* zmask = (ref > ifragZ) */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */
+      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* mask unchanged */
+      spe_il(f, zmask_reg, ~0);  /* zmask = {~0,~0,~0,~0} */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+   if (dsa->depth.writemask) {
+      /*
+       * If (ztest passed) {
+       *    framebufferZ = fragmentZ;
+       * }
+       * OR,
+       * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
+       */
+      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+   }
+}
+
+
+/**
+ * Generate SPE code to perform alpha testing.
+ *
+ * \param dsa        Gallium depth/stencil/alpha state to gen code for
+ * \param f          SPE function to append instruction onto.
+ * \param mask_reg   register containing quad/pixel "alive" mask (in/out)
+ * \param fragA_reg  register containing four fragment alpha values (in)
+ */
+static void
+gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f, int mask_reg, int fragA_reg)
+{
+   int ref_reg = spe_allocate_available_register(f);
+   int amask_reg = spe_allocate_available_register(f);
+
+   ASSERT(dsa->alpha.enabled);
+
+   if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
+       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      /* load/splat the alpha reference float value */
+      spe_load_float(f, ref_reg, dsa->alpha.ref);
+   }
+
+   /* emit code to do the alpha comparison, updating 'mask' */
+   switch (dsa->alpha.func) {
+   case PIPE_FUNC_EQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = [0,0,0,0] */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* no-op, mask unchanged */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+#if OPTIMIZATIONS
+   /* if mask == {0,0,0,0} we're all done, return */
+   {
+      /* re-use amask reg here */
+      int tmp_reg = amask_reg;
+      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
+      spe_orx(f, tmp_reg, mask_reg);
+      /* if tmp[0] == 0 then return from function call */
+      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
+   }
+#endif
+
+   spe_release_register(f, ref_reg);
+   spe_release_register(f, amask_reg);
+}
+
+
+
+/**
+ * Generate SPE code to implement the fragment operations (alpha test,
+ * depth test, stencil test, blending, colormask, and final
+ * framebuffer write) as specified by the current context state.
+ *
+ * Logically, this code will be called after running the fragment
+ * shader.  But under some circumstances we could run some of this
+ * code before the fragment shader to cull fragments/quads that are
+ * totally occluded/discarded.
+ *
+ * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
+ *
+ * See the spu_default_fragment_ops() function to see how the per-fragment
+ * operations would be done with ordinary C code.
+ * The code we generate here though has no branches, is SIMD, etc and
+ * should be much faster.
+ *
+ * \param cell  the rendering context (in)
+ * \param f     the generated function (out)
+ */
+void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+{
+   const struct pipe_depth_stencil_alpha_state *dsa =
+      &cell->depth_stencil->base;
+   const struct pipe_blend_state *blend = &cell->blend->base;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   const int x_reg = 3;  /* uint */
+   const int y_reg = 4;  /* uint */
+   const int color_tile_reg = 5;  /* tile_t * */
+   const int depth_tile_reg = 6;  /* tile_t * */
+   const int fragZ_reg = 7;   /* vector float */
+   const int fragR_reg = 8;   /* vector float */
+   const int fragG_reg = 9;   /* vector float */
+   const int fragB_reg = 10;  /* vector float */
+   const int fragA_reg = 11;  /* vector float */
+   const int mask_reg = 12;   /* vector uint */
+
+   /* offset of quad from start of tile
+    * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
+    */
+   int quad_offset_reg;
+
+   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
+   int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, x_reg);
+   spe_allocate_register(f, y_reg);
+   spe_allocate_register(f, color_tile_reg);
+   spe_allocate_register(f, depth_tile_reg);
+   spe_allocate_register(f, fragZ_reg);
+   spe_allocate_register(f, fragR_reg);
+   spe_allocate_register(f, fragG_reg);
+   spe_allocate_register(f, fragB_reg);
+   spe_allocate_register(f, fragA_reg);
+   spe_allocate_register(f, mask_reg);
+
+   quad_offset_reg = spe_allocate_available_register(f);
+   fbRGBA_reg = spe_allocate_available_register(f);
+   fbZS_reg = spe_allocate_available_register(f);
+
+   /* compute offset of quad from start of tile, in bytes */
+   {
+      int x2_reg = spe_allocate_available_register(f);
+      int y2_reg = spe_allocate_available_register(f);
+
+      ASSERT(TILE_SIZE == 32);
+
+      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
+      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
+      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
+      spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */
+      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */
+
+      spe_release_register(f, x2_reg);
+      spe_release_register(f, y2_reg);
+   }
+
+
+   if (dsa->alpha.enabled) {
+      gen_alpha_test(dsa, f, mask_reg, fragA_reg);
+   }
+
+   if (dsa->depth.enabled || dsa->stencil[0].enabled) {
+      const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
+      boolean write_depth_stencil;
+
+      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
+      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+
+      /* fetch quad of depth/stencil values from tile at (x,y) */
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      if (dsa->depth.enabled) {
+         /* Extract Z bits from fbZS_reg into fbZ_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            int mask_reg = spe_allocate_available_register(f);
+            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
+            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
+            spe_release_register(f, mask_reg);
+            /* OK, fbZ_reg has four 24-bit Z values now */
+         }
+         else {
+            /* XXX handle other z/stencil formats */
+            ASSERT(0);
+         }
+
+         /* Convert fragZ values from float[4] to uint[4] */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* 24-bit Z values */
+            int scale_reg = spe_allocate_available_register(f);
+
+            /* scale_reg[0,1,2,3] = float(2^24-1) */
+            spe_load_float(f, scale_reg, (float) 0xffffff);
+
+            /* XXX these two instructions might be combined */
+            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);  /* fragZ = (int) fragZ */
+
+            spe_release_register(f, scale_reg);
+         }
+         else {
+            /* XXX handle 16-bit Z format */
+            ASSERT(0);
+         }
+      }
+
+      if (dsa->stencil[0].enabled) {
+         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            /* XXX extract with a shift */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* XXX extract with a mask */
+            ASSERT(0);
+         }
+      }
+
+
+      if (dsa->stencil[0].enabled) {
+         /* XXX this may involve depth testing too */
+         // gen_stencil_test(dsa, f, ... );
+         ASSERT(0);
+      }
+      else if (dsa->depth.enabled) {
+         int zmask_reg = spe_allocate_available_register(f);
+         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_release_register(f, zmask_reg);
+      }
+
+      /* do we need to write Z and/or Stencil back into framebuffer? */
+      write_depth_stencil = (dsa->depth.writemask |
+                             dsa->stencil[0].write_mask |
+                             dsa->stencil[1].write_mask);
+
+      if (write_depth_stencil) {
+         /* Merge latest Z and Stencil values into fbZS_reg.
+          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
+          * fbS_reg has four 8-bit Z values in bits [7..0].
+          */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+                  zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_S8_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else {
+            /* bad zs_format */
+            ASSERT(0);
+         }
+
+         /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
+         spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+      }
+
+      spe_release_register(f, fbZ_reg);
+      spe_release_register(f, fbS_reg);
+   }
+
+
+   /* Get framebuffer quad/colors.  We'll need these for blending,
+    * color masking, and to obey the quad/pixel mask.
+    * Load: fbRGBA_reg = memory[color_tile + quad_offset]
+    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
+    * we could skip this load.
+    */
+   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
+
+
+   if (blend->blend_enable) {
+      /* convert packed tile colors in fbRGBA_reg to float[4] vectors */
+
+      // gen_blend_code(blend, f, mask_reg, ... );
+
+   }
+
+
+
+   /*
+    * Write fragment colors to framebuffer/tile.
+    * This involves converting the fragment colors from float[4] to the
+    * tile's specific format and obeying the quad/pixel mask.
+    */
+   {
+      const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
+      int rgba_reg = spe_allocate_available_register(f);
+
+      /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
+      spe_cfltu(f, fragR_reg, fragR_reg, 32);
+      spe_cfltu(f, fragG_reg, fragG_reg, 32);
+      spe_cfltu(f, fragB_reg, fragB_reg, 32);
+      spe_cfltu(f, fragA_reg, fragA_reg, 32);
+
+      /* Shift most the significant bytes to least the significant positions.
+       * I.e.: reg = reg >> 24
+       */
+      spe_rotmi(f, fragR_reg, fragR_reg, -24);
+      spe_rotmi(f, fragG_reg, fragG_reg, -24);
+      spe_rotmi(f, fragB_reg, fragB_reg, -24);
+      spe_rotmi(f, fragA_reg, fragA_reg, -24);
+
+      /* Shift the color bytes according to the surface format */
+      if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
+         spe_roti(f, fragG_reg, fragG_reg, 8);   /* green <<= 8 */
+         spe_roti(f, fragR_reg, fragR_reg, 16);  /* red <<= 16 */
+         spe_roti(f, fragA_reg, fragA_reg, 24);  /* alpha <<= 24 */
+      }
+      else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+         spe_roti(f, fragR_reg, fragR_reg, 8);   /* red <<= 8 */
+         spe_roti(f, fragG_reg, fragG_reg, 16);  /* green <<= 16 */
+         spe_roti(f, fragB_reg, fragB_reg, 24);  /* blue <<= 24 */
+      }
+      else {
+         ASSERT(0);
+      }
+
+      /* Merge red, green, blue, alpha registers to make packed RGBA colors.
+       * Eg: after shifting according to color_format we might have:
+       *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
+       *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
+       *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
+       *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
+       * OR-ing all those together gives us four packed colors:
+       *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
+       */
+      spe_or(f, rgba_reg, fragR_reg, fragG_reg);
+      spe_or(f, rgba_reg, rgba_reg, fragB_reg);
+      spe_or(f, rgba_reg, rgba_reg, fragA_reg);
+
+      /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
+       * if (mask[i])
+       *    rgba[i] = rgba[i];
+       * else
+       *    rgba[i] = framebuffer[i];
+       */
+      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
+
+      /* Store updated quad in tile:
+       * memory[color_tile + quad_offset] = rgba_reg;
+       */
+      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
+
+      spe_release_register(f, rgba_reg);
+   }
+
+   printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
+
+   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
+
+
+   spe_release_register(f, fbRGBA_reg);
+   spe_release_register(f, fbZS_reg);
+   spe_release_register(f, quad_offset_reg);
+}
+
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
new file mode 100644
index 0000000000..0ea0fc690c
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_GEN_FRAGMENT_H
+#define CELL_GEN_FRAGMENT_H
+
+
+extern void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+
+
+#endif /* CELL_GEN_FRAGMENT_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index f2feaa329a..06777aac14 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -27,6 +27,7 @@
 
 #include "util/u_memory.h"
 #include "cell_context.h"
+#include "cell_gen_fragment.h"
 #include "cell_state.h"
 #include "cell_state_emit.h"
 #include "cell_state_per_fragment.h"
@@ -83,6 +84,29 @@ cell_emit_state(struct cell_context *cell)
       fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE;
       fb->width = cell->framebuffer.width;
       fb->height = cell->framebuffer.height;
+#if 0
+      printf("EMIT color format %s\n", pf_name(fb->color_format));
+      printf("EMIT depth format %s\n", pf_name(fb->depth_format));
+#endif
+   }
+
+
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) {
+      /* XXX we don't want to always do codegen here.  We should have
+       * a hash/lookup table to cache previous results...
+       */
+      struct cell_command_fragment_ops *fops
+            = cell_batch_alloc(cell, sizeof(*fops));
+      struct spe_function spe_code;
+
+      /* generate new code */
+      gen_fragment_function(cell, &spe_code);
+      /* put the new code into the batch buffer */
+      fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      memcpy(&fops->code, spe_code.store,
+             SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      /* free codegen buffer */
+      spe_release_func(&spe_code);
    }
 
    if (cell->dirty & CELL_NEW_BLEND) {
@@ -90,8 +114,7 @@ cell_emit_state(struct cell_context *cell)
 
       if (cell->blend != NULL) {
          blend.base = (intptr_t) cell->blend->code.store;
-         blend.size = (char *) cell->blend->code.csr
-             - (char *) cell->blend->code.store;
+         blend.size = cell->blend->code.num_inst * SPE_INST_SIZE;
          blend.read_fb = TRUE;
       }
       else {
@@ -108,10 +131,10 @@ cell_emit_state(struct cell_context *cell)
 
       if (cell->depth_stencil != NULL) {
 	 dsat.base = (intptr_t) cell->depth_stencil->code.store;
-	 dsat.size = (char *) cell->depth_stencil->code.csr
-	     - (char *) cell->depth_stencil->code.store;
+	 dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE;
 	 dsat.read_depth = TRUE;
 	 dsat.read_stencil = FALSE;
+         dsat.state = cell->depth_stencil->base;
       }
       else {
 	 dsat.base = 0;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index 705867107b..78cb446c14 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -1158,7 +1158,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
 static int
 PC_OFFSET(const struct spe_function *f, const void *d)
 {
-   const intptr_t pc = (intptr_t) f->csr;
+   const intptr_t pc = (intptr_t) &f->store[f->num_inst];
    const intptr_t ea = ~0x0f & (intptr_t) d;
 
    return (ea - pc) >> 2;
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index d49abb2e82..e285ae9fdb 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -43,7 +43,7 @@ INCLUDE_DIRS = \
 	$(SPU_CC) $(SPU_CFLAGS) -c $<
 
 .c.s:
-	$(SPU_CC) $(SPU_CFLAGS) -S $<
+	$(SPU_CC) $(SPU_CFLAGS) -O3 -S $<
 
 
 # The .a file will be linked into the main/PPU executable
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index c4236817a9..4e0ec15925 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -34,6 +34,7 @@
 
 #include "spu_main.h"
 #include "spu_render.h"
+#include "spu_per_fragment_op.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
 //#include "spu_test.h"
@@ -46,7 +47,7 @@
 /*
 helpful headers:
 /usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h
-/opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
+/opt/cell/sdk/usr/include/libmisc.h
 */
 
 boolean Debug = FALSE;
@@ -226,6 +227,24 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 }
 
 
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   if (Debug)
+      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Point function pointer at new code */
+   spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code;
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -257,6 +276,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       spu.fb.zsize = 4;
       spu.fb.zscale = (float) 0x00ffffffu;
       break;
@@ -282,6 +303,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+#define NEW_FRAGMENT_FUNCTION 01
+
 static void
 cmd_state_blend(const struct cell_command_blend *state)
 {
@@ -302,7 +325,9 @@ cmd_state_blend(const struct cell_command_blend *state)
       wait_on_mask(1 << TAG_BATCH_BUFFER);
       spu.blend = (blend_func) fb_blend_code_buffer;
       spu.read_fb = state->read_fb;
-   } else {
+   }
+   else
+   {
       spu.read_fb = FALSE;
    }
 }
@@ -326,7 +351,9 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
 	      0, /* tid */
 	      0  /* rid */);
       wait_on_mask(1 << TAG_BATCH_BUFFER);
-   } else {
+   }
+   else
+   {
       /* If there is no code, emit a return instruction.
        */
       depth_stencil_code_buffer[0] = 0x35;
@@ -338,12 +365,14 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
    spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
    spu.read_depth = state->read_depth;
    spu.read_stencil = state->read_stencil;
+   spu.depth_stencil_alpha = state->state;
 }
 
 
 static void
 cmd_state_logicop(const struct cell_command_logicop * code)
 {
+#if !NEW_FRAGMENT_FUNCTION
    mfc_get(logicop_code_buffer,
            (unsigned int) code->base,  /* src */
            code->size,
@@ -353,6 +382,7 @@ cmd_state_logicop(const struct cell_command_logicop * code)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    spu.logicop = (logicop_func) logicop_code_buffer;
+#endif
 }
 
 
@@ -455,7 +485,9 @@ cmd_finish(void)
 
 
 /**
- * Execute a batch of commands
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
  * The opcode param encodes the location of the buffer and its size.
  */
 static void
@@ -519,6 +551,14 @@ cmd_batch(uint opcode)
             pos += pos_incr;
          }
          break;
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            pos += sizeof(*fops) / 8;
+         }
+         break;
       case CELL_CMD_RELEASE_VERTS:
          {
             struct cell_command_release_verts *release
@@ -680,6 +720,11 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
+
+   /* Install default/fallback fragment processing function.
+    * This will normally be overriden by a code-gen'd function.
+    */
+   spu.fragment_ops.func = spu_fallback_fragment_ops;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index c2a53c9dcf..7ab34f5222 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -91,6 +91,24 @@ typedef struct spu_blend_results (*logicop_func)(
 
 typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
 
+
+typedef void (*spu_fragment_ops_func)(uint x, uint y,
+                                      tile_t *colorTile,
+                                      tile_t *depthStencilTile,
+                                      vector float fragZ,
+                                      vector float fragRed,
+                                      vector float fragGreen,
+                                      vector float fragBlue,
+                                      vector float fragAlpha,
+                                      vector unsigned int mask);
+
+struct spu_fragment_ops
+{
+   uint code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   spu_fragment_ops_func func;  /**< Current fragment ops function */
+} ALIGN16_ATTRIB;
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -127,6 +145,9 @@ struct spu_global
    struct cell_init_info init;
 
    struct spu_framebuffer fb;
+
+   struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+
    boolean read_depth;
    boolean read_stencil;
    frag_test_func frag_test;  /**< Current depth/stencil test code */
@@ -142,6 +163,8 @@ struct spu_global
 
    struct vertex_info vertex_info;
 
+   struct spu_fragment_ops fragment_ops;
+
    /* XXX more state to come */
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 29dc07a2e8..ffc596aa62 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -29,8 +29,11 @@
  * \author Ian Romanick <idr@us.ibm.com>
  */
 
+
+#include <transpose_matrix4x4.h>
 #include "pipe/p_format.h"
 #include "spu_main.h"
+#include "spu_colorpack.h"
 #include "spu_per_fragment_op.h"
 
 #define ZERO 0x80
@@ -90,7 +93,8 @@ read_ds_quad(tile_t *tile, unsigned x, unsigned y,
       break;
    }
 
-   case PIPE_FORMAT_S8Z24_UNORM: {
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM: {
       qword *ptr = (qword *) &tile->ui4[iy][ix];
 
       *depth = si_and(*ptr, si_fsmbi(0x7777));
@@ -153,7 +157,8 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-   case PIPE_FORMAT_S8Z24_UNORM: {
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       /* form select mask = 0111,0111,0111,0111 */
       qword mask = si_fsmbi(0x7777);
@@ -217,3 +222,225 @@ spu_do_depth_stencil(int x, int y,
 
    return result.mask;
 }
+
+
+
+
+/**
+ * Called by rasterizer for each quad after the shader has run.  This
+ * is a fallback/debug function.  In reality we'll use a generated
+ * function produced by the PPU.  But this function is useful for
+ * debug/validation.
+ */
+void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask)
+{
+   vector float frag_soa[4], frag_aos[4];
+   unsigned int c0, c1, c2, c3;
+
+   /* do alpha test */
+   if (spu.depth_stencil_alpha.alpha.enabled) {
+      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
+      vector unsigned int amask;
+
+      switch (spu.depth_stencil_alpha.alpha.func) {
+      case PIPE_FUNC_LESS:
+         amask = spu_cmpgt(ref, fragAlpha);  /* mask = (fragAlpha < ref) */
+         break;
+      case PIPE_FUNC_GREATER:
+         amask = spu_cmpgt(fragAlpha, ref);  /* mask = (fragAlpha > ref) */
+         break;
+      case PIPE_FUNC_GEQUAL:
+         amask = spu_cmpgt(ref, fragAlpha);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_LEQUAL:
+         amask = spu_cmpgt(fragAlpha, ref);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_EQUAL:
+         amask = spu_cmpeq(ref, fragAlpha);
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         amask = spu_cmpeq(ref, fragAlpha);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_ALWAYS:
+         amask = spu_splats(0xffffffffU);
+         break;
+      case PIPE_FUNC_NEVER:
+         amask = spu_splats( 0x0U);
+         break;
+      default:
+         ;
+      }
+
+      mask = spu_and(mask, amask);
+   }
+
+   /* Z and/or stencil testing... */
+   if (spu.depth_stencil_alpha.depth.enabled ||
+       spu.depth_stencil_alpha.stencil[0].enabled) {
+
+      /* get four Z/Stencil values from tile */
+      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU);
+      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2];
+      vector unsigned int ifbZ = spu_and(ifbZS, mask24);
+      vector unsigned int ifbS = spu_andc(ifbZS, mask24);
+
+      if (spu.depth_stencil_alpha.stencil[0].enabled) {
+         /* do stencil test */
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM);
+
+      }
+      else if (spu.depth_stencil_alpha.depth.enabled) {
+         /* do depth test */
+
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM ||
+                spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM);
+
+         vector unsigned int ifragZ;
+         vector unsigned int zmask;
+
+         /* convert four fragZ from float to uint */
+         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff));
+         ifragZ = spu_convtu(fragZ, 0);
+
+         /* do depth comparison, setting zmask with results */
+         switch (spu.depth_stencil_alpha.depth.func) {
+         case PIPE_FUNC_LESS:
+            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */
+            break;
+         case PIPE_FUNC_GREATER:
+            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifbZ > ifragZ) */
+            break;
+         case PIPE_FUNC_GEQUAL:
+            zmask = spu_cmpgt(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_LEQUAL:
+            zmask = spu_cmpgt(ifragZ, ifbZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_EQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            break;
+         case PIPE_FUNC_NOTEQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_ALWAYS:
+            zmask = spu_splats(0xffffffffU);
+            break;
+         case PIPE_FUNC_NEVER:
+            zmask = spu_splats( 0x0U);
+            break;
+         default:
+            ;
+         }
+
+         mask = spu_and(mask, zmask);
+
+         /* merge framebuffer Z and fragment Z according to the mask */
+         ifbZ = spu_or(spu_and(ifragZ, mask),
+                       spu_andc(ifbZ, mask));
+      }
+
+      if (spu_extract(spu_orx(mask), 0)) {
+         /* put new fragment Z/Stencil values back into Z/Stencil tile */
+         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS);
+
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+   }
+
+   /* XXX do blending here */
+
+   /* XXX do colormask test here */
+
+
+   if (spu_extract(spu_orx(mask), 0)) {
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   else {
+      return;
+   }
+
+   /* convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA */
+#if 0
+   {
+      vector float frag_soa[4];
+      frag_soa[0] = fragRed;
+      frag_soa[1] = fragGreen;
+      frag_soa[2] = fragBlue;
+      frag_soa[3] = fragAlpha;
+      _transpose_matrix4x4(frag_aos, frag_soa);
+   }
+#else
+   /* short-cut relying on function parameter layout: */
+   _transpose_matrix4x4(frag_aos, &fragRed);
+   (void) fragGreen;
+   (void) fragBlue;
+#endif
+
+   switch (spu.fb.color_format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      c0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      c1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      c2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+      break;
+
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      c0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      c1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      c2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+      break;
+   default:
+      fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
+      ASSERT(0);
+   }
+
+#if 0
+   /*
+    * Quad layout:
+    *  +--+--+
+    *  |p0|p1|
+    *  +--+--+
+    *  |p2|p3|
+    *  +--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y+0][x+0] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y+0][x+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y+1][x+0] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y+1][x+1] = c3;   
+#else
+   /*
+    * Quad layout:
+    *  +--+--+--+--+
+    *  |p0|p1|p2|p3|
+    *  +--+--+--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y][x*2] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y][x*2+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y][x*2+2] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y][x*2+3] = c3;   
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index 6571258699..ffadf0661c 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -29,4 +29,15 @@ extern qword
 spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth,
 		     qword frag_alpha, qword facing);
 
+extern void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask);
+
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a3ea0a3e69..71ef6ca24f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -297,9 +297,12 @@ emit_quad( int x, int y, mask_t mask )
    sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
 
+#define NEW_FRAGMENT_FUNCTION 01
+#if !NEW_FRAGMENT_FUNCTION
    if (spu.read_depth) {
       mask = do_depth_test(x, y, mask);
    }
+#endif
 
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -308,6 +311,7 @@ emit_quad( int x, int y, mask_t mask )
       vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture[0].start) {
          /* texture mapping */
@@ -355,6 +359,29 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
+#if NEW_FRAGMENT_FUNCTION
+      {
+         /* Convert fragment data from AoS to SoA format.
+          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+          * This is temporary!
+          */
+         vector float soa_frag[4];
+         _transpose_matrix4x4(soa_frag, colors);
+
+         float4 fragZ;
+
+         fragZ.v = eval_z((float) x, (float) y);
+
+         /* Do all per-fragment/quad operations here, including:
+          *  alpha test, z test, stencil test, blend and framebuffer writing.
+          */
+         spu.fragment_ops.func(ix, iy, &spu.ctile, &spu.ztile,
+                               fragZ.v,
+                               soa_frag[0], soa_frag[1],
+                               soa_frag[2], soa_frag[3],
+                               mask);
+      }
+#else
       /* Convert fragment data from AoS to SoA format.
        * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
        */
@@ -405,6 +432,9 @@ emit_quad( int x, int y, mask_t mask )
       spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
       spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
       spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
+
+#endif /* NEW_FRAGMENT_FUNCTION */
+
    }
 #endif
 }
diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c
index b010513107..28bd6ceab4 100644
--- a/src/gallium/winsys/xlib/xm_api.c
+++ b/src/gallium/winsys/xlib/xm_api.c
@@ -349,12 +349,17 @@ create_xmesa_buffer(XMesaDrawable d, BufferType type,
 
    if (vis->mesa_visual.depthBits == 0)
       depthFormat = PIPE_FORMAT_NONE;
+#ifdef GALLIUM_CELL /* XXX temporary for Cell! */
+   else
+      depthFormat = PIPE_FORMAT_S8Z24_UNORM;
+#else
    else if (vis->mesa_visual.depthBits <= 16)
-      depthFormat = PIPE_FORMAT_Z16_UNORM;
+      depthFormat = PIPE_FORMAT_Z16UNORM;
    else if (vis->mesa_visual.depthBits <= 24)
       depthFormat = PIPE_FORMAT_S8Z24_UNORM;
    else
       depthFormat = PIPE_FORMAT_Z32_UNORM;
+#endif
 
    if (vis->mesa_visual.stencilBits == 8) {
       if (depthFormat == PIPE_FORMAT_S8Z24_UNORM)
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 5e9a1f92f1..c4a30d3702 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -275,6 +275,39 @@ xm_buffer_destroy(struct pipe_winsys *pws,
 }
 
 
+/**
+ * For Cell.  Basically, rearrange the pixels/quads from this layout:
+ *  +--+--+--+--+
+ *  |p0|p1|p2|p3|....
+ *  +--+--+--+--+
+ *
+ * to this layout:
+ *  +--+--+
+ *  |p0|p1|....
+ *  +--+--+
+ *  |p2|p3|
+ *  +--+--+
+ */
+static void
+twiddle_tile(uint *tile)
+{
+   uint tile2[TILE_SIZE * TILE_SIZE];
+   int y, x;
+
+   for (y = 0; y < TILE_SIZE; y+=2) {
+      for (x = 0; x < TILE_SIZE; x+=2) {
+         int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
+         tile2[y * TILE_SIZE + (x + 0)] = tile[k];
+         tile2[y * TILE_SIZE + (x + 1)] = tile[k+1];
+         tile2[(y + 1) * TILE_SIZE + (x + 0)] = tile[k+2];
+         tile2[(y + 1) * TILE_SIZE + (x + 1)] = tile[k+3];
+      }
+   }
+   memcpy(tile, tile2, sizeof(tile2));
+}
+
+
+
 /**
  * Display a surface that's in a tiled configuration.  That is, all the
  * pixels for a TILE_SIZExTILE_SIZE block are contiguous in memory.
@@ -321,6 +354,8 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
 
          ximage->data = (char *) xm_buf->data + offset;
 
+         twiddle_tile((uint *) ximage->data);
+
          if (XSHM_ENABLED(xm_buf)) {
 #if defined(USE_XSHM) && !defined(XFree86Server)
             XShmPutImage(b->xm_visual->display, b->drawable, b->gc,
-- 
cgit v1.2.3


From 701fcee65db6b72f98e926d838956bbcc54f1cc6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 10 Sep 2008 18:51:43 -0600
Subject: cell: remove old per-fragment code, replace with all new code

---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 236 +++------------------
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |  47 ++--
 src/gallium/drivers/cell/spu/spu_tri.c             |  96 ---------
 3 files changed, 48 insertions(+), 331 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index ffc596aa62..9ed5fc50cd 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -1,32 +1,32 @@
-/*
- * (C) Copyright IBM Corporation 2008
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 /**
- * \file spu_per_fragment_op.c
- * SPU implementation various per-fragment operations.
- *
- * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
 
@@ -36,194 +36,6 @@
 #include "spu_colorpack.h"
 #include "spu_per_fragment_op.h"
 
-#define ZERO 0x80
-
-
-/**
- * Get a "quad" of four fragment Z/stencil values from the given tile.
- * \param tile  the tile of Z/stencil values
- * \param x, y  location of the quad in the tile, in pixels
- * \param depth_format  format of the tile's data
- * \param detph  returns four depth values
- * \param stencil  returns four stencil values
- */
-static void
-read_ds_quad(tile_t *tile, unsigned x, unsigned y,
-             enum pipe_format depth_format, qword *depth,
-             qword *stencil)
-{
-   const int ix = x / 2;
-   const int iy = y / 2;
-
-   switch (depth_format) {
-   case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &tile->us8[iy][ix / 2];
-
-      const qword shuf_vec = (qword) {
-         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
-         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7
-      };
-
-      /* At even X values we want the first 4 shorts, and at odd X values we
-       * want the second 4 shorts.
-       */
-      qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3));
-      qword bias_mask = si_fsmbi(0x3333);
-      qword sv = si_a(shuf_vec, si_and(bias_mask, bias));
-
-      *depth = si_shufb(*ptr, *ptr, sv);
-      *stencil = si_il(0);
-      break;
-   }
-
-   case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &tile->ui4[iy][ix];
-
-      *depth = *ptr;
-      *stencil = si_il(0);
-      break;
-   }
-
-   case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &tile->ui4[iy][ix];
-      qword mask = si_fsmbi(0xEEEE);
-
-      *depth = si_rotmai(si_and(*ptr, mask), -8);
-      *stencil = si_andc(*ptr, mask);
-      break;
-   }
-
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM: {
-      qword *ptr = (qword *) &tile->ui4[iy][ix];
-
-      *depth = si_and(*ptr, si_fsmbi(0x7777));
-      *stencil = si_andi(si_roti(*ptr, 8), 0x0ff);
-      break;
-   }
-
-   default:
-      ASSERT(0);
-      break;
-   }
-}
-
-
-/**
- * Put a quad of Z/stencil values into a tile.
- * \param tile  the tile of Z/stencil values to write into
- * \param x, y  location of the quad in the tile, in pixels
- * \param depth_format  format of the tile's data
- * \param detph  depth values to store
- * \param stencil  stencil values to store
- */
-static void
-write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
-              enum pipe_format depth_format,
-              qword depth, qword stencil)
-{
-   const int ix = x / 2;
-   const int iy = y / 2;
-
-   (void) stencil;
-
-   switch (depth_format) {
-   case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
-
-      qword sv = ((ix & 0x01) == 0) 
-          ? (qword) { 2, 3, 6, 7, 10, 11, 14, 15,
-                      24, 25, 26, 27, 28, 29, 30, 31 }
-          : (qword) { 16, 17, 18, 19, 20 , 21, 22, 23,
-                      2, 3, 6, 7, 10, 11, 14, 15 };
-      *ptr = si_shufb(depth, *ptr, sv);
-      break;
-   }
-
-   case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      *ptr = depth;
-      break;
-   }
-
-   case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      /* form select mask = 1110,1110,1110,1110 */
-      qword mask = si_fsmbi(0xEEEE);
-      /* depth[i] = depth[i] << 8 */
-      depth = si_shli(depth, 8);
-      /* *ptr[i] = depth[i][31:8] | stencil[i][7:0] */
-      *ptr = si_selb(stencil, depth, mask);
-      break;
-   }
-
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      /* form select mask = 0111,0111,0111,0111 */
-      qword mask = si_fsmbi(0x7777);
-      /* stencil[i] = stencil[i] << 24 */
-      stencil = si_shli(stencil, 24);
-      /* *ptr[i] = stencil[i][31:24] | depth[i][23:0] */
-      *ptr = si_selb(stencil, depth, mask);
-      break;
-   }
-
-   default:
-      ASSERT(0);
-      break;
-   }
-}
-
-
-/**
- * Do depth/stencil/alpha test for a "quad" of 4 fragments.
- * \param x,y  location of quad within tile
- * \param frag_mask  indicates which fragments are "alive"
- * \param frag_depth  four fragment depth values
- * \param frag_alpha  four fragment alpha values
- * \param facing  front/back facing for four fragments (1=front, 0=back)
- */
-qword
-spu_do_depth_stencil(int x, int y,
-                     qword frag_mask, qword frag_depth, qword frag_alpha,
-                     qword facing)
-{
-   struct spu_frag_test_results  result;
-   qword pixel_depth;
-   qword pixel_stencil;
-
-   /* All of this preable code (everthing before the call to frag_test) should
-    * be generated on the PPU and upload to the SPU.
-    */
-   if (spu.read_depth || spu.read_stencil) {
-      read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
-                   &pixel_depth, &pixel_stencil);
-   }
-
-   /* convert floating point Z values to 32-bit uint */
-
-   /* frag_depth *= spu.fb.zscale */
-   frag_depth = si_fm(frag_depth, (qword)spu_splats(spu.fb.zscale));
-   /* frag_depth = uint(frag_depth) */
-   frag_depth = si_cfltu(frag_depth, 0);
-
-   result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
-                             frag_depth, frag_alpha, facing);
-
-
-   /* This code (everthing after the call to frag_test) should
-    * be generated on the PPU and upload to the SPU.
-    */
-   if (spu.read_depth || spu.read_stencil) {
-      write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
-                    result.depth, result.stencil);
-   }
-
-   return result.mask;
-}
-
-
 
 
 /**
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index ffadf0661c..f817abf046 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -1,33 +1,33 @@
-/*
- * (C) Copyright IBM Corporation 2008
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 #ifndef SPU_PER_FRAGMENT_OP
 #define SPU_PER_FRAGMENT_OP
 
-extern qword
-spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth,
-		     qword frag_alpha, qword facing);
 
 extern void
 spu_fallback_fragment_ops(uint x, uint y,
@@ -40,4 +40,5 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragAlpha,
                           vector unsigned int mask);
 
+
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 71ef6ca24f..a5bf3270c7 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -38,7 +38,6 @@
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
-#include "spu_per_fragment_op.h"
 
 
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
@@ -255,31 +254,6 @@ eval_z(float x, float y)
 }
 
 
-static INLINE mask_t
-do_depth_test(int x, int y, mask_t quadmask)
-{
-   float4 zvals;
-   mask_t mask;
-
-   if (spu.fb.depth_format == PIPE_FORMAT_NONE)
-      return quadmask;
-
-   zvals.v = eval_z((float) x, (float) y);
-
-   mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx,
-					y - setup.cliprect_miny,
-					(qword) quadmask, 
-					(qword) zvals.v,
-					(qword) spu_splats((unsigned char) 0x0ffu),
-					(qword) spu_splats((unsigned int) 0x01u));
-
-   if (spu_extract(spu_orx(mask), 0))
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
-
-   return mask;
-}
-
-
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -289,21 +263,6 @@ do_depth_test(int x, int y, mask_t quadmask)
 static INLINE void
 emit_quad( int x, int y, mask_t mask )
 {
-#if 0
-   struct softpipe_context *sp = setup.softpipe;
-   setup.quad.x0 = x;
-   setup.quad.y0 = y;
-   setup.quad.mask = mask;
-   sp->quad.first->run(sp->quad.first, &setup.quad);
-#else
-
-#define NEW_FRAGMENT_FUNCTION 01
-#if !NEW_FRAGMENT_FUNCTION
-   if (spu.read_depth) {
-      mask = do_depth_test(x, y, mask);
-   }
-#endif
-
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
@@ -359,7 +318,6 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
-#if NEW_FRAGMENT_FUNCTION
       {
          /* Convert fragment data from AoS to SoA format.
           * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
@@ -381,62 +339,8 @@ emit_quad( int x, int y, mask_t mask )
                                soa_frag[2], soa_frag[3],
                                mask);
       }
-#else
-      /* Convert fragment data from AoS to SoA format.
-       * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-       */
-      qword soa_frag[4];
-      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
-
-      /* Read the current framebuffer values.
-       */
-      const qword pix[4] = {
-         (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]),
-         (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]),
-         (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]),
-         (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]),
-      };
-
-      qword soa_pix[4];
-
-      if (spu.read_fb) {
-         /* Convert pixel data from AoS to SoA format.
-          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-          */
-         vec_float4 aos_pix[4] = {
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
-         };
-
-         _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
-      }
-
-
-      struct spu_blend_results result =
-          (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
-                       soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
-                       spu.const_blend_color[0], spu.const_blend_color[1],
-                       spu.const_blend_color[2], spu.const_blend_color[3]);
-
-
-      /* Convert final pixel data from SoA to AoS format.
-       * I.e. (RRRR,GGGG,BBBB,AAAA) -> (RGBA,RGBA,RGBA,RGBA)
-       */
-      result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
-                              result.r, result.g, result.b, result.a,
-                              (qword) mask);
-
-      spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0);
-      spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
-      spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
-      spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
-
-#endif /* NEW_FRAGMENT_FUNCTION */
 
    }
-#endif
 }
 
 
-- 
cgit v1.2.3


From 5336e758a483d15d579ffe7cad536be95637d904 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 08:44:54 -0600
Subject: cell: added cast in spu_splats() call

---
 src/gallium/drivers/cell/spu/spu_texture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 5051774f00..117b8a36f8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -97,7 +97,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
    const qword offset_y = si_andi((qword) y, 0x1f);
 
    const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
-   const qword tile_size = (qword) spu_splats(sizeof(tile_t));
+   const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
    tile_offset = si_mpy((qword) tile_offset, tile_size);
-- 
cgit v1.2.3


From 6092a057042c9f7a4cae0f0eb9e95307f5f850a1 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 09:55:39 -0600
Subject: cell: fix shuffle in spu_unpack_B8G8R8A8()

---
 src/gallium/drivers/cell/spu/spu_colorpack.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index e9fee8a3a6..fd8dc6ded3 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -79,14 +79,14 @@ spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
 
 
 static INLINE vector float
-spu_unpack_color(uint color)
+spu_unpack_B8G8R8A8(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             0, 0, 0, 0,
-                             5, 5, 5, 5,
                              10, 10, 10, 10,
+                             5, 5, 5, 5,
+                             0, 0, 0, 0,
                              15, 15, 15, 15}) );
    return spu_convtf(color_u4, 32);
 }
-- 
cgit v1.2.3


From add86031db757b0e3abe48bd8fdea40d4e380e05 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:08:06 -0600
Subject: cell: begin new blending code (both codegen and fallback paths)

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 420 ++++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 232 ++++++++++--
 2 files changed, 584 insertions(+), 68 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index df29476be6..7966c0916c 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -231,6 +231,370 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
 
 
+/**
+ * Generate SPE code to implement the given blend mode for a quad of pixels.
+ * \param f          SPE function to append instruction onto.
+ * \param fragR_reg  register with fragment red values (float) (in/out)
+ * \param fragG_reg  register with fragment green values (float) (in/out)
+ * \param fragB_reg  register with fragment blue values (float) (in/out)
+ * \param fragA_reg  register with fragment alpha values (float) (in/out)
+ * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
+ */
+static void
+gen_blend(const struct pipe_blend_state *blend,
+          struct spe_function *f,
+          enum pipe_format color_format,
+          int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
+          int fbRGBA_reg)
+{
+   int term1R_reg = spe_allocate_available_register(f);
+   int term1G_reg = spe_allocate_available_register(f);
+   int term1B_reg = spe_allocate_available_register(f);
+   int term1A_reg = spe_allocate_available_register(f);
+
+   int term2R_reg = spe_allocate_available_register(f);
+   int term2G_reg = spe_allocate_available_register(f);
+   int term2B_reg = spe_allocate_available_register(f);
+   int term2A_reg = spe_allocate_available_register(f);
+
+   int fbR_reg = spe_allocate_available_register(f);
+   int fbG_reg = spe_allocate_available_register(f);
+   int fbB_reg = spe_allocate_available_register(f);
+   int fbA_reg = spe_allocate_available_register(f);
+
+   int one_reg = spe_allocate_available_register(f);
+   int tmp_reg = spe_allocate_available_register(f);
+
+   ASSERT(blend->blend_enable);
+
+   /* Unpack/convert framebuffer colors from four 32-bit packed colors
+    * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
+    * Each 8-bit color component is expanded into a float in [0.0, 1.0].
+    */
+   {
+      int mask_reg = spe_allocate_available_register(f);
+
+      /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
+      spe_fsmbi(f, mask_reg, 0x1111);
+
+      /* XXX there may be more clever ways to implement the following code */
+      switch (color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         /* fbB = fbB & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 8 */
+         spe_roti(f, fbB_reg, fbB_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 16 */
+         spe_roti(f, fbB_reg, fbB_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbA = fbRGBA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* fbA = fbA >> 24 */
+         spe_roti(f, fbA_reg, fbA_reg, -24);
+         break;
+
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         /* fbA = fbA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 8 */
+         spe_roti(f, fbR_reg, fbR_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 16 */
+         spe_roti(f, fbG_reg, fbG_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbB = fbRGBA & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* fbB = fbB >> 24 */
+         spe_roti(f, fbB_reg, fbB_reg, -24);
+         break;
+
+      default:
+         ASSERT(0);
+      }
+
+      /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
+      spe_cuflt(f, fbR_reg, fbR_reg, 8);
+      spe_cuflt(f, fbG_reg, fbG_reg, 8);
+      spe_cuflt(f, fbB_reg, fbB_reg, 8);
+      spe_cuflt(f, fbA_reg, fbA_reg, 8);
+
+      spe_release_register(f, mask_reg);
+   }
+
+
+   /*
+    * Compute Src RGB terms
+    */
+   switch (blend->rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term1R_reg, fragR_reg);
+      spe_move(f, term1G_reg, fragG_reg);
+      spe_move(f, term1B_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term1R_reg);
+      spe_zero(f, term1G_reg);
+      spe_zero(f, term1B_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Src Alpha term
+    */
+   switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term1A_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest RGB terms
+    */
+   switch (blend->rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term2R_reg, fbR_reg);
+      spe_move(f, term2G_reg, fbG_reg);
+      spe_move(f, term2B_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term2R_reg);
+      spe_zero(f, term2G_reg);
+      spe_zero(f, term2B_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(f, one_reg, 1.0f);
+      /* tmp = one - fragA */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* term = fb * tmp */
+      spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
+      spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
+      spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest Alpha term
+    */
+   switch (blend->alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term2A_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term2A_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(f, one_reg, 1.0f);
+      /* tmp = one - fragA */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* termA = fbA * tmp */
+      spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest RGB terms
+    */
+   switch (blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest A term
+    */
+   switch (blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   spe_release_register(f, term1R_reg);
+   spe_release_register(f, term1G_reg);
+   spe_release_register(f, term1B_reg);
+   spe_release_register(f, term1A_reg);
+
+   spe_release_register(f, term2R_reg);
+   spe_release_register(f, term2G_reg);
+   spe_release_register(f, term2B_reg);
+   spe_release_register(f, term2A_reg);
+
+   spe_release_register(f, fbR_reg);
+   spe_release_register(f, fbG_reg);
+   spe_release_register(f, fbB_reg);
+   spe_release_register(f, fbA_reg);
+
+   spe_release_register(f, one_reg);
+   spe_release_register(f, tmp_reg);
+}
+
+
+static void
+gen_logicop(const struct pipe_blend_state *blend,
+            struct spe_function *f,
+            int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* XXX to-do */
+   /* operate on 32-bit packed pixels, not float colors */
+}
+
+
+static void
+gen_colormask(uint colormask,
+              struct spe_function *f,
+              int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* XXX to-do */
+   /* operate on 32-bit packed pixels, not float colors */
+}
+
+
+
+/**
+ * Generate code to pack a quad of float colors into a four 32-bit integers.
+ *
+ * \param f             SPE function to append instruction onto.
+ * \param color_format  the dest color packing format
+ * \param r_reg         register containing four red values (in/clobbered)
+ * \param g_reg         register containing four green values (in/clobbered)
+ * \param b_reg         register containing four blue values (in/clobbered)
+ * \param a_reg         register containing four alpha values (in/clobbered)
+ * \param rgba_reg      register to store the packed RGBA colors (out)
+ */
+static void
+gen_pack_colors(struct spe_function *f,
+                enum pipe_format color_format,
+                int r_reg, int g_reg, int b_reg, int a_reg,
+                int rgba_reg)
+{
+   /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
+   spe_cfltu(f, r_reg, r_reg, 32);
+   spe_cfltu(f, g_reg, g_reg, 32);
+   spe_cfltu(f, b_reg, b_reg, 32);
+   spe_cfltu(f, a_reg, a_reg, 32);
+
+   /* Shift the most significant bytes to least the significant positions.
+    * I.e.: reg = reg >> 24
+    */
+   spe_rotmi(f, r_reg, r_reg, -24);
+   spe_rotmi(f, g_reg, g_reg, -24);
+   spe_rotmi(f, b_reg, b_reg, -24);
+   spe_rotmi(f, a_reg, a_reg, -24);
+
+   /* Shift the color bytes according to the surface format */
+   if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
+      spe_roti(f, g_reg, g_reg, 8);   /* green <<= 8 */
+      spe_roti(f, r_reg, r_reg, 16);  /* red <<= 16 */
+      spe_roti(f, a_reg, a_reg, 24);  /* alpha <<= 24 */
+   }
+   else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+      spe_roti(f, r_reg, r_reg, 8);   /* red <<= 8 */
+      spe_roti(f, g_reg, g_reg, 16);  /* green <<= 16 */
+      spe_roti(f, b_reg, b_reg, 24);  /* blue <<= 24 */
+   }
+   else {
+      ASSERT(0);
+   }
+
+   /* Merge red, green, blue, alpha registers to make packed RGBA colors.
+    * Eg: after shifting according to color_format we might have:
+    *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
+    *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
+    *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
+    *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
+    * OR-ing all those together gives us four packed colors:
+    *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
+    */
+   spe_or(f, rgba_reg, r_reg, g_reg);
+   spe_or(f, rgba_reg, rgba_reg, b_reg);
+   spe_or(f, rgba_reg, rgba_reg, a_reg);
+}
+
+
+
+
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
@@ -257,6 +621,7 @@ gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const struct pipe_depth_stencil_alpha_state *dsa =
       &cell->depth_stencil->base;
    const struct pipe_blend_state *blend = &cell->blend->base;
+   const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
 
    /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
    const int x_reg = 3;  /* uint */
@@ -443,64 +808,31 @@ gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
 
    if (blend->blend_enable) {
-      /* convert packed tile colors in fbRGBA_reg to float[4] vectors */
-
-      // gen_blend_code(blend, f, mask_reg, ... );
-
+      gen_blend(blend, f, color_format,
+                fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
 
-
-
    /*
     * Write fragment colors to framebuffer/tile.
     * This involves converting the fragment colors from float[4] to the
     * tile's specific format and obeying the quad/pixel mask.
     */
    {
-      const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
       int rgba_reg = spe_allocate_available_register(f);
 
-      /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
-      spe_cfltu(f, fragR_reg, fragR_reg, 32);
-      spe_cfltu(f, fragG_reg, fragG_reg, 32);
-      spe_cfltu(f, fragB_reg, fragB_reg, 32);
-      spe_cfltu(f, fragA_reg, fragA_reg, 32);
+      /* Pack four float colors as four 32-bit int colors */
+      gen_pack_colors(f, color_format,
+                      fragR_reg, fragG_reg, fragB_reg, fragA_reg,
+                      rgba_reg);
 
-      /* Shift most the significant bytes to least the significant positions.
-       * I.e.: reg = reg >> 24
-       */
-      spe_rotmi(f, fragR_reg, fragR_reg, -24);
-      spe_rotmi(f, fragG_reg, fragG_reg, -24);
-      spe_rotmi(f, fragB_reg, fragB_reg, -24);
-      spe_rotmi(f, fragA_reg, fragA_reg, -24);
-
-      /* Shift the color bytes according to the surface format */
-      if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
-         spe_roti(f, fragG_reg, fragG_reg, 8);   /* green <<= 8 */
-         spe_roti(f, fragR_reg, fragR_reg, 16);  /* red <<= 16 */
-         spe_roti(f, fragA_reg, fragA_reg, 24);  /* alpha <<= 24 */
-      }
-      else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
-         spe_roti(f, fragR_reg, fragR_reg, 8);   /* red <<= 8 */
-         spe_roti(f, fragG_reg, fragG_reg, 16);  /* green <<= 16 */
-         spe_roti(f, fragB_reg, fragB_reg, 24);  /* blue <<= 24 */
+      if (blend->logicop_enable) {
+         gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
-      else {
-         ASSERT(0);
+
+      if (blend->colormask != 0xf) {
+         gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
       }
 
-      /* Merge red, green, blue, alpha registers to make packed RGBA colors.
-       * Eg: after shifting according to color_format we might have:
-       *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
-       *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
-       *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
-       *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
-       * OR-ing all those together gives us four packed colors:
-       *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
-       */
-      spe_or(f, rgba_reg, fragR_reg, fragG_reg);
-      spe_or(f, rgba_reg, rgba_reg, fragB_reg);
-      spe_or(f, rgba_reg, rgba_reg, fragA_reg);
 
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 9ed5fc50cd..3f0eabaa05 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -39,9 +39,11 @@
 
 
 /**
- * Called by rasterizer for each quad after the shader has run.  This
- * is a fallback/debug function.  In reality we'll use a generated
- * function produced by the PPU.  But this function is useful for
+ * Called by rasterizer for each quad after the shader has run.  Do
+ * all the per-fragment operations including alpha test, z test,
+ * stencil test, blend, colormask and logicops.  This is a
+ * fallback/debug function.  In reality we'll use a generated function
+ * produced by the PPU.  But this function is useful for
  * debug/validation.
  */
 void
@@ -49,13 +51,13 @@ spu_fallback_fragment_ops(uint x, uint y,
                           tile_t *colorTile,
                           tile_t *depthStencilTile,
                           vector float fragZ,
-                          vector float fragRed,
-                          vector float fragGreen,
-                          vector float fragBlue,
-                          vector float fragAlpha,
+                          vector float fragR,
+                          vector float fragG,
+                          vector float fragB,
+                          vector float fragA,
                           vector unsigned int mask)
 {
-   vector float frag_soa[4], frag_aos[4];
+   vector float frag_aos[4];
    unsigned int c0, c1, c2, c3;
 
    /* do alpha test */
@@ -65,24 +67,24 @@ spu_fallback_fragment_ops(uint x, uint y,
 
       switch (spu.depth_stencil_alpha.alpha.func) {
       case PIPE_FUNC_LESS:
-         amask = spu_cmpgt(ref, fragAlpha);  /* mask = (fragAlpha < ref) */
+         amask = spu_cmpgt(ref, fragA);  /* mask = (fragA < ref) */
          break;
       case PIPE_FUNC_GREATER:
-         amask = spu_cmpgt(fragAlpha, ref);  /* mask = (fragAlpha > ref) */
+         amask = spu_cmpgt(fragA, ref);  /* mask = (fragA > ref) */
          break;
       case PIPE_FUNC_GEQUAL:
-         amask = spu_cmpgt(ref, fragAlpha);
+         amask = spu_cmpgt(ref, fragA);
          amask = spu_nor(amask, amask);
          break;
       case PIPE_FUNC_LEQUAL:
-         amask = spu_cmpgt(fragAlpha, ref);
+         amask = spu_cmpgt(fragA, ref);
          amask = spu_nor(amask, amask);
          break;
       case PIPE_FUNC_EQUAL:
-         amask = spu_cmpeq(ref, fragAlpha);
+         amask = spu_cmpeq(ref, fragA);
          break;
       case PIPE_FUNC_NOTEQUAL:
-         amask = spu_cmpeq(ref, fragAlpha);
+         amask = spu_cmpeq(ref, fragA);
          amask = spu_nor(amask, amask);
          break;
       case PIPE_FUNC_ALWAYS:
@@ -174,7 +176,189 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
    }
 
-   /* XXX do blending here */
+   if (spu.blend.blend_enable) {
+      vector float term1r, term1g, term1b, term1a;
+      vector float term2r, term2g, term2b, term2a;
+
+      vector float fbRGBA[4];
+
+      vector float one, tmp;
+
+      /* get colors from framebuffer */
+      {
+         vector float fc[4];
+         uint c0, c1, c2, c3;
+#if 0
+         c0 = colorTile->ui[y+0][x+0];
+         c1 = colorTile->ui[y+0][x+1];
+         c2 = colorTile->ui[y+1][x+0];
+         c3 = colorTile->ui[y+1][x+1];
+#else
+         c0 = colorTile->ui[y][x*2+0];
+         c1 = colorTile->ui[y][x*2+1];
+         c2 = colorTile->ui[y][x*2+2];
+         c3 = colorTile->ui[y][x*2+3];
+#endif
+         switch (spu.fb.color_format) {
+         case PIPE_FORMAT_B8G8R8A8_UNORM:
+            fc[0] = spu_unpack_B8G8R8A8(c0);
+            fc[1] = spu_unpack_B8G8R8A8(c1);
+            fc[2] = spu_unpack_B8G8R8A8(c2);
+            fc[3] = spu_unpack_B8G8R8A8(c3);
+            break;
+         case PIPE_FORMAT_A8R8G8B8_UNORM:
+            fc[0] = spu_unpack_A8R8G8B8(c0);
+            fc[1] = spu_unpack_A8R8G8B8(c1);
+            fc[2] = spu_unpack_A8R8G8B8(c2);
+            fc[3] = spu_unpack_A8R8G8B8(c3);
+            break;
+         default:
+            ASSERT(0);
+         }
+         _transpose_matrix4x4(fbRGBA, fc);
+      }
+
+      /*
+       * Compute Src RGB terms
+       */
+      switch (spu.blend.rgb_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1r = fragR;
+         term1g = fragG;
+         term1b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term1r =
+         term1g =
+         term1b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1r = spu_mul(fragR, fragR);
+         term1g = spu_mul(fragG, fragG);
+         term1b = spu_mul(fragB, fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1r = spu_mul(fragR, fragA);
+         term1g = spu_mul(fragG, fragA);
+         term1b = spu_mul(fragB, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Src Alpha term
+       */
+      switch (spu.blend.alpha_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1a = spu_mul(fragA, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest RGB terms
+       */
+      switch (spu.blend.rgb_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2r = fragR;
+         term2g = fragG;
+         term2b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term2r =
+         term2g =
+         term2b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2r = spu_mul(fbRGBA[0], fragR);
+         term2g = spu_mul(fbRGBA[1], fragG);
+         term2b = spu_mul(fbRGBA[2], fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fragA);
+         term2g = spu_mul(fbRGBA[1], fragA);
+         term2b = spu_mul(fbRGBA[2], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2r = spu_mul(fbRGBA[0], tmp);
+         term2g = spu_mul(fbRGBA[1], tmp);
+         term2b = spu_mul(fbRGBA[2], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest Alpha term
+       */
+      switch (spu.blend.alpha_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2a = spu_mul(fbRGBA[3], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest RGB terms
+       */
+      switch (spu.blend.rgb_func) {
+      case PIPE_BLEND_ADD:
+         fragR = spu_add(term1r, term2r);
+         fragG = spu_add(term1g, term2g);
+         fragB = spu_add(term1b, term2b);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragR = spu_sub(term1r, term2r);
+         fragG = spu_sub(term1g, term2g);
+         fragB = spu_sub(term1b, term2b);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest A term
+       */
+      switch (spu.blend.alpha_func) {
+      case PIPE_BLEND_ADD:
+         fragA = spu_add(term1a, term2a);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragA = spu_sub(term1a, term2a);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+   }
+
 
    /* XXX do colormask test here */
 
@@ -190,17 +374,17 @@ spu_fallback_fragment_ops(uint x, uint y,
 #if 0
    {
       vector float frag_soa[4];
-      frag_soa[0] = fragRed;
-      frag_soa[1] = fragGreen;
-      frag_soa[2] = fragBlue;
-      frag_soa[3] = fragAlpha;
+      frag_soa[0] = fragR;
+      frag_soa[1] = fragG;
+      frag_soa[2] = fragB;
+      frag_soa[3] = fragA;
       _transpose_matrix4x4(frag_aos, frag_soa);
    }
 #else
    /* short-cut relying on function parameter layout: */
-   _transpose_matrix4x4(frag_aos, &fragRed);
-   (void) fragGreen;
-   (void) fragBlue;
+   _transpose_matrix4x4(frag_aos, &fragR);
+   (void) fragG;
+   (void) fragB;
 #endif
 
    switch (spu.fb.color_format) {
@@ -238,7 +422,7 @@ spu_fallback_fragment_ops(uint x, uint y,
    if (spu_extract(mask, 2))
       colorTile->ui[y+1][x+0] = c2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y+1][x+1] = c3;   
+      colorTile->ui[y+1][x+1] = c3;
 #else
    /*
     * Quad layout:
@@ -253,6 +437,6 @@ spu_fallback_fragment_ops(uint x, uint y,
    if (spu_extract(mask, 2))
       colorTile->ui[y][x*2+2] = c2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y][x*2+3] = c3;   
+      colorTile->ui[y][x*2+3] = c3;
 #endif
 }
-- 
cgit v1.2.3


From 283ffdf99605c536d00e03ad6ec91a6f8e006fc2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:13:20 -0600
Subject: cell: checkpoint: remove more of the old per-fragment code

---
 src/gallium/drivers/cell/common.h              |   2 +
 src/gallium/drivers/cell/ppu/Makefile          |   1 -
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  60 ++-----------
 src/gallium/drivers/cell/spu/spu_main.c        | 115 +++----------------------
 src/gallium/drivers/cell/spu/spu_main.h        |  37 +-------
 5 files changed, 19 insertions(+), 196 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index a62530c64d..61d2b7d1ae 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -146,6 +146,8 @@ struct cell_command_logicop
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend;
    unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b5a6fcb8de..8699f3f8ec 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -28,7 +28,6 @@ SOURCES = \
 	cell_gen_fragment.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
-	cell_state_per_fragment.c \
 	cell_state_shader.c \
 	cell_pipe_state.c \
 	cell_screen.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 06777aac14..2bfb976c59 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -55,23 +55,6 @@ emit_state_cmd(struct cell_context *cell, uint cmd,
 void
 cell_emit_state(struct cell_context *cell)
 {
-   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) {
-      struct cell_command_logicop logicop;
-
-      if (cell->logic_op.store != NULL) {
-	 spe_release_func(& cell->logic_op);
-      }
-
-      cell_generate_logic_op(& cell->logic_op,
-			     & cell->blend->base,
-			     cell->framebuffer.cbufs[0]);
-
-      logicop.base = (intptr_t) cell->logic_op.store;
-      logicop.size = 64 * 4;
-      emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop,
-		     sizeof(logicop));
-   }
-
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
@@ -91,7 +74,9 @@ cell_emit_state(struct cell_context *cell)
    }
 
 
-   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) {
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
+                      CELL_NEW_DEPTH_STENCIL |
+                      CELL_NEW_BLEND)) {
       /* XXX we don't want to always do codegen here.  We should have
        * a hash/lookup table to cache previous results...
        */
@@ -105,47 +90,12 @@ cell_emit_state(struct cell_context *cell)
       fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
       memcpy(&fops->code, spe_code.store,
              SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      fops->dsa = cell->depth_stencil->base;
+      fops->blend = cell->blend->base;
       /* free codegen buffer */
       spe_release_func(&spe_code);
    }
 
-   if (cell->dirty & CELL_NEW_BLEND) {
-      struct cell_command_blend blend;
-
-      if (cell->blend != NULL) {
-         blend.base = (intptr_t) cell->blend->code.store;
-         blend.size = cell->blend->code.num_inst * SPE_INST_SIZE;
-         blend.read_fb = TRUE;
-      }
-      else {
-         blend.base = 0;
-         blend.size = 0;
-         blend.read_fb = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend));
-   }
-
-   if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      struct cell_command_depth_stencil_alpha_test dsat;
-
-      if (cell->depth_stencil != NULL) {
-	 dsat.base = (intptr_t) cell->depth_stencil->code.store;
-	 dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE;
-	 dsat.read_depth = TRUE;
-	 dsat.read_stencil = FALSE;
-         dsat.state = cell->depth_stencil->base;
-      }
-      else {
-	 dsat.base = 0;
-	 dsat.size = 0;
-	 dsat.read_depth = FALSE;
-	 dsat.read_stencil = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat, sizeof(dsat));
-   }
-
    if (cell->dirty & CELL_NEW_SAMPLER) {
       uint i;
       for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 4e0ec15925..6afca19dfd 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -63,14 +63,6 @@ struct spu_vs_context draw;
 static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
     ALIGN16_ATTRIB;
 
-static unsigned char depth_stencil_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
-
-static unsigned char fb_blend_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
-
-static unsigned char logicop_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
 
 
 /**
@@ -240,8 +232,15 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Copy state info */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
    /* Point function pointer at new code */
    spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code;
+
+   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
 }
 
 
@@ -303,89 +302,6 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
-#define NEW_FRAGMENT_FUNCTION 01
-
-static void
-cmd_state_blend(const struct cell_command_blend *state)
-{
-   if (Debug)
-      printf("SPU %u: BLEND: enabled %d\n",
-             spu.init.id,
-             (state->size != 0));
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(fb_blend_code_buffer,
-              (unsigned int) state->base,  /* src */
-              ROUNDUP16(state->size),
-              TAG_BATCH_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-      spu.blend = (blend_func) fb_blend_code_buffer;
-      spu.read_fb = state->read_fb;
-   }
-   else
-   {
-      spu.read_fb = FALSE;
-   }
-}
-
-
-static void
-cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state)
-{
-   if (Debug)
-      printf("SPU %u: DEPTH_STENCIL: ztest %d\n",
-             spu.init.id,
-             state->read_depth);
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(depth_stencil_code_buffer,
-	      (unsigned int) state->base,  /* src */
-	      ROUNDUP16(state->size),
-	      TAG_BATCH_BUFFER,
-	      0, /* tid */
-	      0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-   }
-   else
-   {
-      /* If there is no code, emit a return instruction.
-       */
-      depth_stencil_code_buffer[0] = 0x35;
-      depth_stencil_code_buffer[1] = 0x00;
-      depth_stencil_code_buffer[2] = 0x00;
-      depth_stencil_code_buffer[3] = 0x00;
-   }
-
-   spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
-   spu.read_depth = state->read_depth;
-   spu.read_stencil = state->read_stencil;
-   spu.depth_stencil_alpha = state->state;
-}
-
-
-static void
-cmd_state_logicop(const struct cell_command_logicop * code)
-{
-#if !NEW_FRAGMENT_FUNCTION
-   mfc_get(logicop_code_buffer,
-           (unsigned int) code->base,  /* src */
-           code->size,
-           TAG_BATCH_BUFFER,
-           0, /* tid */
-           0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-   spu.logicop = (logicop_func) logicop_code_buffer;
-#endif
-}
-
-
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
@@ -571,15 +487,6 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
-      case CELL_CMD_STATE_BLEND:
-         cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8);
-         break;
-      case CELL_CMD_STATE_DEPTH_STENCIL:
-         cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *)
-                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8);
-         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
@@ -614,19 +521,17 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       case CELL_CMD_STATE_BIND_VS:
+#if 01
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+#endif
          break;
       case CELL_CMD_STATE_ATTRIB_FETCH:
          cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
                                 &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
-      case CELL_CMD_STATE_LOGICOP:
-         cmd_state_logicop((struct cell_command_logicop *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
-         break;
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
@@ -695,7 +600,9 @@ main_loop(void)
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
+#if 01
          spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
          break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 7ab34f5222..f0f8be47db 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -60,35 +60,6 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
-struct spu_frag_test_results {
-   qword mask;
-   qword depth;
-   qword stencil;
-};
-
-typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask,
-    qword pixel_depth, qword pixel_stencil, qword frag_depth,
-    qword frag_alpha, qword facing);
-
-
-struct spu_blend_results {
-   qword r;
-   qword g;
-   qword b;
-   qword a;
-};
-
-typedef struct spu_blend_results (*blend_func)(
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword const_r, qword const_g, qword const_b, qword const_a);
-
-typedef struct spu_blend_results (*logicop_func)(
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword frag_mask);
-
-
 typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
 
 
@@ -147,16 +118,10 @@ struct spu_global
    struct spu_framebuffer fb;
 
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+   struct pipe_blend_state blend;
 
    boolean read_depth;
    boolean read_stencil;
-   frag_test_func frag_test;  /**< Current depth/stencil test code */
-   
-   boolean read_fb;   /**< Does current blend mode require framebuffer read? */
-   blend_func blend;  /**< Current blend code */
-   qword const_blend_color[4] ALIGN16_ATTRIB;
-
-   logicop_func logicop;  /**< Current logicop code **/
 
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-- 
cgit v1.2.3


From aa4a08d429712fa516342ec02253c2591794ea5f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:25:38 -0600
Subject: cell: asst. clean-up

---
 src/gallium/drivers/cell/spu/spu_main.c | 23 +++++-----------
 src/gallium/drivers/cell/spu/spu_main.h | 47 +++++++++++++++------------------
 src/gallium/drivers/cell/spu/spu_tri.c  | 10 +++----
 3 files changed, 32 insertions(+), 48 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 6afca19dfd..29686964d2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -231,13 +231,13 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
    if (Debug)
       printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
    /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
 
    /* Point function pointer at new code */
-   spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code;
+   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
 
    spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
    spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
@@ -288,17 +288,6 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 0;
       break;
    }
-
-   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              12, 0, 4, 8, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              8, 4, 0, 12, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else
-      ASSERT(0);
 }
 
 
@@ -521,11 +510,11 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       case CELL_CMD_STATE_BIND_VS:
-#if 01
+#if 0
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
 #endif
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
          break;
       case CELL_CMD_STATE_ATTRIB_FETCH:
          cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
@@ -600,7 +589,7 @@ main_loop(void)
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
-#if 01
+#if 0
          spu_execute_vertex_shader(&draw, &cmd.vs);
 #endif
          break;
@@ -631,7 +620,7 @@ one_time_init(void)
    /* Install default/fallback fragment processing function.
     * This will normally be overriden by a code-gen'd function.
     */
-   spu.fragment_ops.func = spu_fallback_fragment_ops;
+   spu.fragment_ops = spu_fallback_fragment_ops;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index f0f8be47db..d40539da83 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -60,9 +60,11 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
-typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
-
+/** Function for sampling textures */
+typedef vector float (*spu_sample_texture_func)(uint unit,
+                                                vector float texcoord);
 
+/** Function for performing per-fragment ops */
 typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       tile_t *colorTile,
                                       tile_t *depthStencilTile,
@@ -73,14 +75,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragAlpha,
                                       vector unsigned int mask);
 
-struct spu_fragment_ops
+struct spu_framebuffer
 {
-   uint code[SPU_MAX_FRAGMENT_OPS_INSTS];
-   spu_fragment_ops_func func;  /**< Current fragment ops function */
-} ALIGN16_ATTRIB;
-
-
-struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
    enum pipe_format color_format;
@@ -109,34 +105,31 @@ struct spu_texture
 
 
 /**
- * All SPU global/context state will be in singleton object of this type:
+ * All SPU global/context state will be in a singleton object of this type:
  */
 struct spu_global
 {
+   /** One-time init/constant info */
    struct cell_init_info init;
 
+   /*
+    * Current state
+    */
    struct spu_framebuffer fb;
-
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
    struct pipe_blend_state blend;
-
-   boolean read_depth;
-   boolean read_stencil;
-
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-
    struct vertex_info vertex_info;
 
-   struct spu_fragment_ops fragment_ops;
-
-   /* XXX more state to come */
-
-
-   /** current color and Z tiles */
+   /** Current color and Z tiles */
    tile_t ctile ALIGN16_ATTRIB;
    tile_t ztile ALIGN16_ATTRIB;
 
+   /** Read depth/stencil tiles? */
+   boolean read_depth;
+   boolean read_stencil;
+
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
 
@@ -144,11 +137,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+   /** Current fragment ops machine code */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   /** Current fragment ops function */
+   spu_fragment_ops_func fragment_ops;
 
-   /** for converting RGBA to PIPE_FORMAT_x colors */
-   vector unsigned char color_shuffle;
-
-   sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   /** Current texture sampler function */
+   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a5bf3270c7..f02cdd1f76 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -333,11 +333,11 @@ emit_quad( int x, int y, mask_t mask )
          /* Do all per-fragment/quad operations here, including:
           *  alpha test, z test, stencil test, blend and framebuffer writing.
           */
-         spu.fragment_ops.func(ix, iy, &spu.ctile, &spu.ztile,
-                               fragZ.v,
-                               soa_frag[0], soa_frag[1],
-                               soa_frag[2], soa_frag[3],
-                               mask);
+         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                          fragZ.v,
+                          soa_frag[0], soa_frag[1],
+                          soa_frag[2], soa_frag[3],
+                          mask);
       }
 
    }
-- 
cgit v1.2.3


From 924653e37db4501d0f03721e9d74abffe46a3c72 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:27:17 -0600
Subject: cell: don't build unused sources

---
 src/gallium/drivers/cell/spu/Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index e285ae9fdb..1ae0dfb8c1 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -22,12 +22,15 @@ SOURCES = \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
-	spu_tri.c \
+	spu_tri.c
+
+OLD_SOURCES = \
 	spu_exec.c \
 	spu_util.c \
 	spu_vertex_fetch.c \
 	spu_vertex_shader.c
 
+
 SPU_OBJECTS = $(SOURCES:.c=.o) \
 
 SPU_ASM_OUT = $(SOURCES:.c=.s) \
-- 
cgit v1.2.3


From f6bf8d9d410d94372b72f4f6ede6196ae5a4a67f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:33:24 -0600
Subject: cell: clean-up, comments

---
 src/gallium/drivers/cell/spu/spu_main.c | 52 ++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 20 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 29686964d2..2a7cb75f59 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -429,16 +429,14 @@ cmd_batch(uint opcode)
       printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
    release_buffer(buf);
 
+   /*
+    * Loop over commands in the batch buffer
+    */
    for (pos = 0; pos < usize; /* no incr */) {
       switch (buffer[pos]) {
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         {
-            struct cell_command_framebuffer *fb
-               = (struct cell_command_framebuffer *) &buffer[pos];
-            cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
-         }
-         break;
+      /*
+       * rendering commands
+       */
       case CELL_CMD_CLEAR_SURFACE:
          {
             struct cell_command_clear_surface *clr
@@ -456,6 +454,17 @@ cmd_batch(uint opcode)
             pos += pos_incr;
          }
          break;
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
+         {
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
+         }
+         break;
       case CELL_CMD_STATE_FRAGMENT_OPS:
          {
             struct cell_command_fragment_ops *fops
@@ -464,18 +473,6 @@ cmd_batch(uint opcode)
             pos += sizeof(*fops) / 8;
          }
          break;
-      case CELL_CMD_RELEASE_VERTS:
-         {
-            struct cell_command_release_verts *release
-               = (struct cell_command_release_verts *) &buffer[pos];
-            cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
-         }
-         break;
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         pos += 1;
-         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
@@ -521,6 +518,21 @@ cmd_batch(uint opcode)
                                 &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
+         break;
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
-- 
cgit v1.2.3


From 7ce1d0fb6700fd4998a095de2c9edf5ed920464c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:52:03 -0600
Subject: cell: more comments, stub code for colormask/logicop/etc

---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 107 ++++++++++++++-------
 1 file changed, 70 insertions(+), 37 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 3f0eabaa05..03dd547845 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -37,6 +37,8 @@
 #include "spu_per_fragment_op.h"
 
 
+#define LINEAR_QUAD_LAYOUT 1
+
 
 /**
  * Called by rasterizer for each quad after the shader has run.  Do
@@ -177,27 +179,28 @@ spu_fallback_fragment_ops(uint x, uint y,
    }
 
    if (spu.blend.blend_enable) {
+      /* blending terms, misc regs */
       vector float term1r, term1g, term1b, term1a;
       vector float term2r, term2g, term2b, term2a;
-
-      vector float fbRGBA[4];
-
       vector float one, tmp;
 
-      /* get colors from framebuffer */
+      vector float fbRGBA[4];  /* current framebuffer colors */
+
+      /* get colors from framebuffer/tile */
       {
          vector float fc[4];
          uint c0, c1, c2, c3;
-#if 0
-         c0 = colorTile->ui[y+0][x+0];
-         c1 = colorTile->ui[y+0][x+1];
-         c2 = colorTile->ui[y+1][x+0];
-         c3 = colorTile->ui[y+1][x+1];
-#else
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
          c0 = colorTile->ui[y][x*2+0];
          c1 = colorTile->ui[y][x*2+1];
          c2 = colorTile->ui[y][x*2+2];
          c3 = colorTile->ui[y][x*2+3];
+#else
+         c0 = colorTile->ui[y+0][x+0];
+         c1 = colorTile->ui[y+0][x+1];
+         c2 = colorTile->ui[y+1][x+0];
+         c3 = colorTile->ui[y+1][x+1];
 #endif
          switch (spu.fb.color_format) {
          case PIPE_FORMAT_B8G8R8A8_UNORM:
@@ -360,18 +363,11 @@ spu_fallback_fragment_ops(uint x, uint y,
    }
 
 
-   /* XXX do colormask test here */
-
-
-   if (spu_extract(spu_orx(mask), 0)) {
-      spu.cur_ctile_status = TILE_STATUS_DIRTY;
-   }
-   else {
-      return;
-   }
-
-   /* convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA */
+   /*
+    * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA.
+    */
 #if 0
+   /* original code */
    {
       vector float frag_soa[4];
       frag_soa[0] = fragR;
@@ -387,6 +383,9 @@ spu_fallback_fragment_ops(uint x, uint y,
    (void) fragB;
 #endif
 
+   /*
+    * Pack float colors into 32-bit RGBA words.
+    */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
       c0 = spu_pack_A8R8G8B8(frag_aos[0]);
@@ -406,24 +405,41 @@ spu_fallback_fragment_ops(uint x, uint y,
       ASSERT(0);
    }
 
-#if 0
+
    /*
-    * Quad layout:
-    *  +--+--+
-    *  |p0|p1|
-    *  +--+--+
-    *  |p2|p3|
-    *  +--+--+
+    * Color masking
     */
-   if (spu_extract(mask, 0))
-      colorTile->ui[y+0][x+0] = c0;
-   if (spu_extract(mask, 1))
-      colorTile->ui[y+0][x+1] = c1;
-   if (spu_extract(mask, 2))
-      colorTile->ui[y+1][x+0] = c2;
-   if (spu_extract(mask, 3))
-      colorTile->ui[y+1][x+1] = c3;
-#else
+   if (spu.blend.colormask != 0xf) {
+      /* XXX to do */
+      /* apply color mask to 32-bit packed colors */
+   }
+
+
+   /*
+    * Logic Ops
+    */
+   if (spu.blend.logicop_enable) {
+      /* XXX to do */
+      /* apply logicop to 32-bit packed colors */
+   }
+
+
+   /*
+    * If mask is non-zero, mark tile as dirty.
+    */
+   if (spu_extract(spu_orx(mask), 0)) {
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   else {
+      return;
+   }
+
+
+   /*
+    * Write new quad colors to the framebuffer/tile.
+    * Only write pixels where the corresponding mask word is set.
+    */
+#if LINEAR_QUAD_LAYOUT
    /*
     * Quad layout:
     *  +--+--+--+--+
@@ -438,5 +454,22 @@ spu_fallback_fragment_ops(uint x, uint y,
       colorTile->ui[y][x*2+2] = c2;
    if (spu_extract(mask, 3))
       colorTile->ui[y][x*2+3] = c3;
+#else
+   /*
+    * Quad layout:
+    *  +--+--+
+    *  |p0|p1|
+    *  +--+--+
+    *  |p2|p3|
+    *  +--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y+0][x+0] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y+0][x+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y+1][x+0] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y+1][x+1] = c3;
 #endif
 }
-- 
cgit v1.2.3


From aa66f08a21b791f338b519f0c2162cd8f7b3aeb0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 17:59:52 -0600
Subject: cell: initial support for fragment shader code generation.

TGSI shaders are translated into SPE instructions which are then sent to
the SPEs for execution.  Only a few opcodes work, no swizzling yet, no
support for constants/immediates, etc.
---
 src/gallium/drivers/cell/common.h                |  15 +
 src/gallium/drivers/cell/ppu/Makefile            |   1 +
 src/gallium/drivers/cell/ppu/cell_context.h      |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 523 +++++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_gen_fp.h       |  42 ++
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |  16 +
 src/gallium/drivers/cell/ppu/cell_state_shader.c |   8 +-
 src/gallium/drivers/cell/spu/spu_main.c          |  25 +-
 src/gallium/drivers/cell/spu/spu_main.h          |  15 +
 src/gallium/drivers/cell/spu/spu_tri.c           |  35 ++
 10 files changed, 678 insertions(+), 3 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index e989d8c2e5..cb0631baf5 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -92,6 +92,7 @@
 #define CELL_CMD_STATE_UNIFORMS      16
 #define CELL_CMD_STATE_VS_ARRAY_INFO 17
 #define CELL_CMD_STATE_BIND_VS       18
+#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
@@ -125,6 +126,20 @@ struct cell_command_fragment_ops
 };
 
 
+/** Max instructions for fragment programs */
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
+
+/**
+ * Command to send a fragment progra to SPUs.
+ */
+struct cell_command_fragment_program
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
+   uint num_inst;        /**< Number of instructions */
+   unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 8699f3f8ec..b28f4c5c31 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -26,6 +26,7 @@ SOURCES = \
 	cell_draw_arrays.c \
 	cell_flush.c \
 	cell_gen_fragment.c \
+	cell_gen_fp.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
 	cell_state_shader.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 8cec9f45b2..14914b9c6f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -61,6 +61,7 @@ struct cell_fragment_shader_state
 {
    struct pipe_shader_state shader;
    struct tgsi_shader_info info;
+   struct spe_function code;
    void *data;
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
new file mode 100644
index 0000000000..6ffe94eb14
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -0,0 +1,523 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU fragment program/shader code.
+ *
+ * Note that we generate SOA-style code here.  So each TGSI instruction
+ * operates on four pixels (and is translated into four SPU instructions,
+ * generally speaking).
+ *
+ * \author Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "util/u_memory.h"
+#include "cell_context.h"
+#include "cell_gen_fp.h"
+
+
+/** Set to 1 to enable debug/disassembly printfs */
+#define DISASSEM 01
+
+
+/**
+ * Context needed during code generation.
+ */
+struct codegen
+{
+   int inputs_reg;      /**< 1st function parameter */
+   int outputs_reg;     /**< 2nd function parameter */
+   int constants_reg;   /**< 3rd function parameter */
+   int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */
+
+   int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */
+
+   /** Per-instruction temps / intermediate temps */
+   int num_itemps;
+   int itemps[3];
+
+   struct spe_function *f;
+   boolean error;
+};
+
+
+/**
+ * Allocate an intermediate temporary register.
+ */
+static int
+get_itemp(struct codegen *gen)
+{
+   int t = spe_allocate_available_register(gen->f);
+   assert(gen->num_itemps < Elements(gen->itemps));
+   gen->itemps[gen->num_itemps++] = t;
+   return t;
+}
+
+/**
+ * Free all intermediate temporary registers.  To be called after each
+ * instruction has been emitted.
+ */
+static void
+free_itemps(struct codegen *gen)
+{
+   int i;
+   for (i = 0; i < gen->num_itemps; i++) {
+      spe_release_register(gen->f, gen->itemps[i]);
+   }
+   gen->num_itemps = 0;
+}
+
+
+/**
+ * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}.
+ * The register is allocated and initialized upon the first call.
+ */
+static int
+get_const_one_reg(struct codegen *gen)
+{
+   if (gen->one_reg <= 0) {
+      gen->one_reg = spe_allocate_available_register(gen->f);
+   }
+
+   /* one = {1.0, 1.0, 1.0, 1.0} */
+   spe_load_float(gen->f, gen->one_reg, 1.0f);
+#if DISASSEM
+   printf("il\tr%d, 1.0f\n", gen->one_reg);
+#endif
+
+   return gen->one_reg;
+}
+
+
+/**
+ * Return the index of the SPU temporary containing the named TGSI
+ * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we
+ * just return the corresponding SPE register.  If the TGIS register
+ * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register
+ * and emit an SPE load instruction.
+ */
+static int
+get_src_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_src_register *src)
+{
+   int reg;
+
+   /* XXX need to examine src swizzle info here.
+    * That will involve changing the channel var...
+    */
+
+
+   switch (src->SrcRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      reg = gen->temp_regs[src->SrcRegister.Index][channel];
+      break;
+   case TGSI_FILE_INPUT:
+      {
+         /* offset is measured in quadwords, not bytes */
+         int offset = src->SrcRegister.Index * 4 + channel;
+         reg = get_itemp(gen);
+         /* Load:  reg = memory[(machine_reg) + offset] */
+         spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+#if DISASSEM
+         printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
+#endif
+      }
+      break;
+   case TGSI_FILE_IMMEDIATE:
+      /* xxx fall-through for now / fix */
+   case TGSI_FILE_CONSTANT:
+      /* xxx fall-through for now / fix */
+   default:
+      assert(0);
+   }
+
+   return reg;
+}
+
+
+/**
+ * Return the index of an SPE register to use for the given TGSI register.
+ * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the
+ * corresponding SPE register is returned.  If the TGSI register is
+ * TGSI_FILE_OUTPUT we allocate an intermediate temporary register.
+ * See store_dest_reg() below...
+ */
+static int
+get_dst_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_dst_register *dest)
+{
+   int reg;
+
+   switch (dest->DstRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      reg = gen->temp_regs[dest->DstRegister.Index][channel];
+      break;
+   case TGSI_FILE_OUTPUT:
+      reg = get_itemp(gen);
+      break;
+   default:
+      assert(0);
+   }
+
+   return reg;
+}
+
+
+/**
+ * When a TGSI instruction is writing to an output register, this
+ * function emits the SPE store instruction to store the value_reg.
+ * \param value_reg  the SPE register containing the value to store.
+ *                   This would have been returned by get_dst_reg().
+ */
+static void
+store_dest_reg(struct codegen *gen,
+               int value_reg, int channel,
+               const struct tgsi_full_dst_register *dest)
+{
+   switch (dest->DstRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      /* no-op */
+      break;
+   case TGSI_FILE_OUTPUT:
+      {
+         /* offset is measured in quadwords, not bytes */
+         int offset = dest->DstRegister.Index * 4 + channel;
+         /* Store: memory[(machine_reg) + offset] = reg */
+         spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+#if DISASSEM
+         printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset);
+#endif
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+static boolean
+emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* XXX we don't always need to actually emit a mov instruction here */
+         spe_move(gen->f, dst_reg, src_reg);
+#if DISASSEM
+         printf("mov\tr%d, r%d\n", dst_reg, src_reg);
+#endif
+         store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit addition instructions.  Recall that a single TGSI_OPCODE_ADD
+ * becomes (up to) four SPU "fa" instructions because we're doing SOA
+ * processing.
+ */
+static boolean
+emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   /* Loop over Red/Green/Blue/Alpha channels */
+   for (ch = 0; ch < 4; ch++) {
+      /* If the dest R, G, B or A writemask is enabled... */
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* get indexes of the two src, one dest SPE registers */
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* Emit actual SPE instruction: d = s1 + s2 */
+         spe_fa(gen->f, d_reg, s1_reg, s2_reg);
+#if DISASSEM
+         printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+#endif
+
+         /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         /* Free any intermediate temps we allocated */
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit multiply.  See emit_ADD for comments.
+ */
+static boolean
+emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = s1 * s2 */
+         spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+#if DISASSEM
+         printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+#endif
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit set-if-greater-than.
+ * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
+ * the result but OpenGL/TGSI needs 0.0 and 1.0 results.
+ * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND.
+ */
+static boolean
+emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 > s2) */
+         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+#if DISASSEM
+         printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+#endif
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+#if DISASSEM
+         printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
+#endif
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+/**
+ * Emit END instruction.
+ * We just return from the shader function at this point.
+ *
+ * Note that there may be more code after this that would be
+ * called by TGSI_OPCODE_CALL.
+ */
+static boolean
+emit_END(struct codegen *gen)
+{
+   /* return from function call */
+   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+#if DISASSEM
+   printf("bi\trRA\n");
+#endif
+   return true;
+}
+
+
+/**
+ * Emit code for the given instruction.  Just a big switch stmt.
+ */
+static boolean
+emit_instruction(struct codegen *gen,
+                 const struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+      return emit_MOV(gen, inst);
+   case TGSI_OPCODE_MUL:
+      return emit_MUL(gen, inst);
+   case TGSI_OPCODE_ADD:
+      return emit_ADD(gen, inst);
+   case TGSI_OPCODE_SGT:
+      return emit_SGT(gen, inst);
+   case TGSI_OPCODE_END:
+      return emit_END(gen);
+
+   /* XXX lots more cases to do... */
+
+   default:
+      return false;
+   }
+
+   return true;
+}
+
+
+
+/**
+ * Emit "code" for a TGSI declaration.
+ * We only care about TGSI TEMPORARY register declarations at this time.
+ * For each TGSI TEMPORARY we allocate four SPE registers.
+ */
+static void
+emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
+{
+   int i, ch;
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_TEMPORARY:
+#if DISASSEM
+      printf("Declare temp reg %d .. %d\n",
+             decl->DeclarationRange.First,
+             decl->DeclarationRange.Last);
+#endif
+      for (i = decl->DeclarationRange.First;
+           i <= decl->DeclarationRange.Last;
+           i++) {
+         for (ch = 0; ch < 4; ch++) {
+            gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+         }
+
+         /* XXX if we run out of SPE registers, we need to spill
+          * to SPU memory.  someday...
+          */
+
+#if DISASSEM
+         printf("  SPE regs: %d %d %d %d\n",
+                gen->temp_regs[i][0],
+                gen->temp_regs[i][1],
+                gen->temp_regs[i][2],
+                gen->temp_regs[i][3]);
+#endif
+      }
+      break;
+   default:
+      ; /* ignore */
+   }
+}
+
+
+/**
+ * Translate TGSI shader code to SPE instructions.  This is done when
+ * the state tracker gives us a new shader (via pipe->create_fs_state()).
+ *
+ * \param cell    the rendering context (in)
+ * \param tokens  the TGSI shader (in)
+ * \param f       the generated function (out)
+ */
+boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f)
+{
+   struct tgsi_parse_context parse;
+   struct codegen gen;
+
+   memset(&gen, 0, sizeof(gen));
+   gen.f = f;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   gen.inputs_reg = 3;     /* pointer to inputs array */
+   gen.outputs_reg = 4;    /* pointer to outputs array */
+   gen.constants_reg = 5;  /* pointer to constants array */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, gen.inputs_reg);
+   spe_allocate_register(f, gen.outputs_reg);
+   spe_allocate_register(f, gen.constants_reg);
+
+#if DISASSEM
+   printf("Begin %s\n", __FUNCTION__);
+   tgsi_dump(tokens, 0);
+#endif
+
+   tgsi_parse_init(&parse, tokens);
+
+   while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+#if 0
+         if (!note_immediate(&gen, &parse.FullToken.FullImmediate ))
+            goto fail;
+#endif
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         emit_declaration(&gen, &parse.FullToken.FullDeclaration);
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) {
+            gen.error = true;
+         }
+         break;
+
+      default:
+         assert(0);
+
+      }
+   }
+
+
+   if (gen.error) {
+      /* terminate the SPE code */
+      return emit_END(&gen);
+   }
+
+#if DISASSEM
+   printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+   printf("End %s\n", __FUNCTION__);
+#endif
+
+   tgsi_parse_free( &parse );
+
+   return !gen.error;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
new file mode 100644
index 0000000000..99faea7046
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef CELL_GEN_FP_H
+#define CELL_GEN_FP_H
+
+
+
+extern boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f);
+
+
+#endif /* CELL_GEN_FP_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 3ebf0749ad..2da3097983 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -73,6 +73,22 @@ cell_emit_state(struct cell_context *cell)
 #endif
    }
 
+   if (cell->dirty & (CELL_NEW_FS)) {
+      /* Send new fragment program to SPUs */
+      struct cell_command_fragment_program *fp
+            = cell_batch_alloc(cell, sizeof(*fp));
+      fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM;
+      fp->num_inst = cell->fs->code.num_inst;
+      memcpy(&fp->code, cell->fs->code.store,
+             SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+      if (0) {
+         int i;
+         printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n");
+         for (i = 0; i < fp->num_inst; i++) {
+            printf(" %3d: 0x%08x\n", i, fp->code[i]);
+         }
+      }
+   }
 
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 97e44eeb1a..3a0d066da2 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -34,7 +34,7 @@
 
 #include "cell_context.h"
 #include "cell_state.h"
-
+#include "cell_gen_fp.h"
 
 
 /** cast wrapper */
@@ -61,7 +61,7 @@ static void *
 cell_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
 {
-   /*struct cell_context *cell = cell_context(pipe);*/
+   struct cell_context *cell = cell_context(pipe);
    struct cell_fragment_shader_state *cfs;
 
    cfs = CALLOC_STRUCT(cell_fragment_shader_state);
@@ -76,6 +76,8 @@ cell_create_fs_state(struct pipe_context *pipe,
 
    tgsi_scan_shader(templ->tokens, &cfs->info);
 
+   cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code);
+
    return cfs;
 }
 
@@ -102,6 +104,8 @@ cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
    struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs);
 
+   spe_release_func(&cfs->code);
+
    FREE((void *) cfs->shader.tokens);
    FREE(cfs);
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 2a7cb75f59..78260c4259 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -232,7 +232,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info */
+   /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
 
@@ -244,6 +244,21 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 }
 
 
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+   if (Debug)
+      printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_program_code, fp->code,
+          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+   /* Point function pointer at new code */
+   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -473,6 +488,14 @@ cmd_batch(uint opcode)
             pos += sizeof(*fops) / 8;
          }
          break;
+      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+         {
+            struct cell_command_fragment_program *fp
+               = (struct cell_command_fragment_program *) &buffer[pos];
+            cmd_state_fragment_program(fp);
+            pos += sizeof(*fp) / 8;
+         }
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index d40539da83..2c7b625840 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -75,6 +75,12 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragAlpha,
                                       vector unsigned int mask);
 
+/** Function for running fragment program */
+typedef void (*spu_fragment_program_func)(vector float *inputs,
+                                          vector float *outputs,
+                                          vector float *constants);
+
+
 struct spu_framebuffer
 {
    void *color_start;              /**< addr of color surface in main memory */
@@ -142,9 +148,18 @@ struct spu_global
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
+   /** Current fragment program machine code */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+   /** Current fragment ops function */
+   spu_fragment_program_func fragment_program;
+
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
+   /** Fragment program constants (XXX preliminary/used) */
+#define MAX_CONSTANTS 32
+   vector float constants[MAX_CONSTANTS];
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index f02cdd1f76..8b93878192 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -314,7 +314,42 @@ emit_quad( int x, int y, mask_t mask )
       }
       else {
          /* simple shading */
+#if 0
          eval_coeff(1, (float) x, (float) y, colors);
+
+#else
+         /* XXX new fragment program code */
+
+         if (spu.fragment_program) {
+            vector float inputs[4*4], outputs[2*4];
+
+            /* setup inputs */
+            eval_coeff(1, (float) x, (float) y, inputs);
+
+            /* Execute the current fragment program */
+            spu.fragment_program(inputs, outputs, spu.constants);
+
+            /* Copy outputs */
+            colors[0] = outputs[0*4+0];
+            colors[1] = outputs[0*4+1];
+            colors[2] = outputs[0*4+2];
+            colors[3] = outputs[0*4+3];
+
+            if (0 && spu.init.id==0 && y == 48) {
+               printf("colors[0] = %f %f %f %f\n",
+                      spu_extract(colors[0], 0),
+                      spu_extract(colors[0], 1),
+                      spu_extract(colors[0], 2),
+                      spu_extract(colors[0], 3));
+               printf("colors[1] = %f %f %f %f\n",
+                      spu_extract(colors[1], 0),
+                      spu_extract(colors[1], 1),
+                      spu_extract(colors[1], 2),
+                      spu_extract(colors[1], 3));
+            }
+
+         }
+#endif
       }
 
 
-- 
cgit v1.2.3


From 6c0fa798578ad247027dff861406a524821ddcdd Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 08:47:45 -0600
Subject: cell: setup fragment program inputs in SOA format

Also remove old code, etc.
---
 src/gallium/drivers/cell/spu/spu_tri.c | 112 ++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 56 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 8b93878192..b7faae6d60 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -241,6 +241,19 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
 }
 
 
+/**
+ * As above, but return 4 vectors in SOA format.
+ * XXX this will all be re-written someday.
+ */
+static INLINE void
+eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+{
+   eval_coeff(slot, x, y, result);
+   _transpose_matrix4x4(result, result);
+}
+
+
+
 static INLINE vector float
 eval_z(float x, float y)
 {
@@ -267,14 +280,17 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture[0].start) {
-         /* texture mapping */
+         /*
+          * Temporary texture mapping path
+          * This will go away when fragment programs support TEX inst.
+          */
          const uint unit = 0;
+         vector float colors[4];
          vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
 
@@ -311,70 +327,54 @@ emit_quad( int x, int y, mask_t mask )
             colors[3] = spu_mul(colors[3], colors1[3]);
          }
 
-      }
-      else {
-         /* simple shading */
-#if 0
-         eval_coeff(1, (float) x, (float) y, colors);
-
-#else
-         /* XXX new fragment program code */
-
-         if (spu.fragment_program) {
-            vector float inputs[4*4], outputs[2*4];
-
-            /* setup inputs */
-            eval_coeff(1, (float) x, (float) y, inputs);
-
-            /* Execute the current fragment program */
-            spu.fragment_program(inputs, outputs, spu.constants);
-
-            /* Copy outputs */
-            colors[0] = outputs[0*4+0];
-            colors[1] = outputs[0*4+1];
-            colors[2] = outputs[0*4+2];
-            colors[3] = outputs[0*4+3];
-
-            if (0 && spu.init.id==0 && y == 48) {
-               printf("colors[0] = %f %f %f %f\n",
-                      spu_extract(colors[0], 0),
-                      spu_extract(colors[0], 1),
-                      spu_extract(colors[0], 2),
-                      spu_extract(colors[0], 3));
-               printf("colors[1] = %f %f %f %f\n",
-                      spu_extract(colors[1], 0),
-                      spu_extract(colors[1], 1),
-                      spu_extract(colors[1], 2),
-                      spu_extract(colors[1], 3));
-            }
-
+         {
+            /* Convert fragment data from AoS to SoA format.
+             * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+             * This is temporary!
+             */
+            vector float soa_frag[4];
+            _transpose_matrix4x4(soa_frag, colors);
+
+            vector float fragZ = eval_z((float) x, (float) y);
+
+            /* Do all per-fragment/quad operations here, including:
+             * alpha test, z test, stencil test, blend and framebuffer writing.
+             */
+            spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                             fragZ,
+                             soa_frag[0], soa_frag[1],
+                             soa_frag[2], soa_frag[3],
+                             mask);
          }
-#endif
-      }
-
 
-      {
-         /* Convert fragment data from AoS to SoA format.
-          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-          * This is temporary!
+      }
+      else {
+         /*
+          * Run fragment shader, execute per-fragment ops, update fb/tile.
           */
-         vector float soa_frag[4];
-         _transpose_matrix4x4(soa_frag, colors);
+         vector float inputs[4*4], outputs[2*4];
+         vector float fragZ = eval_z((float) x, (float) y);
 
-         float4 fragZ;
+         /* setup inputs */
+         eval_coeff_soa(1, (float) x, (float) y, inputs);
 
-         fragZ.v = eval_z((float) x, (float) y);
+         ASSERT(spu.fragment_program);
+         ASSERT(spu.fragment_ops);
 
-         /* Do all per-fragment/quad operations here, including:
-          *  alpha test, z test, stencil test, blend and framebuffer writing.
+         /* Execute the current fragment program */
+         spu.fragment_program(inputs, outputs, spu.constants);
+
+         /* Execute per-fragment/quad operations, including:
+          * alpha test, z test, stencil test, blend and framebuffer writing.
           */
          spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
-                          fragZ.v,
-                          soa_frag[0], soa_frag[1],
-                          soa_frag[2], soa_frag[3],
+                          fragZ,
+                          outputs[0*4+0],
+                          outputs[0*4+1],
+                          outputs[0*4+2],
+                          outputs[0*4+3],
                           mask);
       }
-
    }
 }
 
-- 
cgit v1.2.3


From 38bacb6f32d8a2cddc1116f7fbe2b21ea5a91a95 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 11:43:37 -0600
Subject: cell: implement colormask on fallback path

Also, some var renaming and additional comments
---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 164 ++++++++++++++-------
 1 file changed, 110 insertions(+), 54 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 03dd547845..f107764fb2 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -60,9 +60,12 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector unsigned int mask)
 {
    vector float frag_aos[4];
-   unsigned int c0, c1, c2, c3;
+   unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
+   unsigned int fragc0, fragc1, fragc2, fragc3;  /* fragment colors */
 
-   /* do alpha test */
+   /*
+    * Do alpha test
+    */
    if (spu.depth_stencil_alpha.alpha.enabled) {
       vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
       vector unsigned int amask;
@@ -102,7 +105,10 @@ spu_fallback_fragment_ops(uint x, uint y,
       mask = spu_and(mask, amask);
    }
 
-   /* Z and/or stencil testing... */
+
+   /*
+    * Z and/or stencil testing...
+    */
    if (spu.depth_stencil_alpha.depth.enabled ||
        spu.depth_stencil_alpha.stencil[0].enabled) {
 
@@ -178,6 +184,32 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
    }
 
+
+   /*
+    * If we'll need the current framebuffer/tile colors for blending
+    * or logicop or colormask, fetch them now.
+    */
+   if (spu.blend.blend_enable ||
+       spu.blend.logicop_enable ||
+       spu.blend.colormask != 0xf) {
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+      fbc0 = colorTile->ui[y][x*2+0];
+      fbc1 = colorTile->ui[y][x*2+1];
+      fbc2 = colorTile->ui[y][x*2+2];
+      fbc3 = colorTile->ui[y][x*2+3];
+#else
+      fbc0 = colorTile->ui[y+0][x+0];
+      fbc1 = colorTile->ui[y+0][x+1];
+      fbc2 = colorTile->ui[y+1][x+0];
+      fbc3 = colorTile->ui[y+1][x+1];
+#endif
+   }
+
+
+   /*
+    * Do blending
+    */
    if (spu.blend.blend_enable) {
       /* blending terms, misc regs */
       vector float term1r, term1g, term1b, term1a;
@@ -186,39 +218,26 @@ spu_fallback_fragment_ops(uint x, uint y,
 
       vector float fbRGBA[4];  /* current framebuffer colors */
 
-      /* get colors from framebuffer/tile */
+      /* convert framebuffer colors from packed int to vector float */
       {
-         vector float fc[4];
-         uint c0, c1, c2, c3;
-
-#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
-         c0 = colorTile->ui[y][x*2+0];
-         c1 = colorTile->ui[y][x*2+1];
-         c2 = colorTile->ui[y][x*2+2];
-         c3 = colorTile->ui[y][x*2+3];
-#else
-         c0 = colorTile->ui[y+0][x+0];
-         c1 = colorTile->ui[y+0][x+1];
-         c2 = colorTile->ui[y+1][x+0];
-         c3 = colorTile->ui[y+1][x+1];
-#endif
+         vector float temp[4]; /* float colors in AOS form */
          switch (spu.fb.color_format) {
          case PIPE_FORMAT_B8G8R8A8_UNORM:
-            fc[0] = spu_unpack_B8G8R8A8(c0);
-            fc[1] = spu_unpack_B8G8R8A8(c1);
-            fc[2] = spu_unpack_B8G8R8A8(c2);
-            fc[3] = spu_unpack_B8G8R8A8(c3);
+            temp[0] = spu_unpack_B8G8R8A8(fbc0);
+            temp[1] = spu_unpack_B8G8R8A8(fbc1);
+            temp[2] = spu_unpack_B8G8R8A8(fbc2);
+            temp[3] = spu_unpack_B8G8R8A8(fbc3);
             break;
          case PIPE_FORMAT_A8R8G8B8_UNORM:
-            fc[0] = spu_unpack_A8R8G8B8(c0);
-            fc[1] = spu_unpack_A8R8G8B8(c1);
-            fc[2] = spu_unpack_A8R8G8B8(c2);
-            fc[3] = spu_unpack_A8R8G8B8(c3);
+            temp[0] = spu_unpack_A8R8G8B8(fbc0);
+            temp[1] = spu_unpack_A8R8G8B8(fbc1);
+            temp[2] = spu_unpack_A8R8G8B8(fbc2);
+            temp[3] = spu_unpack_A8R8G8B8(fbc3);
             break;
          default:
             ASSERT(0);
          }
-         _transpose_matrix4x4(fbRGBA, fc);
+         _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
       }
 
       /*
@@ -384,21 +403,20 @@ spu_fallback_fragment_ops(uint x, uint y,
 #endif
 
    /*
-    * Pack float colors into 32-bit RGBA words.
+    * Pack fragment float colors into 32-bit RGBA words.
     */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      c0 = spu_pack_A8R8G8B8(frag_aos[0]);
-      c1 = spu_pack_A8R8G8B8(frag_aos[1]);
-      c2 = spu_pack_A8R8G8B8(frag_aos[2]);
-      c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+      fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
       break;
-
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      c0 = spu_pack_B8G8R8A8(frag_aos[0]);
-      c1 = spu_pack_B8G8R8A8(frag_aos[1]);
-      c2 = spu_pack_B8G8R8A8(frag_aos[2]);
-      c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+      fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
       break;
    default:
       fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
@@ -407,20 +425,57 @@ spu_fallback_fragment_ops(uint x, uint y,
 
 
    /*
-    * Color masking
+    * Do color masking
     */
    if (spu.blend.colormask != 0xf) {
-      /* XXX to do */
-      /* apply color mask to 32-bit packed colors */
+      uint cmask = 0x0; /* each byte corresponds to a color channel */
+
+      /* Form bitmask depending on color buffer format and colormask bits */
+      switch (spu.fb.color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         if (spu.blend.colormask & (1<<0))
+            cmask |= 0x00ff0000; /* red */
+         if (spu.blend.colormask & (1<<1))
+            cmask |= 0x0000ff00; /* green */
+         if (spu.blend.colormask & (1<<2))
+            cmask |= 0x000000ff; /* blue */
+         if (spu.blend.colormask & (1<<3))
+            cmask |= 0xff000000; /* alpha */
+         break;
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         if (spu.blend.colormask & (1<<0))
+            cmask |= 0x0000ff00; /* red */
+         if (spu.blend.colormask & (1<<1))
+            cmask |= 0x00ff0000; /* green */
+         if (spu.blend.colormask & (1<<2))
+            cmask |= 0xff000000; /* blue */
+         if (spu.blend.colormask & (1<<3))
+            cmask |= 0x000000ff; /* alpha */
+         break;
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Apply color mask to the 32-bit packed colors.
+       * if (cmask[i])
+       *    frag color[i] = frag color[i];
+       * else
+       *    frag color[i] = framebuffer color[i];
+       */
+      fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
+      fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
+      fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
+      fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
    }
 
 
    /*
-    * Logic Ops
+    * Do logic ops
     */
    if (spu.blend.logicop_enable) {
       /* XXX to do */
-      /* apply logicop to 32-bit packed colors */
+      /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
    }
 
 
@@ -431,45 +486,46 @@ spu_fallback_fragment_ops(uint x, uint y,
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
    }
    else {
+      /* write no fragments */
       return;
    }
 
 
    /*
-    * Write new quad colors to the framebuffer/tile.
+    * Write new fragment/quad colors to the framebuffer/tile.
     * Only write pixels where the corresponding mask word is set.
     */
 #if LINEAR_QUAD_LAYOUT
    /*
     * Quad layout:
     *  +--+--+--+--+
-    *  |p0|p1|p2|p3|
+    *  |p0|p1|p2|p3|...
     *  +--+--+--+--+
     */
    if (spu_extract(mask, 0))
-      colorTile->ui[y][x*2] = c0;
+      colorTile->ui[y][x*2] = fragc0;
    if (spu_extract(mask, 1))
-      colorTile->ui[y][x*2+1] = c1;
+      colorTile->ui[y][x*2+1] = fragc1;
    if (spu_extract(mask, 2))
-      colorTile->ui[y][x*2+2] = c2;
+      colorTile->ui[y][x*2+2] = fragc2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y][x*2+3] = c3;
+      colorTile->ui[y][x*2+3] = fragc3;
 #else
    /*
     * Quad layout:
     *  +--+--+
-    *  |p0|p1|
+    *  |p0|p1|...
     *  +--+--+
-    *  |p2|p3|
+    *  |p2|p3|...
     *  +--+--+
     */
    if (spu_extract(mask, 0))
-      colorTile->ui[y+0][x+0] = c0;
+      colorTile->ui[y+0][x+0] = fragc0;
    if (spu_extract(mask, 1))
-      colorTile->ui[y+0][x+1] = c1;
+      colorTile->ui[y+0][x+1] = fragc1;
    if (spu_extract(mask, 2))
-      colorTile->ui[y+1][x+0] = c2;
+      colorTile->ui[y+1][x+0] = fragc2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y+1][x+1] = c3;
+      colorTile->ui[y+1][x+1] = fragc3;
 #endif
 }
-- 
cgit v1.2.3


From 5ab221549d5cdbf72817ff612464d83256765389 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 16:11:52 -0600
Subject: cell: evaluate multiple fragment inputs

---
 src/gallium/drivers/cell/spu/spu_tri.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index b7faae6d60..0a8fb56a62 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -356,8 +356,14 @@ emit_quad( int x, int y, mask_t mask )
          vector float fragZ = eval_z((float) x, (float) y);
 
          /* setup inputs */
+#if 0
          eval_coeff_soa(1, (float) x, (float) y, inputs);
-
+#else
+         uint i;
+         for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+            eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
+         }
+#endif
          ASSERT(spu.fragment_program);
          ASSERT(spu.fragment_ops);
 
-- 
cgit v1.2.3


From 858ced051551aa5d0ddd41936253d3a4ee5c142f Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Wed, 17 Sep 2008 02:30:20 -0600
Subject: CELL: fleshing out the blending fragment ops

- Added two new debug flags (to be used with the CELL_DEBUG environment
  variable).  The first, "CELL_DEBUG=fragops", activates SPE fragment
  ops debug messages.  The second, "CELL_DEBUG=fragopfallback", will
  eventually be used to disable the use of generated SPE code for
  fragment ops in favor of the default fallback reference routine.
  (During development, though, the parity of this flag is reversed:
  all users will get the reference code *unless* CELL_DEBUG=fragopfallback
  is set.  This will prevent hiccups in code generation from affecting
  the other developers.)

- Formalized debug message usage and macros in spu/spu_main.c.

- Added lots of new code to ppu/cell_gen_fragment.c to extend the
  number of supported source RGB factors from 4 to 15, and to
  complete the list of supported blend equations.

More coming, to complete the source and destination RGB and alpha
factors, and to complete the rest of the fragment operations...
---
 src/gallium/drivers/cell/common.h                |  11 +-
 src/gallium/drivers/cell/ppu/cell_context.c      |   2 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 272 ++++++++++++++++++++++-
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |   5 +
 src/gallium/drivers/cell/spu/spu_main.c          | 115 +++++-----
 5 files changed, 337 insertions(+), 68 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 8f08854117..f0ff96eb47 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -104,12 +104,11 @@
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
 
-
-#define CELL_DEBUG_CHECKER  (1 << 0)
-#define CELL_DEBUG_ASM      (1 << 1)
-#define CELL_DEBUG_SYNC     (1 << 2)
-
-
+#define CELL_DEBUG_CHECKER              (1 << 0)
+#define CELL_DEBUG_ASM                  (1 << 1)
+#define CELL_DEBUG_SYNC                 (1 << 2)
+#define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
+#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index b418271dca..62e213ea35 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -89,6 +89,8 @@ static const struct debug_named_value cell_debug_flags[] = {
    {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
    {"asm", CELL_DEBUG_ASM},        /**< dump SPU asm code */
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
+   {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
+   {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
    {NULL, 0}
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 06219d4e98..2c8c9e0d2c 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -229,7 +229,36 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    spe_release_register(f, amask_reg);
 }
 
+/* This is a convenient and oft-used sequence.  It chooses
+ * the smaller of each element of reg1 and reg2, and combines them
+ * into the result register, as follows:
+ * 
+ * The Float Compare Greater Than (fcgt) instruction will put
+ * 1s into compare_reg where reg1 > reg2, and 0s where reg1 <= reg2.
+ *
+ * Then the Select Bits (selb) instruction will take bits from
+ * reg1 where compare_reg is 0, and from reg2 where compare_reg is
+ * 1.  Ergo, result_reg will have the bits from reg1 where reg1 <= reg2,
+ * and the bits from reg2 where reg1 > reg2, which is exactly the
+ * MIN operation.
+ */
+#define FLOAT_VECTOR_MIN(f, result_reg, reg1, reg2) {\
+   int compare_reg = spe_allocate_available_register(f); \
+   spe_fcgt(f, compare_reg, reg1, reg2); \
+   spe_selb(f, result_reg, reg1, reg2, compare_reg); \
+   spe_release_register(f, compare_reg); \
+}
 
+/* The FLOAT_VECTOR_MAX sequence is similar to the FLOAT_VECTOR_MIN 
+ * sequence above, except that the registers specified when selecting
+ * bits are reversed.
+ */
+#define FLOAT_VECTOR_MAX(f, result_reg, reg1, reg2) {\
+   int compare_reg = spe_allocate_available_register(f); \
+   spe_fcgt(f, compare_reg, reg1, reg2); \
+   spe_selb(f, result_reg, reg2, reg1, compare_reg); \
+   spe_release_register(f, compare_reg); \
+}
 
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
@@ -242,6 +271,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  */
 static void
 gen_blend(const struct pipe_blend_state *blend,
+          const struct pipe_blend_color *blend_color,
           struct spe_function *f,
           enum pipe_format color_format,
           int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
@@ -262,10 +292,53 @@ gen_blend(const struct pipe_blend_state *blend,
    int fbB_reg = spe_allocate_available_register(f);
    int fbA_reg = spe_allocate_available_register(f);
 
-   int one_reg = spe_allocate_available_register(f);
    int tmp_reg = spe_allocate_available_register(f);
 
-   boolean one_reg_set = false; /* avoid setting one_reg more than once */
+   /* These values might or might not eventually get put into
+    * registers.  We avoid allocating them and setting them until
+    * they're actually needed; then we avoid setting them more than
+    * once, and release them at the end of code generation.
+    */
+   boolean one_reg_set = false; 
+   int one_reg;
+#define SET_ONE_REG_IF_UNSET(f) if (!one_reg_set) {\
+   one_reg = spe_allocate_available_register(f); \
+   spe_load_float(f, one_reg, 1.0f); \
+   one_reg_set = true; \
+}
+#define RELEASE_ONE_REG_IF_USED(f) if (one_reg_set) {\
+   spe_release_register(f, one_reg); \
+}
+  
+   boolean const_color_set = false;
+   int constR_reg, constG_reg, constB_reg;
+#define SET_CONST_COLOR_IF_UNSET(f, blend_color) if (!const_color_set) {\
+   constR_reg = spe_allocate_available_register(f); \
+   constG_reg = spe_allocate_available_register(f); \
+   constG_reg = spe_allocate_available_register(f); \
+   spe_load_float(f, constR_reg, blend_color->color[0]); \
+   spe_load_float(f, constG_reg, blend_color->color[1]); \
+   spe_load_float(f, constB_reg, blend_color->color[2]); \
+   const_color_set = true;\
+}
+#define RELEASE_CONST_COLOR_IF_USED(f) if (const_color_set) {\
+   spe_release_register(f, constR_reg); \
+   spe_release_register(f, constG_reg); \
+   spe_release_register(f, constB_reg); \
+}
+
+   boolean const_alpha_set = false;
+   int constA_reg;
+#define SET_CONST_ALPHA_IF_UNSET(f, blend_color) if (!const_alpha_set) {\
+   constA_reg = spe_allocate_available_register(f); \
+   spe_load_float(f, constA_reg, blend_color->color[3]); \
+   const_alpha_set = true; \
+}
+#define RELEASE_CONST_ALPHA_IF_USED(f) if (const_alpha_set) {\
+   spe_release_register(f, constA_reg); \
+}
+
+   /* Real code starts here */
 
    ASSERT(blend->blend_enable);
 
@@ -348,30 +421,161 @@ gen_blend(const struct pipe_blend_state *blend,
 
 
    /*
-    * Compute Src RGB terms
+    * Compute Src RGB terms.  We're actually looking for the value
+    * of (the appropriate RGB factors) * (the incoming source RGB color).
     */
    switch (blend->rgb_src_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (R,G,B) */
       spe_move(f, term1R_reg, fragR_reg);
       spe_move(f, term1G_reg, fragG_reg);
       spe_move(f, term1B_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term1R_reg);
-      spe_zero(f, term1G_reg);
-      spe_zero(f, term1B_reg);
+      /* factors = (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term1R_reg, 0.0f);
+      spe_load_float(f, term1G_reg, 0.0f);
+      spe_load_float(f, term1B_reg, 0.0f);
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
       spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
       spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
       spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
       spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
       spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
       spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) */
+      /* we'll need the optional constant {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* tmp = 1 - R */
+      spe_fs(f, tmp_reg, one_reg, fragR_reg);
+      /* term = R * tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      /* repeat for G and B */
+      spe_fs(f, tmp_reg, one_reg, fragG_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fs(f, tmp_reg, one_reg, fragB_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) */
+      /* we'll need the optional constant {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* tmp = 1 - Rfb */
+      spe_fs(f, tmp_reg, one_reg, fbR_reg);
+      /* term = R * tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      /* repeat for G and B */
+      spe_fs(f, tmp_reg, one_reg, fbG_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fs(f, tmp_reg, one_reg, fbB_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) */
+      /* we'll need the optional constant {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* tmp = 1 - A */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* term = R * tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      /* repeat for G and B with the same (1-A) factor */
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) */
+      /* we'll need the optional constant {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* tmp = 1 - A */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* term = R * tmp, G*tmp, and B*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We'll need the optional blend color registers */
+      SET_CONST_COLOR_IF_UNSET(f,blend_color)
+      /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+      spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
+      /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+      spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need both the optional {1,1,1,1} register, and the optional
+       * constant color registers
+       */
+      SET_ONE_REG_IF_UNSET(f)
+      SET_CONST_COLOR_IF_UNSET(f, blend_color)
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) */
+      spe_fs(f, tmp_reg, one_reg, constR_reg);
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fs(f, tmp_reg, one_reg, constG_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fs(f, tmp_reg, one_reg, constB_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional {1,1,1,1} register and the optional 
+       * constant alpha register
+       */
+      SET_ONE_REG_IF_UNSET(f)
+      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) */
+      spe_fs(f, tmp_reg, one_reg, constA_reg);
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* We'll need the optional {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
+       * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       */
+      /* tmp = 1 - Afb */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* tmp = min(A,tmp) */
+      FLOAT_VECTOR_MIN(f, tmp_reg, fragA_reg, tmp_reg)
+      /* term = R*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+
+      /* non-OpenGL cases? */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
    default:
       ASSERT(0);
    }
@@ -421,6 +625,7 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
       /* one = {1.0, 1.0, 1.0, 1.0} */
       if (!one_reg_set) {
+         one_reg = spe_allocate_available_register(f);
          spe_load_float(f, one_reg, 1.0f);
          one_reg_set = true;
       }
@@ -432,6 +637,14 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
       break;
       /* XXX more cases */
+      // GL_ONE_MINUS_SRC_COLOR
+      // GL_DST_COLOR
+      // GL_ONE_MINUS_DST_COLOR
+      // GL_DST_ALPHA
+      // GL_CONSTANT_COLOR
+      // GL_ONE_MINUS_CONSTANT_COLOR
+      // GL_CONSTANT_ALPHA
+      // GL_ONE_MINUS_CONSTANT_ALPHA
    default:
       ASSERT(0);
    }
@@ -452,6 +665,7 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
       /* one = {1.0, 1.0, 1.0, 1.0} */
       if (!one_reg_set) {
+         one_reg = spe_allocate_available_register(f);
          spe_load_float(f, one_reg, 1.0f);
          one_reg_set = true;
       }
@@ -461,6 +675,14 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
       break;
       /* XXX more cases */
+      // GL_ONE_MINUS_SRC_COLOR
+      // GL_DST_COLOR
+      // GL_ONE_MINUS_DST_COLOR
+      // GL_DST_ALPHA
+      // GL_CONSTANT_COLOR
+      // GL_ONE_MINUS_CONSTANT_COLOR
+      // GL_CONSTANT_ALPHA
+      // GL_ONE_MINUS_CONSTANT_ALPHA
    default:
       ASSERT(0);
    }
@@ -479,7 +701,21 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
       spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
+      spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
+      spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      FLOAT_VECTOR_MIN(f, fragR_reg, term1R_reg, term2R_reg)
+      FLOAT_VECTOR_MIN(f, fragG_reg, term1G_reg, term2G_reg)
+      FLOAT_VECTOR_MIN(f, fragB_reg, term1B_reg, term2B_reg)
+      break;
+   case PIPE_BLEND_MAX:
+      FLOAT_VECTOR_MAX(f, fragR_reg, term1R_reg, term2R_reg)
+      FLOAT_VECTOR_MAX(f, fragG_reg, term1G_reg, term2G_reg)
+      FLOAT_VECTOR_MAX(f, fragB_reg, term1B_reg, term2B_reg)
+      break;
    default:
       ASSERT(0);
    }
@@ -494,7 +730,15 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLEND_SUBTRACT:
       spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      FLOAT_VECTOR_MIN(f, fragA_reg, term1A_reg, term2A_reg)
+      break;
+   case PIPE_BLEND_MAX:
+      FLOAT_VECTOR_MAX(f, fragA_reg, term1A_reg, term2A_reg)
+      break;
    default:
       ASSERT(0);
    }
@@ -514,8 +758,12 @@ gen_blend(const struct pipe_blend_state *blend,
    spe_release_register(f, fbB_reg);
    spe_release_register(f, fbA_reg);
 
-   spe_release_register(f, one_reg);
    spe_release_register(f, tmp_reg);
+
+   /* Free any optional registers that actually got used */
+   RELEASE_ONE_REG_IF_USED(f)
+   RELEASE_CONST_COLOR_IF_USED(f)
+   RELEASE_CONST_ALPHA_IF_USED(f)
 }
 
 
@@ -629,6 +877,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const struct pipe_depth_stencil_alpha_state *dsa =
       &cell->depth_stencil->base;
    const struct pipe_blend_state *blend = &cell->blend->base;
+   const struct pipe_blend_color *blend_color = &cell->blend_color;
    const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
 
    /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -651,7 +900,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
-   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
    spe_allocate_register(f, x_reg);
    spe_allocate_register(f, y_reg);
    spe_allocate_register(f, color_tile_reg);
@@ -816,7 +1064,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
 
    if (blend->blend_enable) {
-      gen_blend(blend, f, color_format,
+      gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 2da3097983..8a389cd6aa 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -100,14 +100,19 @@ cell_emit_state(struct cell_context *cell)
             = cell_batch_alloc(cell, sizeof(*fops));
       struct spe_function spe_code;
 
+      /* Prepare the buffer that will hold the generated code. */
+      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
       /* generate new code */
       cell_gen_fragment_function(cell, &spe_code);
+
       /* put the new code into the batch buffer */
       fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
       memcpy(&fops->code, spe_code.store,
              SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
       fops->dsa = cell->depth_stencil->base;
       fops->blend = cell->blend->base;
+
       /* free codegen buffer */
       spe_release_func(&spe_code);
    }
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 78260c4259..da2cb08972 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -50,7 +50,31 @@ helpful headers:
 /opt/cell/sdk/usr/include/libmisc.h
 */
 
+/* Set to 0 to disable all extraneous debugging code */
+#define DEBUG 1
+
+#if DEBUG
 boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
+
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define DEBUG_PRINTF(format,...) \
+   if (Debug) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+
+#else
+
+#define DEBUG_PRINTF(...)
+#define D_PRINTF(...)
+
+#endif
 
 struct spu_global spu;
 
@@ -133,9 +157,7 @@ really_clear_tiles(uint surfaceIndex)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   if (Debug)
-      printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
-             clear->surface, clear->value);
+   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
@@ -203,17 +225,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
 #endif /* CLEAR_OPT */
 
-   if (Debug)
-      printf("SPU %u: CLEAR SURF done\n", spu.init.id);
+   DEBUG_PRINTF("CLEAR SURF done\n");
 }
 
 
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
-   if (Debug)
-      printf("SPU %u: RELEASE VERTS %u\n",
-             spu.init.id, release->vertex_buf);
+   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
    ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
@@ -228,16 +247,30 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
-   if (Debug)
-      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
 
-   /* Point function pointer at new code */
-   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   /* Parity twist!  For now, always use the fallback code by default,
+    * only switching to codegen when specifically requested.  This
+    * allows us to develop freely without risking taking down the
+    * branch.
+    *
+    * Later, the parity of this check will be reversed, so that
+    * codegen is *always* used, unless we specifically indicate that
+    * we don't want it.
+    *
+    * Eventually, the option will be removed completely, because in
+    * final code we'll always use codegen and won't even provide the
+    * raw state records that the fallback code requires.
+    */
+   if (spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) {
+      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   }
+   /* otherwise, the default fallback code remains in place */
 
    spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
    spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
@@ -247,8 +280,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 {
-   if (Debug)
-      printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_program_code, fp->code,
           SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
@@ -262,9 +294,7 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
-   if (Debug)
-      printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
-             spu.init.id,
+   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
              cmd->width,
              cmd->height,
              cmd->color_start,
@@ -309,9 +339,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
-   if (Debug)
-      printf("SPU %u: SAMPLER [%u]\n",
-             spu.init.id, sampler->unit);
+   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
@@ -328,11 +356,9 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint width = texture->width;
    const uint height = texture->height;
 
-   if (Debug) {
-      printf("SPU %u: TEXTURE [%u] at %p  size %u x %u\n", spu.init.id,
+   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
              texture->unit, texture->start,
              texture->width, texture->height);
-   }
 
    spu.texture[unit].start = texture->start;
    spu.texture[unit].width = width;
@@ -351,10 +377,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   if (Debug) {
-      printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
-             vinfo->num_attribs);
-   }
+   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
    ASSERT(vinfo->num_attribs >= 1);
    ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
@@ -393,8 +416,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
 static void
 cmd_finish(void)
 {
-   if (Debug)
-      printf("SPU %u: FINISH\n", spu.init.id);
+   DEBUG_PRINTF("FINISH\n");
    really_clear_tiles(0);
    /* wait for all outstanding DMAs to finish */
    mfc_write_tag_mask(~0);
@@ -419,9 +441,8 @@ cmd_batch(uint opcode)
    const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
-   if (Debug)
-      printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
-             spu.init.id, buf, size, spu.init.buffers[buf]);
+   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+             buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
 
@@ -440,8 +461,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   if (Debug)
-      printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
+   DEBUG_PRINTF("release batch buf %u\n", buf);
    release_buffer(buf);
 
    /*
@@ -571,8 +591,7 @@ cmd_batch(uint opcode)
       }
    }
 
-   if (Debug)
-      printf("SPU %u: BATCH complete\n", spu.init.id);
+   DEBUG_PRINTF("BATCH complete\n");
 }
 
 
@@ -585,8 +604,7 @@ main_loop(void)
    struct cell_command cmd;
    int exitFlag = 0;
 
-   if (Debug)
-      printf("SPU %u: Enter main loop\n", spu.init.id);
+   DEBUG_PRINTF("Enter main loop\n");
 
    ASSERT((sizeof(struct cell_command) & 0xf) == 0);
    ASSERT_ALIGN16(&cmd);
@@ -595,14 +613,12 @@ main_loop(void)
       unsigned opcode;
       int tag = 0;
 
-      if (Debug)
-         printf("SPU %u: Wait for cmd...\n", spu.init.id);
+      DEBUG_PRINTF("Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
 
-      if (Debug)
-         printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode);
+      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
 
       /* command payload */
       mfc_get(&cmd,  /* dest */
@@ -619,8 +635,7 @@ main_loop(void)
 
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
-         if (Debug)
-            printf("SPU %u: EXIT\n", spu.init.id);
+         DEBUG_PRINTF("EXIT\n");
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
@@ -632,13 +647,12 @@ main_loop(void)
          cmd_batch(opcode);
          break;
       default:
-         printf("Bad opcode!\n");
+         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
       }
 
    }
 
-   if (Debug)
-      printf("SPU %u: Exit main loop\n", spu.init.id);
+   DEBUG_PRINTF("Exit main loop\n");
 
    spu_dcache_report();
 }
@@ -653,7 +667,8 @@ one_time_init(void)
    invalidate_tex_cache();
 
    /* Install default/fallback fragment processing function.
-    * This will normally be overriden by a code-gen'd function.
+    * This will normally be overriden by a code-gen'd function
+    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
     */
    spu.fragment_ops = spu_fallback_fragment_ops;
 }
@@ -685,8 +700,8 @@ main(main_param_t speid, main_param_t argp)
 
    one_time_init();
 
-   if (Debug)
-      printf("SPU: main() speid=%lu\n", (unsigned long) speid);
+   DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
    mfc_get(&spu.init,  /* dest */
            (unsigned int) argp, /* src */
-- 
cgit v1.2.3


From 4485ac87c2cf69bef443ac36cccaa70054c6a7bb Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 18 Sep 2008 16:36:37 -0600
Subject: CELL: mark several transient files as .gitignore

progs/demos: added new demo "fbo_firecube"

progs/glsl: added new demo "pointcoord"

src/gallium/drivers/cell/spu: added the g3d_spu executable, a Cell SPU
executable file, which seems to be occasionally built as part of the
cell driver

src/glu/sgi: added "exptmp", a byproduct of the "mklib" process that
sometimes gets deleted and sometimes not.
---
 progs/demos/.gitignore                  | 1 +
 progs/glsl/.gitignore                   | 1 +
 src/gallium/drivers/cell/spu/.gitignore | 1 +
 src/glu/sgi/.gitignore                  | 1 +
 4 files changed, 4 insertions(+)
 create mode 100644 src/gallium/drivers/cell/spu/.gitignore
 create mode 100644 src/glu/sgi/.gitignore

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/progs/demos/.gitignore b/progs/demos/.gitignore
index 3693fafd4e..f033a0505d 100644
--- a/progs/demos/.gitignore
+++ b/progs/demos/.gitignore
@@ -8,6 +8,7 @@ cubemap
 drawpix
 engine
 extfuncs.h
+fbo_firecube
 fire
 fogcoord
 fplight
diff --git a/progs/glsl/.gitignore b/progs/glsl/.gitignore
index 09340ff2ad..978e31c6cc 100644
--- a/progs/glsl/.gitignore
+++ b/progs/glsl/.gitignore
@@ -7,6 +7,7 @@ extfuncs.h
 mandelbrot
 multitex
 noise
+pointcoord
 points
 readtex.c
 readtex.h
diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore
new file mode 100644
index 0000000000..2be9a2d324
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/.gitignore
@@ -0,0 +1 @@
+g3d_spu
diff --git a/src/glu/sgi/.gitignore b/src/glu/sgi/.gitignore
new file mode 100644
index 0000000000..279ea7d434
--- /dev/null
+++ b/src/glu/sgi/.gitignore
@@ -0,0 +1 @@
+exptmp
-- 
cgit v1.2.3


From de0a6dc04a5b508472cc0cce4481ac3bb95fda3b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 10:42:21 -0600
Subject: cell: the test for CELL_DEBUG_FRAGMENT_OP_FALLBACK in
 cmd_state_fragment_ops() was inverted

---
 src/gallium/drivers/cell/spu/spu_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index da2cb08972..d99dd12d2a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -267,7 +267,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
     * final code we'll always use codegen and won't even provide the
     * raw state records that the fallback code requires.
     */
-   if (spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) {
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
       spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
    }
    /* otherwise, the default fallback code remains in place */
-- 
cgit v1.2.3


From 0500ae574f4192dd1972baa23e9c62f992042ab9 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 10:50:46 -0600
Subject: cell: issue warning to stderr when using fallback fragment ops

---
 src/gallium/drivers/cell/spu/spu_main.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index d99dd12d2a..6b62417558 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -247,6 +247,8 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
+   static int warned = 0;
+
    DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
@@ -270,7 +272,13 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
    if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
       spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
    }
-   /* otherwise, the default fallback code remains in place */
+   else {
+      /* otherwise, the default fallback code remains in place */
+      if (!warned) {
+         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+         warned = 1;
+      }
+   }
 
    spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
    spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
-- 
cgit v1.2.3


From aca74a4d92ba6f99d756ab703a78efc3918b3840 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 17:55:10 -0600
Subject: cell: make sure the fragment ops and fragment shader code buffer is
 at a 32-byte boundary

To make sure even/odd instructions hit the right pipes.
---
 src/gallium/drivers/cell/spu/spu_main.c | 4 +++-
 src/gallium/drivers/cell/spu/spu_main.h | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 6b62417558..b4d30228f7 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -705,6 +705,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code) % 32 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 32 == 0);
 
    one_time_init();
 
@@ -721,7 +723,7 @@ main(main_param_t speid, main_param_t argp)
 
 #if 0
    if (spu.init.id==0)
-      spu_test_misc();
+      spu_test_misc(spu.init.id);
 #endif
 
    main_loop();
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2c7b625840..72e540fcff 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -143,13 +143,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
-   /** Current fragment ops machine code */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   /** Current fragment ops machine code, at 32-byte boundary */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN32_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
-   /** Current fragment program machine code */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+   /** Current fragment program machine code, at 32-byte boundary */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN32_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
-- 
cgit v1.2.3


From 164fb1299e1614ce05ae539d832567469eedb402 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 09:38:40 -0600
Subject: cell: checkpoint: support for function calls in SPU shaders

Will be used for instructions like SIN/COS/POW/TEX/etc.  The PPU needs to
know the address of some functions in the SPU address space.  Send that
info to the PPU/main memory rather than patch up shaders on the SPU side.
Not finished/tested yet...
---
 src/gallium/drivers/cell/common.h           |  18 ++++-
 src/gallium/drivers/cell/ppu/cell_context.h |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c  |  81 ++++++++++++++++++++-
 src/gallium/drivers/cell/ppu/cell_spu.c     |   8 +++
 src/gallium/drivers/cell/spu/Makefile       |   3 +-
 src/gallium/drivers/cell/spu/spu_funcs.c    | 106 ++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_funcs.h    |  35 +++++++++
 src/gallium/drivers/cell/spu/spu_main.c     |   5 ++
 8 files changed, 254 insertions(+), 3 deletions(-)
 create mode 100644 src/gallium/drivers/cell/spu/spu_funcs.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_funcs.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index f0ff96eb47..99329fd8e2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -130,7 +130,7 @@ struct cell_command_fragment_ops
 #define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
 
 /**
- * Command to send a fragment progra to SPUs.
+ * Command to send a fragment program to SPUs.
  */
 struct cell_command_fragment_program
 {
@@ -267,6 +267,20 @@ struct cell_command
 } ALIGN16_ATTRIB;
 
 
+#define MAX_SPU_FUNCTIONS 12
+/**
+ * Used to tell the PPU about the address of particular functions in the
+ * SPU's address space.
+ */
+struct cell_spu_function_info
+{
+   uint num;
+   char names[MAX_SPU_FUNCTIONS][16];
+   uint addrs[MAX_SPU_FUNCTIONS];
+   char pad[12];   /**< Pad struct to multiple of 16 bytes (256 currently) */
+};
+
+
 /** This is the object passed to spe_create_thread() */
 struct cell_init_info
 {
@@ -278,6 +292,8 @@ struct cell_init_info
    /** Buffers for command batches, vertex/index data */
    ubyte *buffers[CELL_NUM_BUFFERS];
    uint *buffer_status;  /**< points at cell_context->buffer_status */
+
+   struct cell_spu_function_info *spu_functions;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 14914b9c6f..a9ad84bb18 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -149,6 +149,7 @@ struct cell_context
    /** Mapped constant buffers */
    void *mapped_constants[PIPE_SHADER_TYPES];
 
+   struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
 
    uint num_spus;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 8972b5b1ea..fd12af19ce 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -37,7 +37,7 @@
  * \author Brian Paul
  */
 
-
+#include <math.h>
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -64,6 +64,7 @@
  */
 struct codegen
 {
+   struct cell_context *cell;
    int inputs_reg;      /**< 1st function parameter */
    int outputs_reg;     /**< 2nd function parameter */
    int constants_reg;   /**< 3rd function parameter */
@@ -1076,6 +1077,76 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+#if 0
+static void
+print_functions(struct cell_context *cell)
+{
+   struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i;
+   for (i = 0; i < funcs->num; i++) {
+      printf("SPU func %u: %s at %u\n",
+             i, funcs->names[i], funcs->addrs[i]);
+   }
+}
+#endif
+
+
+/**
+ * Emit code to call a SPU function.
+ * Used to implement instructions like SIN/COS/POW/TEX/etc.
+ */
+static boolean
+emit_function_call(struct codegen *gen,
+                   const struct tgsi_full_instruction *inst,
+                   char *funcname, uint num_args)
+{
+   const struct cell_spu_function_info *funcs = &gen->cell->spu_functions;
+   char comment[100];
+   uint addr;
+   int ch;
+
+   assert(num_args <= 2);
+
+   /* lookup function address */
+   {
+      uint i;
+      addr = 0;
+      for (i = 0; i < funcs->num; i++) {
+         if (strcmp(funcs->names[i], funcname) == 0) {
+            addr = funcs->addrs[i];
+         }
+      }
+      assert(addr && "spu function not found");
+   }
+
+   sprintf(comment, "CALL %s:", funcname);
+   spe_comment(gen->f, -4, comment);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int s_regs[3];
+         uint a;
+         for (a = 0; a < num_args; a++) {
+            s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+         }
+
+         /* XXX not done */
+         (void) s_regs;
+         (void) d_reg;
+
+         spe_bisl(gen->f, SPE_REG_RA, addr, 0, 0); /* XXX untested! */
+
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
@@ -1303,6 +1374,13 @@ emit_instruction(struct codegen *gen,
    case TGSI_OPCODE_END:
       return emit_END(gen);
 
+   case TGSI_OPCODE_COS:
+      return emit_function_call(gen, inst, "spu_cos", 1);
+   case TGSI_OPCODE_SIN:
+      return emit_function_call(gen, inst, "spu_sin", 1);
+   case TGSI_OPCODE_POW:
+      return emit_function_call(gen, inst, "spu_pow", 2);
+
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
    case TGSI_OPCODE_ELSE:
@@ -1431,6 +1509,7 @@ cell_gen_fragment_program(struct cell_context *cell,
    struct codegen gen;
 
    memset(&gen, 0, sizeof(gen));
+   gen.cell = cell;
    gen.f = f;
 
    /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 9508227e29..df020c4146 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -36,6 +36,7 @@
 #include "cell_spu.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
+#include "util/u_memory.h"
 #include "cell/common.h"
 
 
@@ -131,6 +132,11 @@ cell_start_spus(struct cell_context *cell)
    ASSERT_ALIGN16(&cell_global.inits[0]);
    ASSERT_ALIGN16(&cell_global.inits[1]);
 
+   /*
+    * Initialize the global 'inits' structure for each SPU.
+    * A pointer to the init struct will be passed to each SPU.
+    * The SPUs will then each grab their init info with mfc_get().
+    */
    for (i = 0; i < cell->num_spus; i++) {
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
@@ -141,6 +147,8 @@ cell_start_spus(struct cell_context *cell)
       }
       cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
 
+      cell_global.inits[i].spu_functions = &cell->spu_functions;
+
       cell_global.spe_contexts[i] = spe_context_create(0, NULL);
       if (!cell_global.spe_contexts[i]) {
          fprintf(stderr, "spe_context_create() failed\n");
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 1ae0dfb8c1..c2db85247e 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -16,8 +16,9 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 
 SOURCES = \
-	spu_main.c \
+	spu_funcs.c \
 	spu_dcache.c \
+	spu_main.c \
 	spu_per_fragment_op.c \
 	spu_render.c \
 	spu_texture.c \
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
new file mode 100644
index 0000000000..d174956518
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -0,0 +1,106 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU functions accessed by shaders.
+ *
+ * Authors: Brian Paul
+ */
+
+
+#include <string.h>
+#include <libmisc.h>
+#include <cos8_v.h>
+#include <sin8_v.h>
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_funcs.h"
+
+
+#define M_PI 3.1415926
+
+
+static vector float
+spu_cos(vector float x)
+{
+   static const float scale = 1.0 / (2.0 * M_PI);
+   x = x * spu_splats(scale); /* normalize */
+   return _cos8_v(x);
+}
+
+static vector float
+spu_sin(vector float x)
+{
+   static const float scale = 1.0 / (2.0 * M_PI);
+   x = x * spu_splats(scale); /* normalize */
+   return _sin8_v(x);   /* 8-bit accuracy enough?? */
+}
+
+
+static void
+add_func(struct cell_spu_function_info *spu_functions,
+             const char *name, void *addr)
+{
+   uint n = spu_functions->num;
+   ASSERT(strlen(name) < 16);
+   strcpy(spu_functions->names[n], name);
+   spu_functions->addrs[n] = (uint) addr;
+   spu_functions->num++;
+}
+
+
+/**
+ * Return info about the SPU's function to the PPU / main memory.
+ * The PPU needs to know the address of some SPU-side functions so
+ * that we can generate shader code with function calls.
+ */
+void
+return_function_info(void)
+{
+   struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+   int tag = TAG_MISC;
+
+   ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
+
+   funcs.num = 0;
+   add_func(&funcs, "spu_cos", &spu_cos);
+   add_func(&funcs, "spu_sin", &spu_sin);
+
+   /* Send the function info back to the PPU / main memory */
+   mfc_put((void *) &funcs,  /* src in local store */
+           (unsigned int) spu.init.spu_functions, /* dst in main memory */
+           sizeof(funcs),  /* bytes */
+           tag,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << tag);
+}
+
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h
new file mode 100644
index 0000000000..3adb6ae99f
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_FUNCS_H
+#define SPU_FUNCS_H
+
+extern void
+return_function_info(void);
+
+#endif
+
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index b4d30228f7..6ef65d5645 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -32,6 +32,7 @@
 #include <stdio.h>
 #include <libmisc.h>
 
+#include "spu_funcs.h"
 #include "spu_main.h"
 #include "spu_render.h"
 #include "spu_per_fragment_op.h"
@@ -721,6 +722,10 @@ main(main_param_t speid, main_param_t argp)
            0  /* rid */);
    wait_on_mask( 1 << tag );
 
+   if (spu.init.id == 0) {
+      return_function_info();
+   }
+
 #if 0
    if (spu.init.id==0)
       spu_test_misc(spu.init.id);
-- 
cgit v1.2.3


From bac5900a14b85a6513fae7eef19a5ed1d26b2011 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 09:58:17 -0600
Subject: cell: align instruction buffers to 8-byte, not 32-byte boundary

---
 src/gallium/drivers/cell/spu/spu_main.c | 4 ++--
 src/gallium/drivers/cell/spu/spu_main.h | 8 ++++----
 src/gallium/include/pipe/p_compiler.h   | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 6ef65d5645..8f3e3785c1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -706,8 +706,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code) % 32 == 0);
-   ASSERT(((unsigned long) &spu.fragment_program_code) % 32 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 72e540fcff..29a305232e 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -143,13 +143,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
-   /** Current fragment ops machine code, at 32-byte boundary */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN32_ATTRIB;
+   /** Current fragment ops machine code, at 8-byte boundary */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
-   /** Current fragment program machine code, at 32-byte boundary */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN32_ATTRIB;
+   /** Current fragment program machine code, at 8-byte boundary */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 1e702c7fa7..7bcebd3d6b 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -144,12 +144,12 @@ typedef unsigned char boolean;
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) ))
 #define ALIGN16_ASSIGN(NAME) NAME##___aligned
 #define ALIGN16_ATTRIB  __attribute__(( aligned( 16 ) ))
-#define ALIGN32_ATTRIB  __attribute__(( aligned( 32 ) ))
+#define ALIGN8_ATTRIB  __attribute__(( aligned( 8 ) ))
 #else
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___unaligned[SIZE + 1]
 #define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned)
 #define ALIGN16_ATTRIB
-#define ALIGN32_ATTRIB
+#define ALIGN8_ATTRIB
 #endif
 
 
-- 
cgit v1.2.3


From a1189ea882714282b884d37e530cd638dd4ca660 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 10:00:14 -0600
Subject: cell: move really_clear_tiles()

---
 src/gallium/drivers/cell/spu/spu_main.c | 38 ---------------------------------
 src/gallium/drivers/cell/spu/spu_tile.c | 37 ++++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_tile.h |  6 ++++--
 3 files changed, 41 insertions(+), 40 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 8f3e3785c1..b45e79a30b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -117,44 +117,6 @@ release_buffer(uint buffer)
 }
 
 
-/**
- * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
- * tiles back to the main framebuffer.
- */
-static void
-really_clear_tiles(uint surfaceIndex)
-{
-   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   uint i;
-
-   if (surfaceIndex == 0) {
-      clear_c_tile(&spu.ctile);
-
-      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-         uint tx = i % spu.fb.width_tiles;
-         uint ty = i / spu.fb.width_tiles;
-         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
-            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
-         }
-      }
-   }
-   else {
-      clear_z_tile(&spu.ztile);
-
-      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-         uint tx = i % spu.fb.width_tiles;
-         uint ty = i / spu.fb.width_tiles;
-         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
-            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
-      }
-   }
-
-#if 0
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
-#endif
-}
-
-
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
index 216a33126b..6905015a48 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.c
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -87,3 +87,40 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
            0  /* rid */);
 }
 
+
+/**
+ * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
+ * tiles back to the main framebuffer.
+ */
+void
+really_clear_tiles(uint surfaceIndex)
+{
+   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   uint i;
+
+   if (surfaceIndex == 0) {
+      clear_c_tile(&spu.ctile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         }
+      }
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+#if 0
+   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
index 1b5491112d..7bfb52be8f 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.h
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -36,12 +36,14 @@
 
 
-void
+extern void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
 
-void
+extern void
 put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
 
+extern void
+really_clear_tiles(uint surfaceIndex);
 
 
 static INLINE void
-- 
cgit v1.2.3


From f45d39fa34ca36839c684fdcadd1476360de3a63 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 10:02:58 -0600
Subject: cell: move debug macros into new spu_debug.h

---
 src/gallium/drivers/cell/spu/spu_debug.h | 60 ++++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_main.c  | 30 ++--------------
 2 files changed, 63 insertions(+), 27 deletions(-)
 create mode 100644 src/gallium/drivers/cell/spu/spu_debug.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
new file mode 100644
index 0000000000..bbe5889c4b
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -0,0 +1,60 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_DEBUG_H
+#define SPU_DEBUG_H
+
+
+/* Set to 0 to disable all extraneous debugging code */
+#define DEBUG 1
+
+#if DEBUG
+boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
+
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define DEBUG_PRINTF(format,...) \
+   if (Debug) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+
+#else
+
+#define DEBUG_PRINTF(...)
+#define D_PRINTF(...)
+
+#endif
+
+
+#endif /* SPU_DEBUG_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index b45e79a30b..ea01728824 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -32,6 +32,8 @@
 #include <stdio.h>
 #include <libmisc.h>
 
+#include "pipe/p_defines.h"
+
 #include "spu_funcs.h"
 #include "spu_main.h"
 #include "spu_render.h"
@@ -41,8 +43,8 @@
 //#include "spu_test.h"
 #include "spu_vertex_shader.h"
 #include "spu_dcache.h"
+#include "spu_debug.h"
 #include "cell/common.h"
-#include "pipe/p_defines.h"
 
 
 /*
@@ -51,32 +53,6 @@ helpful headers:
 /opt/cell/sdk/usr/include/libmisc.h
 */
 
-/* Set to 0 to disable all extraneous debugging code */
-#define DEBUG 1
-
-#if DEBUG
-boolean Debug = FALSE;
-boolean force_fragment_ops_fallback = TRUE;
-
-/* These debug macros use the unusual construction ", ##__VA_ARGS__"
- * which expands to the expected comma + args if variadic arguments
- * are supplied, but swallows the comma if there are no variadic
- * arguments (which avoids syntax errors that would otherwise occur).
- */
-#define DEBUG_PRINTF(format,...) \
-   if (Debug) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
-#define D_PRINTF(flag, format,...) \
-   if (spu.init.debug_flags & (flag)) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
-
-#else
-
-#define DEBUG_PRINTF(...)
-#define D_PRINTF(...)
-
-#endif
-
 struct spu_global spu;
 
 struct spu_vs_context draw;
-- 
cgit v1.2.3


From bb01c1a78eefeea6bc756d837fdd063660ac0230 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 10:10:08 -0600
Subject: cell: move debug-related declarations

---
 src/gallium/drivers/cell/spu/spu_debug.h | 4 ++--
 src/gallium/drivers/cell/spu/spu_main.c  | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
index bbe5889c4b..eeec052655 100644
--- a/src/gallium/drivers/cell/spu/spu_debug.h
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -34,8 +34,8 @@
 #define DEBUG 1
 
 #if DEBUG
-boolean Debug = FALSE;
-boolean force_fragment_ops_fallback = TRUE;
+extern boolean Debug;
+extern boolean force_fragment_ops_fallback;
 
 /* These debug macros use the unusual construction ", ##__VA_ARGS__"
  * which expands to the expected comma + args if variadic arguments
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index ea01728824..bc94674fe8 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -58,6 +58,12 @@ struct spu_global spu;
 struct spu_vs_context draw;
 
 
+#if DEBUG
+boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
+#endif
+
+
 /**
  * Buffers containing dynamically generated SPU code:
  */
-- 
cgit v1.2.3


From 9d00cd3fc726a3fe01b98fd222dd4c71b3e95d44 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 10:15:11 -0600
Subject: cell: move command processing code into new spu_command.c file

---
 src/gallium/drivers/cell/spu/Makefile      |   3 +-
 src/gallium/drivers/cell/spu/spu_command.c | 599 +++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_command.h |   7 +
 src/gallium/drivers/cell/spu/spu_main.c    | 558 +--------------------------
 4 files changed, 611 insertions(+), 556 deletions(-)
 create mode 100644 src/gallium/drivers/cell/spu/spu_command.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_command.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index c2db85247e..116453b79c 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -16,8 +16,9 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 
 SOURCES = \
-	spu_funcs.c \
+	spu_command.c \
 	spu_dcache.c \
+	spu_funcs.c \
 	spu_main.c \
 	spu_per_fragment_op.c \
 	spu_render.c \
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
new file mode 100644
index 0000000000..ec9da5d887
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -0,0 +1,599 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU command processing code
+ */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "spu_debug.h"
+#include "cell/common.h"
+
+
+struct spu_vs_context draw;
+
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
+    ALIGN16_ATTRIB;
+
+
+
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writting a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const uint status[4] ALIGN16_ATTRIB
+      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
+
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+static void
+cmd_clear_surface(const struct cell_command_clear_surface *clear)
+{
+   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+
+   if (clear->surface == 0) {
+      spu.fb.color_clear_value = clear->value;
+      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+            (spu.init.id << 20) | (spu.init.id << 28);
+         spu.fb.color_clear_value ^= x;
+      }
+   }
+   else {
+      spu.fb.depth_clear_value = clear->value;
+   }
+
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+   /* Simply set all tiles' status to CLEAR.
+    * When we actually begin rendering into a tile, we'll initialize it to
+    * the clear value.  If any tiles go untouched during the frame,
+    * really_clear_tiles() will set them to the clear value.
+    */
+   if (clear->surface == 0) {
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
+   }
+   else {
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
+   }
+
+#else
+
+   /*
+    * This path clears the whole framebuffer to the clear color right now.
+    */
+
+   /*
+   printf("SPU: %s num=%d w=%d h=%d\n",
+          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
+   */
+
+   /* init a single tile to the clear value */
+   if (clear->surface == 0) {
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+   }
+
+   /* walk over my tiles, writing the 'clear' tile's data */
+   {
+      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+      uint i;
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (clear->surface == 0)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         else
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+      wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   }
+
+#endif /* CLEAR_OPT */
+
+   DEBUG_PRINTF("CLEAR SURF done\n");
+}
+
+
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
+   release_buffer(release->vertex_buf);
+}
+
+
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   static int warned = 0;
+
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Copy state info (for fallback case only) */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
+   /* Parity twist!  For now, always use the fallback code by default,
+    * only switching to codegen when specifically requested.  This
+    * allows us to develop freely without risking taking down the
+    * branch.
+    *
+    * Later, the parity of this check will be reversed, so that
+    * codegen is *always* used, unless we specifically indicate that
+    * we don't want it.
+    *
+    * Eventually, the option will be removed completely, because in
+    * final code we'll always use codegen and won't even provide the
+    * raw state records that the fallback code requires.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
+      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   }
+   else {
+      /* otherwise, the default fallback code remains in place */
+      if (!warned) {
+         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+         warned = 1;
+      }
+   }
+
+   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+}
+
+
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_program_code, fp->code,
+          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+   /* Point function pointer at new code */
+   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
+static void
+cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
+{
+   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+             cmd->width,
+             cmd->height,
+             cmd->color_start,
+             cmd->color_format,
+             cmd->depth_format);
+
+   ASSERT_ALIGN16(cmd->color_start);
+   ASSERT_ALIGN16(cmd->depth_start);
+
+   spu.fb.color_start = cmd->color_start;
+   spu.fb.depth_start = cmd->depth_start;
+   spu.fb.color_format = cmd->color_format;
+   spu.fb.depth_format = cmd->depth_format;
+   spu.fb.width = cmd->width;
+   spu.fb.height = cmd->height;
+   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
+   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
+
+   switch (spu.fb.depth_format) {
+   case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
+      break;
+   default:
+      spu.fb.zsize = 0;
+      break;
+   }
+}
+
+
+static void
+cmd_state_sampler(const struct cell_command_sampler *sampler)
+{
+   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+
+   spu.sampler[sampler->unit] = sampler->state;
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
+   else
+      spu.sample_texture[sampler->unit] = sample_texture_nearest;
+}
+
+
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   const uint unit = texture->unit;
+   const uint width = texture->width;
+   const uint height = texture->height;
+
+   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
+             texture->unit, texture->start,
+             texture->width, texture->height);
+
+   spu.texture[unit].start = texture->start;
+   spu.texture[unit].width = width;
+   spu.texture[unit].height = height;
+
+   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+
+   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
+   spu.texture[unit].tex_size_mask = (vector unsigned int)
+         { width - 1, height - 1, 0, 0 };
+   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
+   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+}
+
+
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
+   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
+}
+
+
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned attr = vs_info->attr;
+
+   ASSERT(attr < PIPE_MAX_ATTRIBS);
+   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+   draw.vertex_fetch.size[attr] = vs_info->size;
+   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
+   draw.vertex_fetch.dirty = 1;
+}
+
+
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
+static void
+cmd_finish(void)
+{
+   DEBUG_PRINTF("FINISH\n");
+   really_clear_tiles(0);
+   /* wait for all outstanding DMAs to finish */
+   mfc_write_tag_mask(~0);
+   mfc_read_tag_status_all();
+   /* send mbox message to PPU */
+   spu_write_out_mbox(CELL_CMD_FINISH);
+}
+
+
+/**
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
+ * The opcode param encodes the location of the buffer and its size.
+ */
+static void
+cmd_batch(uint opcode)
+{
+   const uint buf = (opcode >> 8) & 0xff;
+   uint size = (opcode >> 16);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
+   uint pos;
+
+   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+             buf, size, spu.init.buffers[buf]);
+
+   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   size = ROUNDUP16(size);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   mfc_get(buffer,  /* dest */
+           (unsigned int) spu.init.buffers[buf],  /* src */
+           size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   /* Tell PPU we're done copying the buffer to local store */
+   DEBUG_PRINTF("release batch buf %u\n", buf);
+   release_buffer(buf);
+
+   /*
+    * Loop over commands in the batch buffer
+    */
+   for (pos = 0; pos < usize; /* no incr */) {
+      switch (buffer[pos]) {
+      /*
+       * rendering commands
+       */
+      case CELL_CMD_CLEAR_SURFACE:
+         {
+            struct cell_command_clear_surface *clr
+               = (struct cell_command_clear_surface *) &buffer[pos];
+            cmd_clear_surface(clr);
+            pos += sizeof(*clr) / 8;
+         }
+         break;
+      case CELL_CMD_RENDER:
+         {
+            struct cell_command_render *render
+               = (struct cell_command_render *) &buffer[pos];
+            uint pos_incr;
+            cmd_render(render, &pos_incr);
+            pos += pos_incr;
+         }
+         break;
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
+         {
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            pos += sizeof(*fops) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+         {
+            struct cell_command_fragment_program *fp
+               = (struct cell_command_fragment_program *) &buffer[pos];
+            cmd_state_fragment_program(fp);
+            pos += sizeof(*fp) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_SAMPLER:
+         {
+            struct cell_command_sampler *sampler
+               = (struct cell_command_sampler *) &buffer[pos];
+            cmd_state_sampler(sampler);
+            pos += sizeof(*sampler) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_TEXTURE:
+         {
+            struct cell_command_texture *texture
+               = (struct cell_command_texture *) &buffer[pos];
+            cmd_state_texture(texture);
+            pos += sizeof(*texture) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_VERTEX_INFO:
+         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         break;
+      case CELL_CMD_STATE_UNIFORMS:
+         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+         pos += 2;
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+         break;
+      case CELL_CMD_STATE_BIND_VS:
+#if 0
+         spu_bind_vertex_shader(&draw,
+                                (struct cell_shader_info *) &buffer[pos+1]);
+#endif
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         break;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         break;
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
+         break;
+      case CELL_CMD_FLUSH_BUFFER_RANGE: {
+	 struct cell_buffer_range *br = (struct cell_buffer_range *)
+	     &buffer[pos+1];
+
+	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+	 break;
+      }
+      default:
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+         ASSERT(0);
+         break;
+      }
+   }
+
+   DEBUG_PRINTF("BATCH complete\n");
+}
+
+
+
+/**
+ * Main loop for SPEs: Get a command, execute it, repeat.
+ */
+void
+command_loop(void)
+{
+   struct cell_command cmd;
+   int exitFlag = 0;
+
+   DEBUG_PRINTF("Enter command loop\n");
+
+   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
+   ASSERT_ALIGN16(&cmd);
+
+   while (!exitFlag) {
+      unsigned opcode;
+      int tag = 0;
+
+      DEBUG_PRINTF("Wait for cmd...\n");
+
+      /* read/wait from mailbox */
+      opcode = (unsigned int) spu_read_in_mbox();
+
+      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
+
+      /* command payload */
+      mfc_get(&cmd,  /* dest */
+              (unsigned int) spu.init.cmd, /* src */
+              sizeof(struct cell_command), /* bytes */
+              tag,
+              0, /* tid */
+              0  /* rid */);
+      wait_on_mask( 1 << tag );
+
+      /*
+       * NOTE: most commands should be contained in a batch buffer
+       */
+
+      switch (opcode & CELL_CMD_OPCODE_MASK) {
+      case CELL_CMD_EXIT:
+         DEBUG_PRINTF("EXIT\n");
+         exitFlag = 1;
+         break;
+      case CELL_CMD_VS_EXECUTE:
+#if 0
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
+         break;
+      case CELL_CMD_BATCH:
+         cmd_batch(opcode);
+         break;
+      default:
+         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
+      }
+
+   }
+
+   DEBUG_PRINTF("Exit command loop\n");
+
+   spu_dcache_report();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
new file mode 100644
index 0000000000..853e9aa549
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -0,0 +1,7 @@
+
+
+
+extern void
+command_loop(void);
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index bc94674fe8..4becd0f92a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -35,14 +35,11 @@
 #include "pipe/p_defines.h"
 
 #include "spu_funcs.h"
+#include "spu_command.h"
 #include "spu_main.h"
-#include "spu_render.h"
 #include "spu_per_fragment_op.h"
 #include "spu_texture.h"
-#include "spu_tile.h"
 //#include "spu_test.h"
-#include "spu_vertex_shader.h"
-#include "spu_dcache.h"
 #include "spu_debug.h"
 #include "cell/common.h"
 
@@ -55,8 +52,6 @@ helpful headers:
 
 struct spu_global spu;
 
-struct spu_vs_context draw;
-
 
 #if DEBUG
 boolean Debug = FALSE;
@@ -64,554 +59,6 @@ boolean force_fragment_ops_fallback = TRUE;
 #endif
 
 
-/**
- * Buffers containing dynamically generated SPU code:
- */
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
-    ALIGN16_ATTRIB;
-
-
-
-/**
- * Tell the PPU that this SPU has finished copying a buffer to
- * local store and that it may be reused by the PPU.
- * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_context->buffer_status[]).
- */
-static void
-release_buffer(uint buffer)
-{
-   /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
-   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
-   uint *dst = spu.init.buffer_status + index;
-
-   ASSERT(buffer < CELL_NUM_BUFFERS);
-
-   mfc_put((void *) &status,    /* src in local memory */
-           (unsigned int) dst,  /* dst in main memory */
-           sizeof(status),      /* size */
-           TAG_MISC,            /* tag is unimportant */
-           0, /* tid */
-           0  /* rid */);
-}
-
-
-static void
-cmd_clear_surface(const struct cell_command_clear_surface *clear)
-{
-   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
-
-   if (clear->surface == 0) {
-      spu.fb.color_clear_value = clear->value;
-      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
-         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
-            (spu.init.id << 20) | (spu.init.id << 28);
-         spu.fb.color_clear_value ^= x;
-      }
-   }
-   else {
-      spu.fb.depth_clear_value = clear->value;
-   }
-
-#define CLEAR_OPT 1
-#if CLEAR_OPT
-
-   /* Simply set all tiles' status to CLEAR.
-    * When we actually begin rendering into a tile, we'll initialize it to
-    * the clear value.  If any tiles go untouched during the frame,
-    * really_clear_tiles() will set them to the clear value.
-    */
-   if (clear->surface == 0) {
-      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
-   }
-   else {
-      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
-   }
-
-#else
-
-   /*
-    * This path clears the whole framebuffer to the clear color right now.
-    */
-
-   /*
-   printf("SPU: %s num=%d w=%d h=%d\n",
-          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
-   */
-
-   /* init a single tile to the clear value */
-   if (clear->surface == 0) {
-      clear_c_tile(&spu.ctile);
-   }
-   else {
-      clear_z_tile(&spu.ztile);
-   }
-
-   /* walk over my tiles, writing the 'clear' tile's data */
-   {
-      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-      uint i;
-      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-         uint tx = i % spu.fb.width_tiles;
-         uint ty = i / spu.fb.width_tiles;
-         if (clear->surface == 0)
-            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
-         else
-            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
-      }
-   }
-
-   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
-      wait_on_mask(1 << TAG_SURFACE_CLEAR);
-   }
-
-#endif /* CLEAR_OPT */
-
-   DEBUG_PRINTF("CLEAR SURF done\n");
-}
-
-
-static void
-cmd_release_verts(const struct cell_command_release_verts *release)
-{
-   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
-   ASSERT(release->vertex_buf != ~0U);
-   release_buffer(release->vertex_buf);
-}
-
-
-/**
- * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
- * This involves installing new fragment ops SPU code.
- * If this function is never called, we'll use a regular C fallback function
- * for fragment processing.
- */
-static void
-cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
-{
-   static int warned = 0;
-
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info (for fallback case only) */
-   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
-   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
-
-   /* Parity twist!  For now, always use the fallback code by default,
-    * only switching to codegen when specifically requested.  This
-    * allows us to develop freely without risking taking down the
-    * branch.
-    *
-    * Later, the parity of this check will be reversed, so that
-    * codegen is *always* used, unless we specifically indicate that
-    * we don't want it.
-    *
-    * Eventually, the option will be removed completely, because in
-    * final code we'll always use codegen and won't even provide the
-    * raw state records that the fallback code requires.
-    */
-   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
-   }
-   else {
-      /* otherwise, the default fallback code remains in place */
-      if (!warned) {
-         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
-         warned = 1;
-      }
-   }
-
-   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
-   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
-}
-
-
-static void
-cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
-{
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_program_code, fp->code,
-          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
-#if 01
-   /* Point function pointer at new code */
-   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
-#endif
-}
-
-
-static void
-cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
-{
-   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
-             cmd->width,
-             cmd->height,
-             cmd->color_start,
-             cmd->color_format,
-             cmd->depth_format);
-
-   ASSERT_ALIGN16(cmd->color_start);
-   ASSERT_ALIGN16(cmd->depth_start);
-
-   spu.fb.color_start = cmd->color_start;
-   spu.fb.depth_start = cmd->depth_start;
-   spu.fb.color_format = cmd->color_format;
-   spu.fb.depth_format = cmd->depth_format;
-   spu.fb.width = cmd->width;
-   spu.fb.height = cmd->height;
-   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
-   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
-
-   switch (spu.fb.depth_format) {
-   case PIPE_FORMAT_Z32_UNORM:
-      spu.fb.zsize = 4;
-      spu.fb.zscale = (float) 0xffffffffu;
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
-      spu.fb.zsize = 4;
-      spu.fb.zscale = (float) 0x00ffffffu;
-      break;
-   case PIPE_FORMAT_Z16_UNORM:
-      spu.fb.zsize = 2;
-      spu.fb.zscale = (float) 0xffffu;
-      break;
-   default:
-      spu.fb.zsize = 0;
-      break;
-   }
-}
-
-
-static void
-cmd_state_sampler(const struct cell_command_sampler *sampler)
-{
-   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
-
-   spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
-      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
-   else
-      spu.sample_texture[sampler->unit] = sample_texture_nearest;
-}
-
-
-static void
-cmd_state_texture(const struct cell_command_texture *texture)
-{
-   const uint unit = texture->unit;
-   const uint width = texture->width;
-   const uint height = texture->height;
-
-   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
-             texture->unit, texture->start,
-             texture->width, texture->height);
-
-   spu.texture[unit].start = texture->start;
-   spu.texture[unit].width = width;
-   spu.texture[unit].height = height;
-
-   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
-
-   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
-   spu.texture[unit].tex_size_mask = (vector unsigned int)
-         { width - 1, height - 1, 0, 0 };
-   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
-   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
-}
-
-
-static void
-cmd_state_vertex_info(const struct vertex_info *vinfo)
-{
-   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
-   ASSERT(vinfo->num_attribs >= 1);
-   ASSERT(vinfo->num_attribs <= 8);
-   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
-}
-
-
-static void
-cmd_state_vs_array_info(const struct cell_array_info *vs_info)
-{
-   const unsigned attr = vs_info->attr;
-
-   ASSERT(attr < PIPE_MAX_ATTRIBS);
-   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
-   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
-   draw.vertex_fetch.size[attr] = vs_info->size;
-   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
-   draw.vertex_fetch.dirty = 1;
-}
-
-
-static void
-cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
-{
-   mfc_get(attribute_fetch_code_buffer,
-           (unsigned int) code->base,  /* src */
-           code->size,
-           TAG_BATCH_BUFFER,
-           0, /* tid */
-           0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-   draw.vertex_fetch.code = attribute_fetch_code_buffer;
-}
-
-
-static void
-cmd_finish(void)
-{
-   DEBUG_PRINTF("FINISH\n");
-   really_clear_tiles(0);
-   /* wait for all outstanding DMAs to finish */
-   mfc_write_tag_mask(~0);
-   mfc_read_tag_status_all();
-   /* send mbox message to PPU */
-   spu_write_out_mbox(CELL_CMD_FINISH);
-}
-
-
-/**
- * Execute a batch of commands which was sent to us by the PPU.
- * See the cell_emit_state.c code to see where the commands come from.
- *
- * The opcode param encodes the location of the buffer and its size.
- */
-static void
-cmd_batch(uint opcode)
-{
-   const uint buf = (opcode >> 8) & 0xff;
-   uint size = (opcode >> 16);
-   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
-   const unsigned usize = size / sizeof(buffer[0]);
-   uint pos;
-
-   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
-             buf, size, spu.init.buffers[buf]);
-
-   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
-
-   ASSERT_ALIGN16(spu.init.buffers[buf]);
-
-   size = ROUNDUP16(size);
-
-   ASSERT_ALIGN16(spu.init.buffers[buf]);
-
-   mfc_get(buffer,  /* dest */
-           (unsigned int) spu.init.buffers[buf],  /* src */
-           size,
-           TAG_BATCH_BUFFER,
-           0, /* tid */
-           0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-   /* Tell PPU we're done copying the buffer to local store */
-   DEBUG_PRINTF("release batch buf %u\n", buf);
-   release_buffer(buf);
-
-   /*
-    * Loop over commands in the batch buffer
-    */
-   for (pos = 0; pos < usize; /* no incr */) {
-      switch (buffer[pos]) {
-      /*
-       * rendering commands
-       */
-      case CELL_CMD_CLEAR_SURFACE:
-         {
-            struct cell_command_clear_surface *clr
-               = (struct cell_command_clear_surface *) &buffer[pos];
-            cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 8;
-         }
-         break;
-      case CELL_CMD_RENDER:
-         {
-            struct cell_command_render *render
-               = (struct cell_command_render *) &buffer[pos];
-            uint pos_incr;
-            cmd_render(render, &pos_incr);
-            pos += pos_incr;
-         }
-         break;
-      /*
-       * state-update commands
-       */
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         {
-            struct cell_command_framebuffer *fb
-               = (struct cell_command_framebuffer *) &buffer[pos];
-            cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_FRAGMENT_OPS:
-         {
-            struct cell_command_fragment_ops *fops
-               = (struct cell_command_fragment_ops *) &buffer[pos];
-            cmd_state_fragment_ops(fops);
-            pos += sizeof(*fops) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
-         {
-            struct cell_command_fragment_program *fp
-               = (struct cell_command_fragment_program *) &buffer[pos];
-            cmd_state_fragment_program(fp);
-            pos += sizeof(*fp) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_SAMPLER:
-         {
-            struct cell_command_sampler *sampler
-               = (struct cell_command_sampler *) &buffer[pos];
-            cmd_state_sampler(sampler);
-            pos += sizeof(*sampler) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_TEXTURE:
-         {
-            struct cell_command_texture *texture
-               = (struct cell_command_texture *) &buffer[pos];
-            cmd_state_texture(texture);
-            pos += sizeof(*texture) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_VERTEX_INFO:
-         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
-         break;
-      case CELL_CMD_STATE_VIEWPORT:
-         (void) memcpy(& draw.viewport, &buffer[pos+1],
-                       sizeof(struct pipe_viewport_state));
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
-         break;
-      case CELL_CMD_STATE_UNIFORMS:
-         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
-         pos += 2;
-         break;
-      case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
-         break;
-      case CELL_CMD_STATE_BIND_VS:
-#if 0
-         spu_bind_vertex_shader(&draw,
-                                (struct cell_shader_info *) &buffer[pos+1]);
-#endif
-         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
-         break;
-      case CELL_CMD_STATE_ATTRIB_FETCH:
-         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
-                                &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
-         break;
-      /*
-       * misc commands
-       */
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         pos += 1;
-         break;
-      case CELL_CMD_RELEASE_VERTS:
-         {
-            struct cell_command_release_verts *release
-               = (struct cell_command_release_verts *) &buffer[pos];
-            cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
-         }
-         break;
-      case CELL_CMD_FLUSH_BUFFER_RANGE: {
-	 struct cell_buffer_range *br = (struct cell_buffer_range *)
-	     &buffer[pos+1];
-
-	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
-	 break;
-      }
-      default:
-         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
-         ASSERT(0);
-         break;
-      }
-   }
-
-   DEBUG_PRINTF("BATCH complete\n");
-}
-
-
-/**
- * Temporary/simple main loop for SPEs: Get a command, execute it, repeat.
- */
-static void
-main_loop(void)
-{
-   struct cell_command cmd;
-   int exitFlag = 0;
-
-   DEBUG_PRINTF("Enter main loop\n");
-
-   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
-   ASSERT_ALIGN16(&cmd);
-
-   while (!exitFlag) {
-      unsigned opcode;
-      int tag = 0;
-
-      DEBUG_PRINTF("Wait for cmd...\n");
-
-      /* read/wait from mailbox */
-      opcode = (unsigned int) spu_read_in_mbox();
-
-      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
-
-      /* command payload */
-      mfc_get(&cmd,  /* dest */
-              (unsigned int) spu.init.cmd, /* src */
-              sizeof(struct cell_command), /* bytes */
-              tag,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask( 1 << tag );
-
-      /*
-       * NOTE: most commands should be contained in a batch buffer
-       */
-
-      switch (opcode & CELL_CMD_OPCODE_MASK) {
-      case CELL_CMD_EXIT:
-         DEBUG_PRINTF("EXIT\n");
-         exitFlag = 1;
-         break;
-      case CELL_CMD_VS_EXECUTE:
-#if 0
-         spu_execute_vertex_shader(&draw, &cmd.vs);
-#endif
-         break;
-      case CELL_CMD_BATCH:
-         cmd_batch(opcode);
-         break;
-      default:
-         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
-      }
-
-   }
-
-   DEBUG_PRINTF("Exit main loop\n");
-
-   spu_dcache_report();
-}
-
-
-
 static void
 one_time_init(void)
 {
@@ -658,6 +105,7 @@ main(main_param_t speid, main_param_t argp)
    DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
+   /* get initialization data */
    mfc_get(&spu.init,  /* dest */
            (unsigned int) argp, /* src */
            sizeof(struct cell_init_info), /* bytes */
@@ -675,7 +123,7 @@ main(main_param_t speid, main_param_t argp)
       spu_test_misc(spu.init.id);
 #endif
 
-   main_loop();
+   command_loop();
 
    return 0;
 }
-- 
cgit v1.2.3


From 55b65d3b42b8ba1ea1c5b5549b4629f3b20e7a97 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 17:57:01 -0600
Subject: cell: stub-out sin/cos function bodies to avoid trashing caller's
 stack for now

---
 src/gallium/drivers/cell/spu/spu_funcs.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index d174956518..b57ad3f3b8 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -49,17 +49,27 @@
 static vector float
 spu_cos(vector float x)
 {
+#if 0
    static const float scale = 1.0 / (2.0 * M_PI);
    x = x * spu_splats(scale); /* normalize */
    return _cos8_v(x);
+#else
+   /* just pass-through to avoid trashing caller's stack */
+   return x;
+#endif
 }
 
 static vector float
 spu_sin(vector float x)
 {
+#if 0
    static const float scale = 1.0 / (2.0 * M_PI);
    x = x * spu_splats(scale); /* normalize */
    return _sin8_v(x);   /* 8-bit accuracy enough?? */
+#else
+   /* just pass-through to avoid trashing caller's stack */
+   return x;
+#endif
 }
 
 
-- 
cgit v1.2.3


From afaa53040bd01ca86762e7d7b1a5a65810767921 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 3 Oct 2008 18:00:43 -0600
Subject: CELL: changes to generate SPU code for stenciling

This set of code changes are for stencil code generation
support.  Both one-sided and two-sided stenciling are supported.
In addition to the raw code generation changes, these changes had
to be made elsewhere in the system:

- Added new "register set" feature to the SPE assembly generation.
  A "register set" is a way to allocate multiple registers and free
  them all at the same time, delegating register allocation management
  to the spe_function unit.  It's quite useful in complex register
  allocation schemes (like stenciling).

- Added and improved SPE macro calculations.
  These are operations between registers and unsigned integer
  immediates.  In many cases, the calculation can be performed
  with a single instruction; the macros will generate the
  single instruction if possible, or generate a register load
  and register-to-register operation if not.  These macro
  functions are: spe_load_uint() (which has new ways to
  load a value in a single instruction), spe_and_uint(),
  spe_xor_uint(), spe_compare_equal_uint(), and spe_compare_greater_uint().

- Added facing to fragment generation.  While rendering, the rasterizer
  needs to be able to determine front- and back-facing fragments, in order
  to correctly apply two-sided stencil.  That requires these changes:
  - Added front_winding field to the cell_command_render block, so that
    the state tracker could communicate to the rasterizer what it
    considered to be the front-facing direction.
  - Added fragment facing as an input to the fragment function.
  - Calculated facing is passed during emit_quad().
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        | 246 +++++-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h        |  41 +-
 src/gallium/drivers/cell/common.h                  |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 881 ++++++++++++++++++---
 src/gallium/drivers/cell/ppu/cell_render.c         |   1 +
 src/gallium/drivers/cell/ppu/cell_vbuf.c           |   1 +
 src/gallium/drivers/cell/spu/spu_main.h            |   3 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |  19 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   3 +-
 src/gallium/drivers/cell/spu/spu_render.c          |   4 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  35 +-
 src/gallium/drivers/cell/spu/spu_tri.h             |   2 +-
 12 files changed, 1091 insertions(+), 146 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 491141f190..8a87e9abb1 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -359,14 +359,21 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
+    register unsigned int i;
+
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
 
+    p->set_count = 0;
+    memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
-    p->regs[0] = ~7;
-    p->regs[1] = (1U << (80 - 64)) - 1;
+    p->regs[0] = p->regs[1] = p->regs[2] = 1;
+    for (i = 80; i <= 127; i++) {
+      p->regs[i] = 1;
+    }
 
     p->print = false;
     p->indent = 0;
@@ -398,12 +405,8 @@ int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      const uint64_t mask = (1ULL << (i % 64));
-      const unsigned idx = i / 64;
-
-      assert(idx < 2);
-      if ((p->regs[idx] & mask) != 0) {
-         p->regs[idx] &= ~mask;
+      if (p->regs[i] == 0) {
+         p->regs[i] = 1;
          return i;
       }
    }
@@ -417,31 +420,68 @@ int spe_allocate_available_register(struct spe_function *p)
  */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
-
    assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) != 0);
-
-   p->regs[idx] &= ~(1ULL << bit);
+   assert(p->regs[reg] == 0);
+   p->regs[reg] = 1;
    return reg;
 }
 
 
 /**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated".  Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
+   assert(reg < SPE_NUM_REGS);
+   assert(p->regs[reg] == 1);
 
-   assert(idx < 2);
+   p->regs[reg] = 0;
+}
 
-   assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers.  This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+   register unsigned int i;
+
+   /* Keep track of the set count.  If it ever wraps around to 0, 
+    * we're in trouble.
+    */
+   p->set_count++;
+   assert(p->set_count > 0);
+
+   /* Increment the allocation count of all registers currently
+    * allocated.  Then any registers that are allocated in this set
+    * will be the only ones with a count of 1; they'll all be released
+    * when the register set is released.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]++;
+   }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int i;
+
+   /* If the set count drops below zero, we're in trouble. */
+   assert(p->set_count > 0);
+   p->set_count--;
 
-   p->regs[idx] |= (1ULL << bit);
+   /* Drop the allocation level of all registers.  Any allocated
+    * during this register set will drop to 0 and then become
+    * available.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]--;
+   }
 }
 
 
@@ -603,8 +643,10 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 {
    /* If the whole value is in the lower 18 bits, use ila, which
     * doesn't sign-extend.  Otherwise, if the two halfwords of
-    * the constant are identical, use ilh.  Otherwise, we have
-    * to use ilhu followed by iohl.
+    * the constant are identical, use ilh.  Otherwise, if every byte of
+    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+    * Bytes Immediate (fsmbi) to load the value in a single instruction.
+    * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
    if ((ui & 0xfffc0000) == ui) {
       spe_ila(p, rT, ui);
@@ -612,13 +654,171 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
    else if ((ui >> 16) == (ui & 0xffff)) {
       spe_ilh(p, rT, ui & 0xffff);
    }
+   else if (
+      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+   ) {
+      unsigned int mask = 0;
+      /* fsmbi duplicates each bit in the given mask eight times,
+       * using a 16-bit value to initialize a 16-byte quadword.
+       * Each 4-bit nybble of the mask corresponds to a full word
+       * of the result; look at the value and figure out the mask
+       * (replicated for each word in the quadword), and then
+       * form the "select mask" to get the value.
+       */
+      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+      spe_fsmbi(p, rT, mask);
+   }
    else {
+      /* The general case: this usually uses two instructions, but
+       * may use only one if the low-order 16 bits of each word are 0.
+       */
       spe_ilhu(p, rT, ui >> 16);
       if (ui & 0xffff)
          spe_iohl(p, rT, ui & 0xffff);
    }
 }
 
+/* This function is constructed identically to spe_sor_uint() below.
+ * Changes to one should be made in the other.
+ */
+void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use And Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_andi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+/* This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ */
+void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte 
+    * Immediate (which uses the same constant across each byte), Exclusive 
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to 
+    * 16 bits and uses that across each halfword), or Exclusive Or Word 
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use Exclusive Or Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_xori(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use Exclusive Or Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the Exclusive Or Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 9 bits or less, it fits inside a
+    * Compare Equal Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_ceqi(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_ceq(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
+
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 10 bits or less, it fits inside a
+    * Compare Logical Greater Than Word Immediate instruction.
+    */
+   if ((ui & 0x000003ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 61c7edeb60..cd2e245409 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -53,17 +53,26 @@ struct spe_function
    uint num_inst;
    uint max_inst;
 
-    /**
-     * Mask of used / unused registers
-     *
-     * Each set bit corresponds to an available register.  Each cleared bit
-     * corresponds to an allocated register.
+   /**
+    * The "set count" reflects the number of nested register sets
+    * are allowed.  In the unlikely case that we exceed the set count,
+    * register allocation will start to be confused, which is critical
+    * enough that we check for it.
+    */
+   unsigned char set_count;
+
+   /** 
+    * Flags for used and unused registers.  Each byte corresponds to a
+    * register; a 0 in that byte means that the register is available.
+    * A value of 1 means that the register was allocated in the current
+    * register set.  Any other value N means that the register was allocated
+    * N register sets ago.
      *
      * \sa
      * spe_allocate_register, spe_allocate_available_register,
-     * spe_release_register
+     * spe_allocate_register_set, spe_release_register_set, spe_release_register, 
      */
-    uint64_t regs[SPE_NUM_REGS / 64];
+    unsigned char regs[SPE_NUM_REGS];
 
     boolean print; /**< print/dump instructions as they're emitted? */
     int indent;    /**< number of spaces to indent */
@@ -77,6 +86,8 @@ extern unsigned spe_code_size(const struct spe_function *p);
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
 
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
@@ -307,6 +318,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i);
 extern void
 spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 99329fd8e2..c223bc1744 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -227,6 +227,7 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 653afc235d..f920ae13b4 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,10 +54,12 @@
  * \param ifragZ_reg  register containing integer fragment Z values (in)
  * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
  * \param zmask_reg   register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
  */
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
-               struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
    /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
@@ -132,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
        * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
        */
       spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+      return true;
    }
+
+   return false;
 }
 
 
@@ -238,22 +243,34 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  * it and have to allocate and load it again unnecessarily.
  */
 static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
 {
    if (*is_already_set) return;
    *r = spe_allocate_available_register(f);
-   spe_load_float(f, *r, value);
-   *is_already_set = true;
 }
 
 static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
 {
     if (!*is_already_set) return;
     spe_release_register(f, r);
     *is_already_set = false;
 }
 
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   setup_optional_register(f, is_already_set, r);
+   spe_load_float(f, *r, value);
+}
+
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    release_optional_register(f, is_already_set, r);
+}
+
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
  * \param f          SPE function to append instruction onto.
@@ -1117,6 +1134,633 @@ gen_colormask(struct spe_function *f,
     spe_release_register(f, colormask_reg);
 }
 
+/* This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value.  As such, we have
+ * access to the Compare Immediate instructions where we don't in 
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test.  The bitmask of valid
+ * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, 
+                 unsigned int mask_reg, unsigned int fbS_reg, 
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+    * register, taking into account whether each fragment was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      /* stencil_pass = mask & (s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      /* stencil_fail = mask & ~stencil_pass */
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* stencil_pass = mask & ~(s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* stencil_pass = mask & (s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LESS: {
+      /* stencil_pass = mask & (reference > s) */
+      /* There's no convenient Compare Less Than Immediate instruction, so
+       * we'll have to do this one the harder way, by loading a register and 
+       * comparing directly.  Compare Logical Greater Than Word (clgt) 
+       * treats its operands as unsigned - no sign extension.
+       */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_LEQUAL:
+      /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL: {
+      /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
+      /* As above, we have to do this by loading a register */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = mask & 0 = 0 */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      spe_move(f, stencil_pass_reg, mask_reg);  /* zmask = mask */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = mask & 1 = mask */
+      spe_move(f, stencil_pass_reg, mask_reg);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply.  It does not
+ * apply any tests.  It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes the the stencil_max_value is of the form 
+    * 2^n-1 and can therefore be used as a mask for the valid bits in 
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a 
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits 
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values.  For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the 
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+                       unsigned int fbS_reg, 
+                       unsigned int *fail_reg, unsigned int *zfail_reg, 
+                       unsigned int *zpass_reg, unsigned int *back_fail_reg, 
+                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+   unsigned zfail_op, back_zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(dsa->stencil[0].enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes.  In particular, that means that the "zfail_op" (and the backfacing
+    * counterpart, if active) are not considered - a failing stencil test will
+    * trigger the "fail_op", and a passing stencil test will trigger the
+    * "zpass_op".
+    *
+    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+    * we keep them from being calculated.
+    */
+   if (dsa->depth.enabled) {
+      zfail_op = dsa->stencil[0].zfail_op;
+      back_zfail_op = dsa->stencil[1].zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+      back_zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == dsa->stencil[0].fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zpass_reg);
+   }
+
+   /* If two-sided stencil is enabled, we have more work to do. */
+   if (!dsa->stencil[1].enabled) {
+      /* This just flags that the registers need not be deallocated later */
+      *back_fail_reg = fbS_reg;
+      *back_zfail_reg = fbS_reg;
+      *back_zpass_reg = fbS_reg;
+   }
+   else {
+      /* Same calculations as above, but for the back stencil */
+      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_fail_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+         *back_fail_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == zfail_op) {
+         *back_fail_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+         *back_fail_reg = *zpass_reg;
+      }
+      else {
+         *back_fail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_fail_reg);
+      }
+
+      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zfail_reg = fbS_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].fail_op) {
+         *back_zfail_reg = *fail_reg;
+      }
+      else if (back_zfail_op == zfail_op) {
+         *back_zfail_reg = *zfail_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
+         *back_zfail_reg = *zpass_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[1].fail_op) {
+         *back_zfail_reg = *back_fail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zfail_reg);
+      }
+
+      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zpass_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+         *back_zpass_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == zfail_op) {
+         *back_zpass_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+         *back_zpass_reg = *zpass_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+         *back_zpass_reg = *back_fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+         *back_zpass_reg = *back_zfail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zpass_reg);
+      }
+   } /* End of calculations for back-facing stencil */
+}
+
+static boolean
+gen_stencil_depth_test(struct spe_function *f, 
+                       const struct pipe_depth_stencil_alpha_state *dsa, 
+                       const int const facing_reg,
+                       const int mask_reg, const int fragZ_reg, 
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = false;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   unsigned int stencil_pass_reg, stencil_fail_reg;
+   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   unsigned int stencil_writemask_reg;
+   unsigned int zmask_reg;
+   unsigned int newS_reg;
+
+   /* Stenciling is quite complex: up to six different configurable stencil 
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely 
+    * be identical, so there's good reason to try to avoid calculating 
+    * the same values more than once (which unfortunately makes the code less 
+    * straightforward).
+    *
+    * To make register management easier, we start a new 
+    * register set; we can release all the registers in the set at
+    * once, and avoid having to keep track of exactly which registers
+    * we allocate.  We can still allocate and free registers as 
+    * desired (if we know we no longer need a register), but we don't
+    * have to spend the complexity to track the more difficult variant
+    * register usage scenarios.
+    */
+   spe_allocate_register_set(f);
+
+   /* Calculate the writemask.  If the writemask is trivial (either
+    * all 0s, meaning that we don't need to calculate any stencil values
+    * because they're not going to change the stencil anyway, or all 1s,
+    * meaning that we have to calculate the stencil values but do not
+    * need to mask them), we can avoid generating code.  Don't forget
+    * that we need to consider backfacing stencil, if enabled.
+    */
+   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Trivial: don't need to calculate stencil values, and don't need to 
+       * write them back to the framebuffer.
+       */
+      need_to_calculate_stencil_values = false;
+      need_to_writemask_stencil_values = false;
+   }
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.
+       */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = false;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = true;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+         spe_release_register(f, back_write_mask_reg);
+      }
+   }
+
+   /* At least one-sided stenciling must be on.  Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* If two-sided stenciling is on, generate code to run the stencil
+    * test on the backfacing stencil as well, and combine the two results
+    * into the one correct result based on facing.
+    */
+   if (dsa->stencil[1].enabled) {
+      unsigned int temp_reg = spe_allocate_available_register(f);
+      gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+      spe_release_register(f, temp_reg);
+   }
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  We
+    * have to do this before we run a depth test (because the
+    * depth test should not be performed on fragments that failed the
+    * stencil test, and because the depth test will update the 
+    * mask of valid fragments based on the results of the depth test).
+    */
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This code will work for one-sided
+       * and two-sided stenciling (so that we take into account that operations
+       * may match between front and back stencils), and will also take into
+       * account whether the depth test is enabled (if the depth test is off,
+       * we don't need any of the zfail results, because the depth test always
+       * is considered to pass if it is disabled).  Any register value that
+       * does not need to be calculated will come back with the same value
+       * that's in fbS_reg.
+       *
+       * This function will allocate a variant number of registers that
+       * will be released as part of the register set.
+       */
+      gen_get_stencil_values(f, dsa, fbS_reg, 
+         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
+         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
+         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+      /* Tricky, tricky, tricky - the things we do to create optimal
+       * code...
+       *
+       * The various stencil values registers may overlap with each other
+       * and with fbS_reg arbitrarily (as any particular operation is
+       * only calculated once and stored in one register, no matter
+       * how many times it is used).  So we can't change the values 
+       * within those registers directly - if we change a value in a
+       * register that's being referenced by two different calculations,
+       * we've just unwittingly changed the second value as well...
+       *
+       * Avoid this by allocating new registers to hold the results
+       * (there may be 2, if the depth test is off, or 3, if it is on).
+       * These will be released as part of the register set.
+       */
+      if (!dsa->stencil[1].enabled) {
+         /* The easy case: if two-sided stenciling is *not* enabled, we
+          * just use the front-sided values.
+          */
+         stencil_fail_values = front_stencil_fail_values;
+         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+      }
+      else { /* two-sided stencil enabled */
+         /* Allocate new registers for the needed merged values */
+         stencil_fail_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+         if (dsa->depth.enabled) {
+            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+         }
+         else {
+            stencil_pass_depth_fail_values = fbS_reg;
+         }
+         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+      }
+   }
+
+   /* We now have all the stencil values we need.  We also need 
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+      /* If we need to writemask the stencil values before going into
+       * the stencil buffer, we'll have to use a new register to
+       * hold the new values.  If not, we can just keep using the
+       * current register.
+       */
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_move(f, newS_reg, fbS_reg);
+         modified_buffers = true;
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values */
+      if (stencil_fail_values != fbS_reg) {
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+      }
+
+      /* Same for the stencil pass/depth fail values.  If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+      }
+
+      /* Same for the stencil pass/depth pass mask */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+         spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+         spe_release_register(f, stencil_pass_depth_pass_mask);
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in the fbS_reg register passed in.  If we don't need
+       * to writemask, then the results are *already* in the fbS_reg,
+       * so there's nothing more to do.
+       */
+
+      if (need_to_writemask_stencil_values) {
+         /* The Select Bytes command makes a fine writemask.  Where
+          * the mask is 0, the first (original) values are retained,
+          * effectively masking out changes.  Where the mask is 1, the
+          * second (new) values are retained, incorporating changes.
+          */
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * We're all done, except that we've allocated a fair number
+    * of registers that we didn't bother tracking.  Release all
+    * those registers as part of the register set, and go home.
+    */
+   spe_release_register_set(f);
+
+   /* Return true if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
+
+
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
@@ -1156,6 +1800,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
+   const int facing_reg = 13; /* uint */
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1183,6 +1828,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
+   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1195,6 +1841,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
+      spe_comment(f, 0, "Computing tile location in memory");
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
       spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
@@ -1205,124 +1852,164 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
-
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
 
+   /* If we need the stencil buffers (because one- or two-sided stencil is
+    * enabled) or the depth buffer (because the depth test is enabled),
+    * go grab them.  Note that if either one- or two-sided stencil is
+    * enabled, dsa->stencil[0].enabled will be true.
+    */
    if (dsa->depth.enabled || dsa->stencil[0].enabled) {
       const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
       boolean write_depth_stencil;
 
-      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
-      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+      /* We may or may not need to allocate a register for Z or stencil values */
+      boolean fbS_reg_set = false, fbZ_reg_set = false;
+      unsigned int fbS_reg, fbZ_reg = 0;
+
+      spe_comment(f, 0, "Loading Z/stencil tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
       spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
 
-      if (dsa->depth.enabled) {
-         /* Extract Z bits from fbZS_reg into fbZ_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            int mask_reg = spe_allocate_available_register(f);
-            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
-            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
-            spe_release_register(f, mask_reg);
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_rotmi(f, fbZ_reg, fbZS_reg, -8);  /* fbZ = fbZS >> 8 */
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 32-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 16-bit Z values now */
-         }
-         else {
-            ASSERT(0);  /* invalid format */
-         }
-
-         /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 8 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 16 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
-         }
-      }
-      else {
-         /* no Z test, but set Z to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
+      /* From the Z/stencil buffer format, pull out the bits we need for
+       * Z and/or stencil.  We'll also convert the incoming fragment Z
+       * value in fragZ_reg from a floating point value in [0.0..1.0] to
+       * an unsigned integer value with the appropriate resolution.
+       */
+      switch(zs_format) {
+
+         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+         case PIPE_FORMAT_X8Z24_UNORM:
+            if (dsa->depth.enabled) {
+               /* We need the Z part at least */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* four 24-bit Z values in the low-order bits */
+               spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* four 8-bit Z values in the high-order bits */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+            }
+            break;
+
+         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+         case PIPE_FORMAT_Z24X8_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* shift by 8 to get the upper 24-bit values */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* 8-bit stencil in the low-order bits - mask them out */
+               spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+            }
+            break;
+
+         case PIPE_FORMAT_Z32_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 32-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            }
+            /* No stencil, so can't do anything there */
+            break;
+
+         case PIPE_FORMAT_Z16_UNORM:
+            if (dsa->depth.enabled) {
+               /* XXX Not sure this is correct, but it was here before, so we're
+                * going with it for now
+                */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 16-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+            }
+            /* No stencil */
+            break;
+
+         default:
+            ASSERT(0); /* invalid format */
       }
 
-
+      /* If stencil is enabled, use the stencil-specific code
+       * generator to generate both the stencil and depth (if needed)
+       * tests.  Otherwise, if only depth is enabled, generate
+       * a quick depth test.  The test generators themselves will
+       * report back whether the depth/stencil buffer has to be
+       * written back.
+       */
       if (dsa->stencil[0].enabled) {
-         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX extract with a shift */
-            ASSERT(0);
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* XXX extract with a mask */
-            ASSERT(0);
-         }
-      }
-      else {
-         /* no stencil test, but set to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
-      }
+         /* This will perform the stencil and depth tests, and update
+          * the mask_reg, fbZ_reg, and fbS_reg as required by the
+          * tests.
+          */
+         ASSERT(fbS_reg_set);
+         ASSERT(fbZ_reg_set);
+         spe_comment(f, 0, "Perform stencil test");
 
-      if (dsa->stencil[0].enabled) {
-         /* XXX this may involve depth testing too */
-         // gen_stencil_test(dsa, f, ... );
-         ASSERT(0);
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
-         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_comment(f, 0, "Perform depth test");
+         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
       }
-
-      /* do we need to write Z and/or Stencil back into framebuffer? */
-      write_depth_stencil = (dsa->depth.writemask |
-                             dsa->stencil[0].write_mask |
-                             dsa->stencil[1].write_mask);
+      else {
+         write_depth_stencil = false;
+      }
 
       if (write_depth_stencil) {
          /* Merge latest Z and Stencil values into fbZS_reg.
           * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
           * fbS_reg has four 8-bit Z values in bits [7..0].
           */
+         spe_comment(f, 0, "Storing depth/stencil values");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+            else {
+               spe_move(f, fbZS_reg, fbZ_reg);
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
             spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
             spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
@@ -1341,11 +2028,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
       }
 
-      spe_release_register(f, fbZ_reg);
-      spe_release_register(f, fbS_reg);
+      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+      release_optional_register(f, &fbS_reg_set, fbS_reg);
    }
 
-
    /* Get framebuffer quad/colors.  We'll need these for blending,
     * color masking, and to obey the quad/pixel mask.
     * Load: fbRGBA_reg = memory[color_tile + quad_offset]
@@ -1354,8 +2040,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
     */
    spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
 
-
    if (blend->blend_enable) {
+      spe_comment(f, 0, "Perform blending");
       gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
@@ -1369,19 +2055,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       int rgba_reg = spe_allocate_available_register(f);
 
       /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert fragment colors to framebuffer colors");
       gen_pack_colors(f, color_format,
                       fragR_reg, fragG_reg, fragB_reg, fragA_reg,
                       rgba_reg);
 
       if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
       if (blend->colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
          gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
       }
 
-
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
        *    rgba[i] = rgba[i];
@@ -1393,6 +2081,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       /* Store updated quad in tile:
        * memory[color_tile + quad_offset] = rgba_reg;
        */
+      spe_comment(f, 0, "Store framebuffer colors");
       spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
 
       spe_release_register(f, rgba_reg);
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e..79cb8df82f 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
       struct cell_command_render *render = &cell_global.command[i].render;
       render->prim_type = PIPE_PRIM_TRIANGLES;
       render->num_verts = cell->prim_buffer.num_verts;
+      render->front_winding = cell->rasterizer->front_winding;
       render->vertex_size = cell->vertex_info->size * 4;
       render->xmin = cell->prim_buffer.xmin;
       render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..578ddf62dc 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
+      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 29a305232e..1cd577c23c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -73,7 +73,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask);
+                                      vector unsigned int mask,
+                                      uint facing);
 
 /** Function for running fragment program */
 typedef void (*spu_fragment_program_func)(vector float *inputs,
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f107764fb2..d252fa6dc1 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask)
+                          vector unsigned int mask,
+                          uint facing)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
@@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       /* Form bitmask depending on color buffer format and colormask bits */
       switch (spu.fb.color_format) {
       case PIPE_FORMAT_A8R8G8B8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x00ff0000; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x0000ff00; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0x000000ff; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0xff000000; /* alpha */
          break;
       case PIPE_FORMAT_B8G8R8A8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x0000ff00; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x00ff0000; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0xff000000; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0x000000ff; /* alpha */
          break;
       default:
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index f817abf046..a61689c83a 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask);
+                          vector unsigned int mask,
+                          uint facing);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc98881..82dbeb26b7 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("SPU %u: RENDER done\n",
              spu.init.id);
 }
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 0a8fb56a62..6039cd80b2 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -118,6 +118,8 @@ struct setup_stage {
 
    float oneoverarea;
 
+   uint facing;
+
    uint tx, ty;
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
@@ -274,7 +276,7 @@ eval_z(float x, float y)
  * overall.
  */
 static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
 {
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -344,7 +346,8 @@ emit_quad( int x, int y, mask_t mask )
                              fragZ,
                              soa_frag[0], soa_frag[1],
                              soa_frag[2], soa_frag[3],
-                             mask);
+                             mask,
+                             setup.facing);
          }
 
       }
@@ -379,7 +382,8 @@ emit_quad( int x, int y, mask_t mask )
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask);
+                          mask,
+                          setup.facing);
       }
    }
 }
@@ -483,7 +487,7 @@ static void flush_spans( void )
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
 #if 1
-      emit_quad( x, setup.span.y, calculate_mask( x ) );
+      emit_quad( x, setup.span.y, calculate_mask( x ));
 #endif
    }
 
@@ -902,13 +906,28 @@ static void subtriangle( struct edge *eleft,
    eright->sy += lines;
 }
 
+static float
+determinant( const float *v0,
+             const float *v1,
+             const float *v2 )
+{
+   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   const float ex = v0[0] - v2[0];
+   const float ey = v0[1] - v2[1];
+   const float fx = v1[0] - v2[0];
+   const float fy = v1[1] - v2[1];
+
+   /* det = cross(e,f).z */
+   return ex * fy - ey * fx;
+}
+
 
 /**
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -919,6 +938,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
+   /* Before we sort vertices, determine the facing of the triangle,
+    * which will be needed for front/back-face stencil application
+    */
+   float det = determinant(v0, v1, v2);
+   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c9..abc3d35160 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
 
 
 #endif /* SPU_TRI_H */
-- 
cgit v1.2.3


From 800c350d71132bbb5126bd89310df540332978f4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:14:27 -0600
Subject: cell: add support for fragment shader constant buffers

---
 src/gallium/drivers/cell/common.h                |  1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 10 +++++++++-
 src/gallium/drivers/cell/ppu/cell_state.h        |  5 +++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c   | 19 +++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_state_shader.c |  5 ++++-
 src/gallium/drivers/cell/spu/spu_command.c       | 22 ++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_main.h          |  8 +++++---
 7 files changed, 63 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c223bc1744..d261c1a640 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -94,6 +94,7 @@
 #define CELL_CMD_STATE_BIND_VS       18
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
+#define CELL_CMD_STATE_FS_CONSTANTS  21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 131a2356fe..3065869d04 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -215,7 +215,15 @@ get_src_reg(struct codegen *gen,
          reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
          break;
       case TGSI_FILE_CONSTANT:
-         /* xxx fall-through for now / fix */
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->constants_reg, offset);
+         }
+         break;
       default:
          assert(0);
       }
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index a7771a55a3..b193170f9c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -44,8 +44,9 @@
 #define CELL_NEW_TEXTURE       0x800
 #define CELL_NEW_VERTEX        0x1000
 #define CELL_NEW_VS            0x2000
-#define CELL_NEW_CONSTANTS     0x4000
-#define CELL_NEW_VERTEX_INFO   0x8000
+#define CELL_NEW_VS_CONSTANTS  0x4000
+#define CELL_NEW_FS_CONSTANTS  0x8000
+#define CELL_NEW_VERTEX_INFO   0x10000
 
 
 extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index a36fd3a601..cbfa393cfb 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,6 +25,7 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "cell_context.h"
 #include "cell_gen_fragment.h"
@@ -162,6 +163,24 @@ cell_emit_state(struct cell_context *cell)
       }
    }
 
+   if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+      const uint shader = PIPE_SHADER_FRAGMENT;
+      const uint num_const = cell->constants[shader].size / sizeof(float);
+      uint i, j;
+      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+      uint64_t *ibuf = (uint64_t *) buf;
+      const float *constants = pipe_buffer_map(cell->pipe.screen,
+                                               cell->constants[shader].buffer,
+                                               PIPE_BUFFER_USAGE_CPU_READ);
+      ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+      ibuf[1] = num_const;
+      j = 4;
+      for (i = 0; i < num_const; i++) {
+         buf[j++] = constants[i];
+      }
+      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+   }
+
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 3a0d066da2..54a17eaf2b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -197,7 +197,10 @@ cell_set_constant_buffer(struct pipe_context *pipe,
                         buf->buffer);
    cell->constants[shader].size = buf->size;
 
-   cell->dirty |= CELL_NEW_CONSTANTS;
+   if (shader == PIPE_SHADER_VERTEX)
+      cell->dirty |= CELL_NEW_VS_CONSTANTS;
+   else if (shader == PIPE_SHADER_FRAGMENT)
+      cell->dirty |= CELL_NEW_FS_CONSTANTS;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ec9da5d887..91a4c137e7 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -231,6 +231,25 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 }
 
 
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+   const uint num_const = buffer[pos + 1];
+   const float *constants = (const float *) &buffer[pos + 2];
+   uint i;
+
+   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+
+   /* Expand each float to float[4] for SOA execution */
+   for (i = 0; i < num_const; i++) {
+      spu.constants[i] = spu_splats(constants[i]);
+   }
+
+   /* return new buffer pos (in 8-byte words) */
+   return pos + 2 + num_const / 2;
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -456,6 +475,9 @@ cmd_batch(uint opcode)
             pos += sizeof(*fp) / 8;
          }
          break;
+      case CELL_CMD_STATE_FS_CONSTANTS:
+         pos = cmd_state_fs_constants(buffer, pos);
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 1cd577c23c..82c9c69a3a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,9 @@
 #define MAX_HEIGHT 1024
 
 
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+
+
 /**
  * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
  * The data may be addressed through several different types.
@@ -157,9 +160,8 @@ struct spu_global
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
-   /** Fragment program constants (XXX preliminary/used) */
-#define MAX_CONSTANTS 32
-   vector float constants[MAX_CONSTANTS];
+   /** Fragment program constants */
+   vector float constants[4 * CELL_MAX_CONSTANTS];
 
 } ALIGN16_ATTRIB;
 
-- 
cgit v1.2.3


From a4e477433f485a39b5de448d0a9cb6f4bf9bb90f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 20:34:35 -0600
Subject: cell: implement more built-in shader functions, link spu code with
 -lm

---
 configs/linux-cell                       |  2 +-
 src/gallium/drivers/cell/spu/spu_funcs.c | 65 +++++++++++++++++++++-----------
 2 files changed, 45 insertions(+), 22 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/configs/linux-cell b/configs/linux-cell
index 86651b83d7..8d74ee469d 100644
--- a/configs/linux-cell
+++ b/configs/linux-cell
@@ -53,7 +53,7 @@ SPU_CFLAGS = $(OPT_FLAGS) -W -Wall -Winline -Wmissing-prototypes -Wno-main \
 	-DSPU_MAIN_PARAM_LONG_LONG \
 	-include spu_intrinsics.h
 
-SPU_LFLAGS = -L$(SDK)/spu/lib -Wl,-N -lmisc
+SPU_LFLAGS = -L$(SDK)/spu/lib -Wl,-N -lmisc -lm
 
 SPU_AR = ppu-ar
 SPU_AR_FLAGS = -qcs
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index b57ad3f3b8..1adf9de0e8 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -35,41 +35,61 @@
 
 #include <string.h>
 #include <libmisc.h>
-#include <cos8_v.h>
-#include <sin8_v.h>
+#include <math.h>
+#include <cos14_v.h>
+#include <sin14_v.h>
 
 #include "cell/common.h"
 #include "spu_main.h"
 #include "spu_funcs.h"
 
 
-#define M_PI 3.1415926
-
-
 static vector float
 spu_cos(vector float x)
 {
-#if 0
-   static const float scale = 1.0 / (2.0 * M_PI);
-   x = x * spu_splats(scale); /* normalize */
-   return _cos8_v(x);
-#else
-   /* just pass-through to avoid trashing caller's stack */
-   return x;
-#endif
+   return _cos14_v(x);
 }
 
 static vector float
 spu_sin(vector float x)
 {
-#if 0
-   static const float scale = 1.0 / (2.0 * M_PI);
-   x = x * spu_splats(scale); /* normalize */
-   return _sin8_v(x);   /* 8-bit accuracy enough?? */
-#else
-   /* just pass-through to avoid trashing caller's stack */
-   return x;
-#endif
+   return _sin14_v(x);
+}
+
+static vector float
+spu_pow(vector float x, vector float y)
+{
+   float z0 = powf(spu_extract(x,0), spu_extract(y,0));
+   float z1 = powf(spu_extract(x,1), spu_extract(y,1));
+   float z2 = powf(spu_extract(x,2), spu_extract(y,2));
+   float z3 = powf(spu_extract(x,3), spu_extract(y,3));
+   return (vector float) {z0, z1, z2, z3};
+}
+
+static vector float
+spu_exp2(vector float x)
+{
+   float z0 = powf(2.0f, spu_extract(x,0));
+   float z1 = powf(2.0f, spu_extract(x,1));
+   float z2 = powf(2.0f, spu_extract(x,2));
+   float z3 = powf(2.0f, spu_extract(x,3));
+   return (vector float) {z0, z1, z2, z3};
+}
+
+static vector float
+spu_log2(vector float x)
+{
+   /*
+    * log_base_2(x) = log(x) / log(2)
+    * 1.442695 = 1/log(2).
+    */
+   static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
+   float z0 = logf(spu_extract(x,0));
+   float z1 = logf(spu_extract(x,1));
+   float z2 = logf(spu_extract(x,2));
+   float z3 = logf(spu_extract(x,3));
+   vector float v = (vector float) {z0, z1, z2, z3};
+   return spu_mul(v, k);
 }
 
 
@@ -101,6 +121,9 @@ return_function_info(void)
    funcs.num = 0;
    add_func(&funcs, "spu_cos", &spu_cos);
    add_func(&funcs, "spu_sin", &spu_sin);
+   add_func(&funcs, "spu_pow", &spu_pow);
+   add_func(&funcs, "spu_exp2", &spu_exp2);
+   add_func(&funcs, "spu_log2", &spu_log2);
 
    /* Send the function info back to the PPU / main memory */
    mfc_put((void *) &funcs,  /* src in local store */
-- 
cgit v1.2.3


From 583098e3cb602fd9810a7c65718155fd9b0b3fda Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 19:48:53 -0600
Subject: cell: implement basic TXP instruction in fragment shaders

Lots of restrictions for now (one 2D texture, no mipmaps, etc.) for now
but basic texture demos work.
TEX, TXD, TXP do the same thing for the time being.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 109 ++++++++++++++++++++++++-----
 src/gallium/drivers/cell/spu/spu_funcs.c   |  51 ++++++++++++--
 src/gallium/drivers/cell/spu/spu_tri.c     |   2 +-
 3 files changed, 138 insertions(+), 24 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 5647bb23e6..c8125a8a05 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -226,6 +226,11 @@ get_src_reg(struct codegen *gen,
             spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
          }
          break;
+      case TGSI_FILE_SAMPLER:
+         {
+            reg = 3; /* XXX total hack */
+         }
+         break;
       default:
          assert(0);
       }
@@ -1162,6 +1167,21 @@ print_functions(struct cell_context *cell)
 #endif
 
 
+static uint
+lookup_function(struct cell_context *cell, const char *funcname)
+{
+   const struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i, addr = 0;
+   for (i = 0; i < funcs->num; i++) {
+      if (strcmp(funcs->names[i], funcname) == 0) {
+         addr = funcs->addrs[i];
+      }
+   }
+   assert(addr && "spu function not found");
+   return addr / 4;  /* discard 2 least significant bits */
+}
+
+
 /**
  * Emit code to call a SPU function.
  * Used to implement instructions like SIN/COS/POW/TEX/etc.
@@ -1171,27 +1191,12 @@ emit_function_call(struct codegen *gen,
                    const struct tgsi_full_instruction *inst,
                    char *funcname, uint num_args)
 {
-   const struct cell_spu_function_info *funcs = &gen->cell->spu_functions;
+   const uint addr = lookup_function(gen->cell, funcname);
    char comment[100];
-   uint addr;
    int ch;
 
    assert(num_args <= 3);
 
-   /* lookup function address */
-   {
-      uint i;
-      addr = 0;
-      for (i = 0; i < funcs->num; i++) {
-         if (strcmp(funcs->names[i], funcname) == 0) {
-            addr = funcs->addrs[i];
-         }
-      }
-      assert(addr && "spu function not found");
-   }
-
-   addr /= 4; /* discard 2 least significant bits */
-
    snprintf(comment, sizeof(comment), "CALL %s:", funcname);
    spe_comment(gen->f, -4, comment);
 
@@ -1245,6 +1250,72 @@ emit_function_call(struct codegen *gen,
 }
 
 
+static boolean
+emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const uint addr = lookup_function(gen->cell, "spu_txp");
+   int ch;
+   int coord_regs[4], d_regs[4];
+
+   spe_comment(gen->f, -4, "CALL txp:");
+
+   /* get src/dst reg info */
+   for (ch = 0; ch < 4; ch++) {
+      coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+   }
+
+   {
+      ubyte usedRegs[SPE_NUM_REGS];
+      uint i, numUsed;
+
+      numUsed = spe_get_registers_used(gen->f, usedRegs);
+      assert(numUsed < gen->frame_size / 16 - 32);
+
+      /* save registers to stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         int offset = 2 + i;
+         spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+      }
+
+      /* setup function arguments */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, 3 + i, coord_regs[i]);
+      }
+
+      /* branch to function, save return addr */
+      spe_brasl(gen->f, SPE_REG_RA, addr);
+
+      /* save function's return values (four pixel's colors) */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, d_regs[i], 3 + i);
+      }
+
+      /* restore registers from stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         if (reg != d_regs[0] &&
+             reg != d_regs[1] &&
+             reg != d_regs[2] &&
+             reg != d_regs[3]) {
+            int offset = 2 + i;
+            spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return TRUE;
+}
+
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
@@ -1483,6 +1554,12 @@ emit_instruction(struct codegen *gen,
       return emit_function_call(gen, inst, "spu_exp2", 1);
    case TGSI_OPCODE_LOGBASE2:
       return emit_function_call(gen, inst, "spu_log2", 1);
+   case TGSI_OPCODE_TEX:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXD:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXP:
+      return emit_TXP(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 1adf9de0e8..c7bcb3de9d 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -38,12 +38,20 @@
 #include <math.h>
 #include <cos14_v.h>
 #include <sin14_v.h>
+#include <transpose_matrix4x4.h>
 
 #include "cell/common.h"
 #include "spu_main.h"
 #include "spu_funcs.h"
 
 
+/** For "return"-ing four vectors */
+struct vec_4x4
+{
+   vector float v[4];
+};
+
+
 static vector float
 spu_cos(vector float x)
 {
@@ -92,16 +100,44 @@ spu_log2(vector float x)
    return spu_mul(v, k);
 }
 
+static struct vec_4x4
+spu_txp(vector float s, vector float t, vector float r, vector float q)
+{
+   const uint unit = 0;
+   struct vec_4x4 colors;
+   vector float coords[4];
+
+   coords[0] = s;
+   coords[1] = t;
+   coords[2] = r;
+   coords[3] = q;
+   _transpose_matrix4x4(coords, coords);
+
+   /* get four texture samples */
+   colors.v[0] = spu.sample_texture[unit](unit, coords[0]);
+   colors.v[1] = spu.sample_texture[unit](unit, coords[1]);
+   colors.v[2] = spu.sample_texture[unit](unit, coords[2]);
+   colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
+
+   _transpose_matrix4x4(colors.v, colors.v);
+   return colors;
+}
+
 
+/**
+ * Add named function to list of "exported" functions that will be
+ * made available to the PPU-hosted code generator.
+ */
 static void
-add_func(struct cell_spu_function_info *spu_functions,
-             const char *name, void *addr)
+export_func(struct cell_spu_function_info *spu_functions,
+            const char *name, void *addr)
 {
    uint n = spu_functions->num;
    ASSERT(strlen(name) < 16);
    strcpy(spu_functions->names[n], name);
    spu_functions->addrs[n] = (uint) addr;
    spu_functions->num++;
+   ASSERT(spu_functions->num <= 16);
 }
 
 
@@ -119,11 +155,12 @@ return_function_info(void)
    ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
 
    funcs.num = 0;
-   add_func(&funcs, "spu_cos", &spu_cos);
-   add_func(&funcs, "spu_sin", &spu_sin);
-   add_func(&funcs, "spu_pow", &spu_pow);
-   add_func(&funcs, "spu_exp2", &spu_exp2);
-   add_func(&funcs, "spu_log2", &spu_log2);
+   export_func(&funcs, "spu_cos", &spu_cos);
+   export_func(&funcs, "spu_sin", &spu_sin);
+   export_func(&funcs, "spu_pow", &spu_pow);
+   export_func(&funcs, "spu_exp2", &spu_exp2);
+   export_func(&funcs, "spu_log2", &spu_log2);
+   export_func(&funcs, "spu_txp", &spu_txp);
 
    /* Send the function info back to the PPU / main memory */
    mfc_put((void *) &funcs,  /* src in local store */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 6039cd80b2..87991c3136 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -286,7 +286,7 @@ emit_quad( int x, int y, mask_t mask)
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
-      if (spu.texture[0].start) {
+      if (0/*spu.texture[0].start*/) {
          /*
           * Temporary texture mapping path
           * This will go away when fragment programs support TEX inst.
-- 
cgit v1.2.3


From 086a56134f334505ca9cd6f57194280c1ebf44dc Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 08:44:29 -0600
Subject: cell: updates in response to draw's struct vertex_info changes

---
 src/gallium/drivers/cell/spu/spu_tri.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 87991c3136..a62d4f0f2f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -215,7 +215,7 @@ clip_emit_quad(struct setup_stage *setup)
 static INLINE void
 eval_coeff(uint slot, float x, float y, vector float result[4])
 {
-   switch (spu.vertex_info.interp_mode[slot]) {
+   switch (spu.vertex_info.attrib[slot].interp_mode) {
    case INTERP_CONSTANT:
       result[QUAD_TOP_LEFT] =
       result[QUAD_TOP_RIGHT] =
@@ -776,7 +776,7 @@ static void setup_tri_coefficients(void)
    uint i;
 
    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
-      switch (spu.vertex_info.interp_mode[i]) {
+      switch (spu.vertex_info.attrib[i].interp_mode) {
       case INTERP_NONE:
          break;
       case INTERP_POS:
-- 
cgit v1.2.3


From 01e312a73b68dc5ddffca0d1b1472fc5dcb6f59e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 16:36:40 -0600
Subject: cell: pass texture unit (sampler number) to txp() function

The glsl/multitex demo runs now.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 4 ++++
 src/gallium/drivers/cell/spu/spu_funcs.c   | 5 +++--
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3d0e7976df..ef84059d8f 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1285,9 +1285,12 @@ static boolean
 emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    const uint addr = lookup_function(gen->cell, "spu_txp");
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
    int ch;
    int coord_regs[4], d_regs[4];
 
+   assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
+
    spe_comment(gen->f, -4, "CALL txp:");
 
    /* get src/dst reg info */
@@ -1314,6 +1317,7 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
       for (i = 0; i < 4; i++) {
          spe_move(gen->f, 3 + i, coord_regs[i]);
       }
+      spe_load_uint(gen->f, 7, unit); /* sampler unit */
 
       /* branch to function, save return addr */
       spe_brasl(gen->f, SPE_REG_RA, addr);
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index c7bcb3de9d..7dd7fcd253 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -101,9 +101,10 @@ spu_log2(vector float x)
 }
 
 static struct vec_4x4
-spu_txp(vector float s, vector float t, vector float r, vector float q)
+spu_txp(vector float s, vector float t, vector float r, vector float q,
+        unsigned unit)
 {
-   const uint unit = 0;
+   //const uint unit = 0;
    struct vec_4x4 colors;
    vector float coords[4];
 
-- 
cgit v1.2.3


From 734685549ca7dbee78845fdef1d65aceaa729845 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 10:54:11 -0600
Subject: cell: added spu_unpack_A8R8G8B8_transpose4()

Plus, clearer shuffle masks in other funcs.
---
 src/gallium/drivers/cell/spu/spu_colorpack.h | 49 ++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index fd8dc6ded3..d7ce005524 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -31,6 +31,7 @@
 #define SPU_COLORPACK_H
 
 
+#include <transpose_matrix4x4.h>
 #include <spu_intrinsics.h>
 
 
@@ -84,10 +85,10 @@ spu_unpack_B8G8R8A8(uint color)
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             10, 10, 10, 10,
-                             5, 5, 5, 5,
+                             2, 2, 2, 2,
+                             1, 1, 1, 1,
                              0, 0, 0, 0,
-                             15, 15, 15, 15}) );
+                             3, 3, 3, 3}) );
    return spu_convtf(color_u4, 32);
 }
 
@@ -98,13 +99,47 @@ spu_unpack_A8R8G8B8(uint color)
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             5, 5, 5, 5,
-                             10, 10, 10, 10,
-                             15, 15, 15, 15,
+                             1, 1, 1, 1,
+                             2, 2, 2, 2,
+                             3, 3, 3, 3,
                              0, 0, 0, 0}) );
-
    return spu_convtf(color_u4, 32);
 }
 
 
+/**
+ * \param color_in - array of 32-bit packed ARGB colors
+ * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order
+ */
+static INLINE void
+spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4],
+                               vector float color_out[4])
+{
+   vector unsigned int c0;
+
+   c0 = spu_shuffle(color_in[0], color_in[0],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[0] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[1], color_in[1],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[1] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[2], color_in[2],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[2] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[3], color_in[3],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[3] = spu_convtf(c0, 32);
+
+   _transpose_matrix4x4(color_out, color_out);
+}
+
+
+
 #endif /* SPU_COLORPACK_H */
-- 
cgit v1.2.3


From 3b07c28dee74c7aa3be5efac8084d610675af291 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 10:55:08 -0600
Subject: cell: do texture sampling/filtering for four pixels at a time.

---
 src/gallium/drivers/cell/spu/spu_command.c |  11 ++-
 src/gallium/drivers/cell/spu/spu_funcs.c   |   4 +
 src/gallium/drivers/cell/spu/spu_main.h    |  19 ++++-
 src/gallium/drivers/cell/spu/spu_texture.c | 125 ++++++++++++++++++++++++++++-
 src/gallium/drivers/cell/spu/spu_texture.h |  12 +++
 5 files changed, 161 insertions(+), 10 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 91a4c137e7..c59be7defd 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,10 +301,14 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
       spu.sample_texture[sampler->unit] = sample_texture_bilinear;
-   else
+      spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
+   }
+   else {
       spu.sample_texture[sampler->unit] = sample_texture_nearest;
+      spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
+   }
 }
 
 
@@ -323,6 +327,9 @@ cmd_state_texture(const struct cell_command_texture *texture)
    spu.texture[unit].width = width;
    spu.texture[unit].height = height;
 
+   spu.texture[unit].width4 = spu_splats((float) width);
+   spu.texture[unit].height4 = spu_splats((float) height);
+
    spu.texture[unit].tiles_per_row = width / TILE_SIZE;
 
    spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 7dd7fcd253..13c234ea2e 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,6 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
 {
    //const uint unit = 0;
    struct vec_4x4 colors;
+#if 0
    vector float coords[4];
 
    coords[0] = s;
@@ -121,6 +122,9 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
    colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
 
    _transpose_matrix4x4(colors.v, colors.v);
+#else
+   spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
+#endif
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 82c9c69a3a..5d14be51c2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -67,6 +67,14 @@ typedef union {
 typedef vector float (*spu_sample_texture_func)(uint unit,
                                                 vector float texcoord);
 
+typedef void (*spu_sample_texture4_func)(vector float s,
+                                         vector float t,
+                                         vector float r,
+                                         vector float q,
+                                         uint unit,
+                                         vector float colors[4]);
+
+
 /** Function for performing per-fragment ops */
 typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       tile_t *colorTile,
@@ -107,10 +115,12 @@ struct spu_texture
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float tex_size;
-   vector unsigned int tex_size_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+   vector float tex_size; /**< == {width, height, 0, 0} */
+   vector float width4;   /**< == {width, width, width, width} */
+   vector float height4;  /**< == {height, height, height, height} */
+   vector unsigned int tex_size_mask; /**< == {width-1, height-1, 0, 0 } */
+   vector unsigned int tex_size_x_mask; /**< splat(width-1) */
+   vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
 
 
@@ -159,6 +169,7 @@ struct spu_global
 
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 117b8a36f8..12e6ed1ba1 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include <transpose_matrix4x4.h>
+
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -91,10 +93,10 @@ static void
 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
    const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);
-   vec_uint4 tile_y = spu_rlmask(y, -5);
-   const qword offset_x = si_andi((qword) x, 0x1f);
-   const qword offset_y = si_andi((qword) y, 0x1f);
+   vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
+   const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
    const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
@@ -132,6 +134,31 @@ sample_texture_nearest(uint unit, vector float texcoord)
 }
 
 
+/**
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, vector float colors[4])
+{
+   vector float ss = spu_mul(s, spu.texture[unit].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].height4);
+   vector unsigned int is = spu_convtu(ss, 0);
+   vector unsigned int it = spu_convtu(tt, 0);
+   vec_uint4 texels[4];
+
+   /* GL_REPEAT wrap mode: */
+   is = spu_and(is, spu.texture[unit].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].tex_size_y_mask);
+
+   get_four_texels(unit, is, it, texels);
+
+   /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
+   spu_unpack_A8R8G8B8_transpose4(texels, colors);
+}
+
+
 vector float
 sample_texture_bilinear(uint unit, vector float texcoord)
 {
@@ -198,3 +225,93 @@ sample_texture_bilinear(uint unit, vector float texcoord)
 
    return texel_sum;
 }
+
+
+void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4])
+{
+   vector float ss = spu_madd(s, spu.texture[unit].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f));
+
+   vector unsigned int is0 = spu_convtu(ss, 0);
+   vector unsigned int it0 = spu_convtu(tt, 0);
+
+   /* is + 1, it + 1 */
+   vector unsigned int is1 = spu_add(is0, 1);
+   vector unsigned int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+
+   /* XXX possibly rework following code to compute the weighted sample
+    * colors with integer arithmetic for fewer int->float conversions.
+    */
+
+   /* convert packed int texels to float colors */
+   vector float ftexels[16];
+   spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
+   spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
+   spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
+   spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
+   vector signed int iss1024 = spu_convts(ss1024, 0);
+   iss1024 = spu_and(iss1024, 1023);
+   vector float sWeights0 = spu_convtf(iss1024, 10);
+
+   vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
+   vector signed int itt1024 = spu_convts(tt1024, 0);
+   itt1024 = spu_and(itt1024, 1023);
+   vector float tWeights0 = spu_convtf(itt1024, 10);
+
+   /* 1 - sWeight and 1 - tWeight */
+   vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
+   vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
+
+   /* reds, for four pixels */
+   ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
+                       spu_add(ftexels[8], ftexels[12]));
+
+   /* greens, for four pixels */
+   ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
+                       spu_add(ftexels[9], ftexels[13]));
+
+   /* blues, for four pixels */
+   ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
+                       spu_add(ftexels[10], ftexels[14]));
+
+   /* alphas, for four pixels */
+   ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
+                       spu_add(ftexels[11], ftexels[15]));
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f7c9738be8..f019e7d8ef 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -40,8 +40,20 @@ extern vector float
 sample_texture_nearest(uint unit, vector float texcoord);
 
 
+extern void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, vector float colors[4]);
+
+
 extern vector float
 sample_texture_bilinear(uint unit, vector float texcoord);
 
 
+extern void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4]);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From c8fb3682619ea49c5fefdf8b88cdb95eac7478ff Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 11:16:04 -0600
Subject: cell: remove old texture code

---
 src/gallium/drivers/cell/spu/spu_command.c |  2 -
 src/gallium/drivers/cell/spu/spu_funcs.c   | 19 -------
 src/gallium/drivers/cell/spu/spu_main.h    |  4 --
 src/gallium/drivers/cell/spu/spu_texture.c | 88 ++----------------------------
 src/gallium/drivers/cell/spu/spu_texture.h |  8 ---
 src/gallium/drivers/cell/spu/spu_tri.c     | 67 +----------------------
 6 files changed, 7 insertions(+), 181 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c59be7defd..d4cc9a2146 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -302,11 +302,9 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    spu.sampler[sampler->unit] = sampler->state;
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
-      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
    else {
-      spu.sample_texture[sampler->unit] = sample_texture_nearest;
       spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 13c234ea2e..4c90b701ee 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -104,27 +104,8 @@ static struct vec_4x4
 spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
-   //const uint unit = 0;
    struct vec_4x4 colors;
-#if 0
-   vector float coords[4];
-
-   coords[0] = s;
-   coords[1] = t;
-   coords[2] = r;
-   coords[3] = q;
-   _transpose_matrix4x4(coords, coords);
-
-   /* get four texture samples */
-   colors.v[0] = spu.sample_texture[unit](unit, coords[0]);
-   colors.v[1] = spu.sample_texture[unit](unit, coords[1]);
-   colors.v[2] = spu.sample_texture[unit](unit, coords[2]);
-   colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
-
-   _transpose_matrix4x4(colors.v, colors.v);
-#else
    spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
-#endif
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 5d14be51c2..2a8cb00f8d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -64,9 +64,6 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef vector float (*spu_sample_texture_func)(uint unit,
-                                                vector float texcoord);
-
 typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
@@ -168,7 +165,6 @@ struct spu_global
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
    spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 12e6ed1ba1..ba62ad27fd 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -120,21 +120,9 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 }
 
 
-/**
- * Get texture sample at texcoord.
- */
-vector float
-sample_texture_nearest(uint unit, vector float texcoord)
-{
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
-   uint texel = get_texel(unit, itc);
-   return spu_unpack_A8R8G8B8(texel);
-}
-
 
 /**
+ * Do nearest texture sampling for four pixels.
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
@@ -148,7 +136,7 @@ sample_texture4_nearest(vector float s, vector float t,
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
-   /* GL_REPEAT wrap mode: */
+   /* PIPE_TEX_WRAP_REPEAT */
    is = spu_and(is, spu.texture[unit].tex_size_x_mask);
    it = spu_and(it, spu.texture[unit].tex_size_y_mask);
 
@@ -159,74 +147,10 @@ sample_texture4_nearest(vector float s, vector float t,
 }
 
 
-vector float
-sample_texture_bilinear(uint unit, vector float texcoord)
-{
-   static const vec_uint4 offset_x = {0, 0, 1, 1};
-   static const vec_uint4 offset_y = {0, 1, 0, 1};
-
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
-
-   /* integer texcoords S,T: */
-   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
-
-   vec_uint4 texels[4];
-   
-   /* setup texcoords for quad:
-    *  +-----+-----+
-    *  |x0,y0|x1,y1|
-    *  +-----+-----+
-    *  |x2,y2|x3,y3|
-    *  +-----+-----+
-    */
-   vec_uint4 x = spu_splats(spu_extract(itc, 0));
-   vec_uint4 y = spu_splats(spu_extract(itc, 1));
-   x = spu_add(x, offset_x);
-   y = spu_add(y, offset_y);
-
-   /* GL_REPEAT wrap mode: */
-   x = spu_and(x, spu.texture[unit].tex_size_x_mask);
-   y = spu_and(y, spu.texture[unit].tex_size_y_mask);
-
-   get_four_texels(unit, x, y, texels);
-
-   /* integer A8R8G8B8 to float texel conversion */
-   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
-   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
-   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
-   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
-
-
-   /* Compute weighting factors in [0,1]
-    * Multiply texcoord by 1024, AND with 1023, convert back to float.
-    */
-   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
-   vector signed int itc1024 = spu_convts(tc1024, 0);
-   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
-   vector float weight = spu_convtf(itc1024, 10);
-
-   /* smeared frac and 1-frac */
-   vector float sfrac = spu_splats(spu_extract(weight, 0));
-   vector float tfrac = spu_splats(spu_extract(weight, 1));
-   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
-   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
-
-   /* multiply the samples (colors) by the S/T weights */
-   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
-   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
-   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
-   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
-
-   /* compute sum of weighted samples */
-   vector float texel_sum = spu_add(texel00, texel01);
-   texel_sum = spu_add(texel_sum, texel10);
-   texel_sum = spu_add(texel_sum, texel11);
-
-   return texel_sum;
-}
-
-
+/**
+ * Do bilinear texture sampling for four pixels.
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f019e7d8ef..d576aed719 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -36,20 +36,12 @@ extern void
 invalidate_tex_cache(void);
 
 
-extern vector float
-sample_texture_nearest(uint unit, vector float texcoord);
-
-
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, vector float colors[4]);
 
 
-extern vector float
-sample_texture_bilinear(uint unit, vector float texcoord);
-
-
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a62d4f0f2f..022d21ba8f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -286,72 +286,7 @@ emit_quad( int x, int y, mask_t mask)
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
-      if (0/*spu.texture[0].start*/) {
-         /*
-          * Temporary texture mapping path
-          * This will go away when fragment programs support TEX inst.
-          */
-         const uint unit = 0;
-         vector float colors[4];
-         vector float texcoords[4];
-         eval_coeff(2, (float) x, (float) y, texcoords);
-
-         if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
-         if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
-         if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
-         if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-
-         if (spu.texture[1].start) {
-            /* multi-texture mapping */
-            const uint unit = 1;
-            vector float colors1[4];
-
-            eval_coeff(2, (float) x, (float) y, texcoords);
-
-            if (spu_extract(mask, 0))
-               colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
-            if (spu_extract(mask, 1))
-               colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
-            if (spu_extract(mask, 2))
-               colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
-            if (spu_extract(mask, 3))
-               colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-            /* hack: modulate first texture by second */
-            colors[0] = spu_mul(colors[0], colors1[0]);
-            colors[1] = spu_mul(colors[1], colors1[1]);
-            colors[2] = spu_mul(colors[2], colors1[2]);
-            colors[3] = spu_mul(colors[3], colors1[3]);
-         }
-
-         {
-            /* Convert fragment data from AoS to SoA format.
-             * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-             * This is temporary!
-             */
-            vector float soa_frag[4];
-            _transpose_matrix4x4(soa_frag, colors);
-
-            vector float fragZ = eval_z((float) x, (float) y);
-
-            /* Do all per-fragment/quad operations here, including:
-             * alpha test, z test, stencil test, blend and framebuffer writing.
-             */
-            spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
-                             fragZ,
-                             soa_frag[0], soa_frag[1],
-                             soa_frag[2], soa_frag[3],
-                             mask,
-                             setup.facing);
-         }
-
-      }
-      else {
+      {
          /*
           * Run fragment shader, execute per-fragment ops, update fb/tile.
           */
-- 
cgit v1.2.3


From 67425aaa09df9cab76d7cc5c66e9e4595f0ccf40 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 14:09:54 -0600
Subject: cell: bilinear texture filtering using integer arithmetic

Fewer float/int conversions involved.
---
 src/gallium/drivers/cell/spu/spu_texture.c | 144 +++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_texture.h |   5 +
 2 files changed, 149 insertions(+)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index ba62ad27fd..c10268131d 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -239,3 +239,147 @@ sample_texture4_bilinear(vector float s, vector float t,
    colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
                        spu_add(ftexels[11], ftexels[15]));
 }
+
+
+
+/**
+ * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
+ */
+static INLINE void
+transpose(vector unsigned int *mOut, vector unsigned int *mIn)
+{
+  vector unsigned int abcd, efgh, ijkl, mnop;	/* input vectors */
+  vector unsigned int aeim, bfjn, cgko, dhlp;	/* output vectors */
+  vector unsigned int aibj, ckdl, emfn, gohp;	/* intermediate vectors */
+
+  vector unsigned char shufflehi = ((vector unsigned char) {
+					       0x00, 0x01, 0x02, 0x03,
+					       0x10, 0x11, 0x12, 0x13,
+					       0x04, 0x05, 0x06, 0x07,
+					       0x14, 0x15, 0x16, 0x17});
+  vector unsigned char shufflelo = ((vector unsigned char) {
+					       0x08, 0x09, 0x0A, 0x0B,
+					       0x18, 0x19, 0x1A, 0x1B,
+					       0x0C, 0x0D, 0x0E, 0x0F,
+					       0x1C, 0x1D, 0x1E, 0x1F});
+  abcd = *(mIn+0);
+  efgh = *(mIn+1);
+  ijkl = *(mIn+2);
+  mnop = *(mIn+3);
+
+  aibj = spu_shuffle(abcd, ijkl, shufflehi);
+  ckdl = spu_shuffle(abcd, ijkl, shufflelo);
+  emfn = spu_shuffle(efgh, mnop, shufflehi);
+  gohp = spu_shuffle(efgh, mnop, shufflelo);
+
+  aeim = spu_shuffle(aibj, emfn, shufflehi);
+  bfjn = spu_shuffle(aibj, emfn, shufflelo);
+  cgko = spu_shuffle(ckdl, gohp, shufflehi);
+  dhlp = spu_shuffle(ckdl, gohp, shufflelo);
+
+  *(mOut+0) = aeim;
+  *(mOut+1) = bfjn;
+  *(mOut+2) = cgko;
+  *(mOut+3) = dhlp;
+}
+
+
+/**
+ * Bilinear filtering, using int intead of float arithmetic
+ */
+void
+sample_texture4_bilinear_2(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4])
+{
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+   /* Scale texcoords by size of texture, and add half pixel bias */
+   vector float ss = spu_madd(s, spu.texture[unit].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].height4, half);
+
+   /* convert float coords to fixed-pt coords with 8 fraction bits */
+   vector unsigned int is = spu_convtu(ss, 8);
+   vector unsigned int it = spu_convtu(tt, 8);
+
+   /* compute integer texel weights in [0, 255] */
+   vector signed int sWeights0 = spu_and((vector signed int) is, 255);
+   vector signed int tWeights0 = spu_and((vector signed int) it, 255);
+   vector signed int sWeights1 = spu_sub(255, sWeights0);
+   vector signed int tWeights1 = spu_sub(255, tWeights0);
+
+   /* texel coords: is0 = is / 256, it0 = is / 256 */
+   vector unsigned int is0 = spu_rlmask(is, -8);
+   vector unsigned int it0 = spu_rlmask(it, -8);
+
+   /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
+   vector unsigned int is1 = spu_add(is0, 1);
+   vector unsigned int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+
+   /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
+   {
+      static const unsigned char ZERO = 0x80;
+      int i;
+      for (i = 0; i < 16; i++) {
+         texels[i] = spu_shuffle(texels[i], texels[i],
+                                 ((vector unsigned char) {
+                                    ZERO, ZERO, ZERO, 1,
+                                    ZERO, ZERO, ZERO, 2,
+                                    ZERO, ZERO, ZERO, 3,
+                                    ZERO, ZERO, ZERO, 0}));
+      }
+   }
+
+   /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
+   transpose(texels + 0, texels + 0);
+   transpose(texels + 4, texels + 4);
+   transpose(texels + 8, texels + 8);
+   transpose(texels + 12, texels + 12);
+
+   /* computed weighted colors */
+   vector unsigned int c0, c1, c2, c3, cSum;
+
+   /* red */
+   c0 = (vector unsigned int) si_mpyu((qword) texels[ 0], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texels[ 4], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texels[ 8], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texels[12], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[0] = spu_convtf(cSum, 24);
+
+   /* green */
+   c0 = (vector unsigned int) si_mpyu((qword) texels[ 1], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texels[ 5], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texels[ 9], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texels[13], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[1] = spu_convtf(cSum, 24);
+
+   /* blue */
+   c0 = (vector unsigned int) si_mpyu((qword) texels[ 2], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texels[ 6], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texels[10], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texels[14], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[2] = spu_convtf(cSum, 24);
+
+   /* alpha */
+   c0 = (vector unsigned int) si_mpyu((qword) texels[ 3], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texels[ 7], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texels[11], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texels[15], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[3] = spu_convtf(cSum, 24);
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index d576aed719..38a17deda2 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -47,5 +47,10 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4]);
 
+extern void
+sample_texture4_bilinear_2(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4]);
+
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 420e8cdf25501dd82e1c178e6300d7b416798e25 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 14:10:36 -0600
Subject: cell: remove more old texture code

---
 src/gallium/drivers/cell/spu/spu_texture.c | 26 --------------------------
 1 file changed, 26 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index c10268131d..3f2280436c 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -50,32 +50,6 @@ invalidate_tex_cache(void)
 }
 
 
-/**
- * XXX look into getting texels for all four pixels in a quad at once.
- */
-static uint
-get_texel(uint unit, vec_uint4 coordinate)
-{
-   /*
-    * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as
-    * SIMD since X and Y are already in a SIMD register.
-    */
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   ushort x = spu_extract(coordinate, 0);
-   ushort y = spu_extract(coordinate, 1);
-   unsigned tile_offset = sizeof(tile_t)
-      * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
-   ushort texel_offset = (ushort) 4
-      * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
-   vec_uint4 tmp;
-
-   spu_dcache_fetch_unaligned((qword *) & tmp,
-                              texture_ea + tile_offset + texel_offset,
-                              4);
-   return spu_extract(tmp, 0);
-}
-
-
 /**
  * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
  *
-- 
cgit v1.2.3


From c05cabd646f1c7384b5187e3427064096aef4673 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 14:31:11 -0600
Subject: cell: use fewer memory references in sample_texture4_bilinear_2()

---
 src/gallium/drivers/cell/spu/spu_texture.c | 56 +++++++++++++++++-------------
 1 file changed, 31 insertions(+), 25 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 3f2280436c..96ef88822a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -220,7 +220,11 @@ sample_texture4_bilinear(vector float s, vector float t,
  * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
  */
 static INLINE void
-transpose(vector unsigned int *mOut, vector unsigned int *mIn)
+transpose(vector unsigned int *mOut0,
+          vector unsigned int *mOut1,
+          vector unsigned int *mOut2,
+          vector unsigned int *mOut3,
+          vector unsigned int *mIn)
 {
   vector unsigned int abcd, efgh, ijkl, mnop;	/* input vectors */
   vector unsigned int aeim, bfjn, cgko, dhlp;	/* output vectors */
@@ -251,10 +255,10 @@ transpose(vector unsigned int *mOut, vector unsigned int *mIn)
   cgko = spu_shuffle(ckdl, gohp, shufflehi);
   dhlp = spu_shuffle(ckdl, gohp, shufflelo);
 
-  *(mOut+0) = aeim;
-  *(mOut+1) = bfjn;
-  *(mOut+2) = cgko;
-  *(mOut+3) = dhlp;
+  *mOut0 = aeim;
+  *mOut1 = bfjn;
+  *mOut2 = cgko;
+  *mOut3 = dhlp;
 }
 
 
@@ -317,43 +321,45 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    }
 
    /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
-   transpose(texels + 0, texels + 0);
-   transpose(texels + 4, texels + 4);
-   transpose(texels + 8, texels + 8);
-   transpose(texels + 12, texels + 12);
+   vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
+      texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
+   transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
+   transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
+   transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
+   transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
 
    /* computed weighted colors */
    vector unsigned int c0, c1, c2, c3, cSum;
 
    /* red */
-   c0 = (vector unsigned int) si_mpyu((qword) texels[ 0], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texels[ 4], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texels[ 8], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texels[12], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[0] = spu_convtf(cSum, 24);
 
    /* green */
-   c0 = (vector unsigned int) si_mpyu((qword) texels[ 1], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texels[ 5], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texels[ 9], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texels[13], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[1] = spu_convtf(cSum, 24);
 
    /* blue */
-   c0 = (vector unsigned int) si_mpyu((qword) texels[ 2], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texels[ 6], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texels[10], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texels[14], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[2] = spu_convtf(cSum, 24);
 
    /* alpha */
-   c0 = (vector unsigned int) si_mpyu((qword) texels[ 3], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texels[ 7], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texels[11], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texels[15], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[3] = spu_convtf(cSum, 24);
 }
-- 
cgit v1.2.3


From b0c136cfb1fcbcea35e17dc699a96acbb24738f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 15:17:01 -0600
Subject: cell: remove old texture-related fields

---
 src/gallium/drivers/cell/spu/spu_command.c | 3 ---
 src/gallium/drivers/cell/spu/spu_main.h    | 2 --
 2 files changed, 5 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d4cc9a2146..64890f6dbd 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -330,9 +330,6 @@ cmd_state_texture(const struct cell_command_texture *texture)
 
    spu.texture[unit].tiles_per_row = width / TILE_SIZE;
 
-   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
-   spu.texture[unit].tex_size_mask = (vector unsigned int)
-         { width - 1, height - 1, 0, 0 };
    spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
    spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2a8cb00f8d..e3960dbe8b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -112,10 +112,8 @@ struct spu_texture
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float tex_size; /**< == {width, height, 0, 0} */
    vector float width4;   /**< == {width, width, width, width} */
    vector float height4;  /**< == {height, height, height, height} */
-   vector unsigned int tex_size_mask; /**< == {width-1, height-1, 0, 0 } */
    vector unsigned int tex_size_x_mask; /**< splat(width-1) */
    vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
-- 
cgit v1.2.3


From 978799beb2a9c51550abb1f37bb6f63d06bc4717 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 16:43:11 -0600
Subject: cell: initial work for mipmap texture filtering

---
 src/gallium/drivers/cell/common.h              |   6 +-
 src/gallium/drivers/cell/ppu/cell_screen.c     |   4 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  18 ++--
 src/gallium/drivers/cell/ppu/cell_texture.c    |  48 ++++++----
 src/gallium/drivers/cell/ppu/cell_texture.h    |   6 +-
 src/gallium/drivers/cell/spu/spu_command.c     |  37 +++++---
 src/gallium/drivers/cell/spu/spu_funcs.c       |   1 +
 src/gallium/drivers/cell/spu/spu_main.h        |   7 +-
 src/gallium/drivers/cell/spu/spu_texture.c     | 120 ++++++++++++++++++-------
 src/gallium/drivers/cell/spu/spu_texture.h     |   6 ++
 10 files changed, 176 insertions(+), 77 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 5dc756023f..e4de9a551d 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -67,6 +67,7 @@
 #define CELL_MAX_SPUS 6
 
 #define CELL_MAX_SAMPLERS 4
+#define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
 
 #define TILE_SIZE 32
 
@@ -251,8 +252,9 @@ struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
    uint unit;
-   void *start;         /**< Address in main memory */
-   ushort width, height;
+   void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
+   ushort width[CELL_MAX_TEXTURE_LEVELS];
+   ushort height[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 47ba6fa290..d223557950 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -76,11 +76,11 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_TEXTURE_SHADOW_MAP:
       return 10;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
       return 8;  /* max 128x128x128 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    default:
       return 10;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index cbfa393cfb..7090b4c99f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -211,14 +211,20 @@ cell_emit_state(struct cell_context *cell)
          texture->opcode = CELL_CMD_STATE_TEXTURE;
          texture->unit = i;
          if (cell->texture[i]) {
-            texture->start = cell->texture[i]->tiled_data;
-            texture->width = cell->texture[i]->base.width[0];
-            texture->height = cell->texture[i]->base.height[0];
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = cell->texture[i]->tiled_data[level];
+               texture->width[level] = cell->texture[i]->base.width[level];
+               texture->height[level] = cell->texture[i]->base.height[level];
+            }
          }
          else {
-            texture->start = NULL;
-            texture->width = 1;
-            texture->height = 1;
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = NULL;
+               texture->width[level] = 1;
+               texture->height[level] = 1;
+            }
          }
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index b6590dfb86..f5f81ac3cc 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -66,6 +66,8 @@ cell_texture_layout(struct cell_texture * spt)
       unsigned size;
       unsigned w_tile, h_tile;
 
+      assert(level < CELL_MAX_TEXTURE_LEVELS);
+
       /* width, height, rounded up to tile size */
       w_tile = align(width, TILE_SIZE);
       h_tile = align(height, TILE_SIZE);
@@ -249,33 +251,41 @@ cell_tile_texture(struct cell_context *cell,
                   struct cell_texture *texture)
 {
    struct pipe_screen *screen = cell->pipe.screen;
-   uint face = 0, level = 0, zslice = 0;
-   struct pipe_surface *surf;
-   const uint w = texture->base.width[0], h = texture->base.height[0];
+   uint face = 0, level, zslice = 0;
    const uint *src;
 
-   /* temporary restrictions: */
-   assert(w >= TILE_SIZE);
-   assert(h >= TILE_SIZE);
-   assert(w % TILE_SIZE == 0);
-   assert(h % TILE_SIZE == 0);
+   for (level = 0; level <= texture->base.last_level; level++) {
+      if (!texture->tiled_data[level]) {
+         struct pipe_surface *surf;
 
-   surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
-                                  PIPE_BUFFER_USAGE_CPU_WRITE);
-   ASSERT(surf);
+         const uint w = texture->base.width[level], h = texture->base.height[level];
 
-   src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
+         if (w < 32 || h < 32)
+            continue;
+         /* temporary restrictions: */
+         assert(w >= TILE_SIZE);
+         assert(h >= TILE_SIZE);
+         assert(w % TILE_SIZE == 0);
+         assert(h % TILE_SIZE == 0);
 
-   if (texture->tiled_data) {
-      align_free(texture->tiled_data);
-   }
-   texture->tiled_data = align_malloc(w * h * 4, 16);
+         surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
+                                        PIPE_BUFFER_USAGE_CPU_WRITE);
+         ASSERT(surf);
+         
+         src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
 
-   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+         if (texture->tiled_data[level]) {
+            align_free(texture->tiled_data[level]);
+         }
+         texture->tiled_data[level] = align_malloc(w * h * 4, 16);
 
-   pipe_surface_unmap(surf);
+         tile_copy_data(w, h, TILE_SIZE, texture->tiled_data[level], src);
 
-   pipe_surface_reference(&surf, NULL);
+         pipe_surface_unmap(surf);
+
+         pipe_surface_reference(&surf, NULL);
+      }
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 6d37e95ebc..6d35736984 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -40,15 +40,15 @@ struct cell_texture
 {
    struct pipe_texture base;
 
-   unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
-   unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
+   unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS];
+   unsigned long stride[CELL_MAX_TEXTURE_LEVELS];
 
    /* The data is held here:
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
+   void *tiled_data[CELL_MAX_TEXTURE_LEVELS];  /* XXX this may be temporary */ /*ALIGN16*/
 };
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 64890f6dbd..089af22415 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,6 +301,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
+#if 0
+   if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      spu.sample_texture4[sampler->unit] = sample_texture4_lod;
+   }
+   else
+#endif
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
@@ -314,24 +320,29 @@ static void
 cmd_state_texture(const struct cell_command_texture *texture)
 {
    const uint unit = texture->unit;
-   const uint width = texture->width;
-   const uint height = texture->height;
+   uint i;
 
-   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
-             texture->unit, texture->start,
-             texture->width, texture->height);
+   DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
-   spu.texture[unit].start = texture->start;
-   spu.texture[unit].width = width;
-   spu.texture[unit].height = height;
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      uint width = texture->width[i];
+      uint height = texture->height[i];
 
-   spu.texture[unit].width4 = spu_splats((float) width);
-   spu.texture[unit].height4 = spu_splats((float) height);
+      DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
+             texture->start[i], texture->width[i], texture->height[i]);
 
-   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+      spu.texture[unit].level[i].start = texture->start[i];
+      spu.texture[unit].level[i].width = width;
+      spu.texture[unit].level[i].height = height;
 
-   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
-   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+      spu.texture[unit].level[i].tiles_per_row = width / TILE_SIZE;
+
+      spu.texture[unit].level[i].width4 = spu_splats((float) width);
+      spu.texture[unit].level[i].height4 = spu_splats((float) height);
+
+      spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
+      spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 4c90b701ee..f2946010bd 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -100,6 +100,7 @@ spu_log2(vector float x)
    return spu_mul(v, k);
 }
 
+
 static struct vec_4x4
 spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e3960dbe8b..9515543efe 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,7 +107,7 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
-struct spu_texture
+struct spu_texture_level
 {
    void *start;
    ushort width, height;
@@ -118,6 +118,11 @@ struct spu_texture
    vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
 
+struct spu_texture
+{
+   struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+} ALIGN16_ATTRIB;
+
 
 /**
  * All SPU global/context state will be in a singleton object of this type:
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 96ef88822a..96c09e3ccb 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -27,6 +27,7 @@
 
 
 #include <transpose_matrix4x4.h>
+#include <math.h>
 
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
@@ -42,11 +43,12 @@
 void
 invalidate_tex_cache(void)
 {
+   uint lvl = 0;
    uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].width
-      * spu.texture[unit].height;
+   uint bytes = 4 * spu.texture[unit].level[lvl].width
+      * spu.texture[unit].level[lvl].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
+   spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
 }
 
 
@@ -64,15 +66,17 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
+                vec_uint4 *texels)
 {
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   const unsigned texture_ea = (uintptr_t) tlevel->start;
    vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
+   const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -104,17 +108,18 @@ sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, vector float colors[4])
 {
-   vector float ss = spu_mul(s, spu.texture[unit].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].height4);
+   const uint lvl = 0;
+   vector float ss = spu_mul(s, spu.texture[unit].level[lvl].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].level[lvl].height4);
    vector unsigned int is = spu_convtu(ss, 0);
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].tex_size_y_mask);
+   is = spu_and(is, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].level[lvl].tex_size_y_mask);
 
-   get_four_texels(unit, is, it, texels);
+   get_four_texels(unit, lvl, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -130,8 +135,9 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4])
 {
-   vector float ss = spu_madd(s, spu.texture[unit].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f));
+   const uint lvl = 0;
+   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, spu_splats(-0.5f));
 
    vector unsigned int is0 = spu_convtu(ss, 0);
    vector unsigned int it0 = spu_convtu(tt, 0);
@@ -141,17 +147,17 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -270,10 +276,11 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4])
 {
+   const uint lvl = 0;
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].height4, half);
+   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
    vector unsigned int is = spu_convtu(ss, 8);
@@ -294,17 +301,17 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -363,3 +370,54 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[3] = spu_convtf(cSum, 24);
 }
+
+
+
+/**
+ * Compute level of detail factor from texcoords.
+ */
+static float
+compute_lambda(uint unit, vector float s, vector float t)
+{
+   uint lvl = 0;
+   float width = spu.texture[unit].level[lvl].width;
+   float height = spu.texture[unit].level[lvl].width;
+   float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
+   float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
+   float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+   float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+   float x = dsdx * dsdx + dtdx * dtdx;
+   float y = dsdy * dsdy + dtdy * dtdy;
+   float rho = x > y ? x : y;
+   rho = sqrtf(rho);
+   float lambda = logf(rho) * 1.442695f;
+   return lambda;
+}
+
+
+
+/**
+ * Texture sampling with level of detail selection.
+ */
+void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, vector float colors[4])
+{
+   float lambda = compute_lambda(unit, s, t);
+
+   if (lambda < spu.sampler[unit].min_lod)
+      lambda = spu.sampler[unit].min_lod;
+   else if (lambda > spu.sampler[unit].max_lod)
+      lambda = spu.sampler[unit].max_lod;
+
+   /* hack for now */
+   int level = (int) lambda;
+   if (level > 3)
+      level = 3;
+
+   /*
+   sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
+   */
+}
+
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 38a17deda2..4802f7c47c 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -53,4 +53,10 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                          uint unit, vector float colors[4]);
 
 
+extern void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, vector float colors[4]);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From dee18a147d3adaf2578d27837c8f18c92d796c9d Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 19:41:26 -0600
Subject: cell: finish-up perspective-corrected interpolation

---
 src/gallium/drivers/cell/spu/spu_tri.c | 127 +++++++++++++++++++++------------
 1 file changed, 82 insertions(+), 45 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 022d21ba8f..3f1fb4f7c9 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -213,7 +213,7 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be computed (in AoS format).
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, vector float result[4])
+eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
 {
    switch (spu.vertex_info.attrib[slot].interp_mode) {
    case INTERP_CONSTANT:
@@ -222,23 +222,43 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
       result[QUAD_BOTTOM_LEFT] =
       result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
       break;
-
    case INTERP_LINEAR:
-      /* fall-through, for now */
-   default:
       {
-         register vector float dadx = setup.coef[slot].dadx.v;
-         register vector float dady = setup.coef[slot].dady.v;
-         register vector float topLeft
-            = spu_add(setup.coef[slot].a0.v,
-                      spu_add(spu_mul(spu_splats(x), dadx),
-                              spu_mul(spu_splats(y), dady)));
+         vector float dadx = setup.coef[slot].dadx.v;
+         vector float dady = setup.coef[slot].dady.v;
+         vector float topLeft =
+            spu_add(setup.coef[slot].a0.v,
+                    spu_add(spu_mul(spu_splats(x), dadx),
+                            spu_mul(spu_splats(y), dady)));
 
          result[QUAD_TOP_LEFT] = topLeft;
          result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
          result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
          result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
       }
+      break;
+   case INTERP_PERSPECTIVE:
+      {
+         vector float dadx = setup.coef[slot].dadx.v;
+         vector float dady = setup.coef[slot].dady.v;
+         vector float topLeft =
+            spu_add(setup.coef[slot].a0.v,
+                    spu_add(spu_mul(spu_splats(x), dadx),
+                            spu_mul(spu_splats(y), dady)));
+
+         vector float wInv = spu_re(w);  /* 1.0 / w */
+
+         result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv);
+         result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv);
+         result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv);
+         result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv);
+      }
+      break;
+   case INTERP_POS:
+   case INTERP_NONE:
+      break;
+   default:
+      ASSERT(0);
    }
 }
 
@@ -248,14 +268,14 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
  * XXX this will all be re-written someday.
  */
 static INLINE void
-eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
 {
-   eval_coeff(slot, x, y, result);
+   eval_coeff(slot, x, y, w, result);
    _transpose_matrix4x4(result, result);
 }
 
 
-
+/** Evalute coefficients to get Z for four pixels in a quad */
 static INLINE vector float
 eval_z(float x, float y)
 {
@@ -269,6 +289,20 @@ eval_z(float x, float y)
 }
 
 
+/** Evalute coefficients to get W for four pixels in a quad */
+static INLINE vector float
+eval_w(float x, float y)
+{
+   const uint slot = 0;
+   const float dwdx = setup.coef[slot].dadx.f[3];
+   const float dwdy = setup.coef[slot].dady.f[3];
+   const float topLeft = setup.coef[slot].a0.f[3] + x * dwdx + y * dwdy;
+   const vector float topLeftv = spu_splats(topLeft);
+   const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
+   return spu_add(topLeftv, derivs);
+}
+
+
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -292,14 +326,15 @@ emit_quad( int x, int y, mask_t mask)
           */
          vector float inputs[4*4], outputs[2*4];
          vector float fragZ = eval_z((float) x, (float) y);
+         vector float fragW = eval_w((float) x, (float) y);
 
          /* setup inputs */
 #if 0
-         eval_coeff_soa(1, (float) x, (float) y, inputs);
+         eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
 #else
          uint i;
          for (i = 0; i < spu.vertex_info.num_attribs; i++) {
-            eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
+            eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
          }
 #endif
          ASSERT(spu.fragment_program);
@@ -658,7 +693,6 @@ tri_linear_coeff4(uint slot)
 
 
-#if 0
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
  * for a triangle.
@@ -667,38 +701,41 @@ tri_linear_coeff4(uint slot)
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void tri_persp_coeff( unsigned slot,
-                             unsigned i )
+static void
+tri_persp_coeff4(uint slot)
 {
-   /* premultiply by 1/w:
-    */
-   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
-   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
-   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
-
-   float botda = mida - mina;
-   float majda = maxa - mina;
-   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
-   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-      
-   /*
-   printf("tri persp %d,%d: %f %f %f\n", slot, i,
-          setup.vmin->data[slot][i],
-          setup.vmid->data[slot][i],
-          setup.vmax->data[slot][i]
-          );
-   */
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
+   const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
+   const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
 
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
-   assert(i <= 3);
+   vector float vmin_d = setup.vmin->data[slot];
+   vector float vmid_d = setup.vmid->data[slot];
+   vector float vmax_d = setup.vmax->data[slot];
 
-   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
-   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-   setup.coef[slot].a0.f[i] = (mina - 
-			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
-			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+   vmin_d = spu_mul(vmin_d, vmin_w);
+   vmid_d = spu_mul(vmid_d, vmid_w);
+   vmax_d = spu_mul(vmax_d, vmax_w);
+
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+                         
+   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
 }
-#endif
+
 
 
 /**
@@ -726,7 +763,7 @@ static void setup_tri_coefficients(void)
          tri_linear_coeff4(i);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff4(i);  /* temporary */
+         tri_persp_coeff4(i);
          break;
       default:
          ASSERT(0);
-- 
cgit v1.2.3


From 5d7cc6176de09e683e5b40a69df250d1abfaf6f0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 19:50:20 -0600
Subject: cell: remove dead code, clean-up, reformatting

---
 src/gallium/drivers/cell/spu/spu_tri.c | 114 +++++++--------------------------
 1 file changed, 24 insertions(+), 90 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 3f1fb4f7c9..438fae84a8 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -120,19 +120,11 @@ struct setup_stage {
 
    uint facing;
 
-   uint tx, ty;
+   uint tx, ty;  /**< position of current tile (x, y) */
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
 
-#if 0
-   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#else
    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#endif
-
-#if 0
-   struct quad_header quad; 
-#endif
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
@@ -144,69 +136,9 @@ struct setup_stage {
 };
 
 
-
 static struct setup_stage setup;
 
 
-
-
-#if 0
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-#endif
-
-#if 0
-/**
- * Clip setup.quad against the scissor/surface bounds.
- */
-static INLINE void
-quad_clip(struct setup_stage *setup)
-{
-   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
-   const int minx = (int) cliprect->minx;
-   const int maxx = (int) cliprect->maxx;
-   const int miny = (int) cliprect->miny;
-   const int maxy = (int) cliprect->maxy;
-
-   if (setup.quad.x0 >= maxx ||
-       setup.quad.y0 >= maxy ||
-       setup.quad.x0 + 1 < minx ||
-       setup.quad.y0 + 1 < miny) {
-      /* totally clipped */
-      setup.quad.mask = 0x0;
-      return;
-   }
-   if (setup.quad.x0 < minx)
-      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup.quad.y0 < miny)
-      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup.quad.x0 == maxx - 1)
-      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup.quad.y0 == maxy - 1)
-      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-}
-#endif
-
-#if 0
-/**
- * Emit a quad (pass to next stage) with clipping.
- */
-static INLINE void
-clip_emit_quad(struct setup_stage *setup)
-{
-   quad_clip(setup);
-   if (setup.quad.mask) {
-      struct softpipe_context *sp = setup.softpipe;
-      sp->quad.first->run(sp->quad.first, &setup.quad);
-   }
-}
-#endif
-
 /**
  * Evaluate attribute coefficients (plane equations) to compute
  * attribute values for the four fragments in a quad.
@@ -363,7 +295,8 @@ emit_quad( int x, int y, mask_t mask)
  * Given an X or Y coordinate, return the block/quad coordinate that it
  * belongs to.
  */
-static INLINE int block( int x )
+static INLINE int
+block(int x)
 {
    return x & ~1;
 }
@@ -374,7 +307,8 @@ static INLINE int block( int x )
  * the triangle's bounds.
  * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
  */
-static INLINE mask_t calculate_mask( int x )
+static INLINE mask_t
+calculate_mask(int x)
 {
    /* This is a little tricky.
     * Use & instead of && to avoid branches.
@@ -392,7 +326,8 @@ static INLINE mask_t calculate_mask( int x )
 /**
  * Render a horizontal span of quads
  */
-static void flush_spans( void )
+static void
+flush_spans(void)
 {
    int minleft, maxright;
    int x;
@@ -420,7 +355,6 @@ static void flush_spans( void )
       return;
    }
 
-
    /* OK, we're very likely to need the tile data now.
     * clear or finish waiting if needed.
     */
@@ -456,9 +390,7 @@ static void flush_spans( void )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-#if 1
       emit_quad( x, setup.span.y, calculate_mask( x ));
-#endif
    }
 
    setup.span.y = 0;
@@ -467,8 +399,10 @@ static void flush_spans( void )
    setup.span.right[1] = 0;
 }
 
+
 #if DEBUG_VERTS
-static void print_vertex(const struct vertex_header *v)
+static void
+print_vertex(const struct vertex_header *v)
 {
    int i;
    fprintf(stderr, "Vertex: (%p)\n", v);
@@ -480,11 +414,11 @@ static void print_vertex(const struct vertex_header *v)
 #endif
 
 
-static boolean setup_sort_vertices(const struct vertex_header *v0,
-                                   const struct vertex_header *v1,
-                                   const struct vertex_header *v2)
+static boolean
+setup_sort_vertices(const struct vertex_header *v0,
+                    const struct vertex_header *v1,
+                    const struct vertex_header *v2)
 {
-
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
    print_vertex(v0);
@@ -692,7 +626,6 @@ tri_linear_coeff4(uint slot)
 }
 
 
-
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
  * for a triangle.
@@ -742,7 +675,8 @@ tri_persp_coeff4(uint slot)
  * Compute the setup.coef[] array dadx, dady, a0 values.
  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
  */
-static void setup_tri_coefficients(void)
+static void
+setup_tri_coefficients(void)
 {
 #if 1
    uint i;
@@ -779,7 +713,8 @@ static void setup_tri_coefficients(void)
 }
 
 
-static void setup_tri_edges(void)
+static void
+setup_tri_edges(void)
 {
    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
@@ -809,9 +744,8 @@ static void setup_tri_edges(void)
  * Render the upper or lower half of a triangle.
  * Scissoring/cliprect is applied here too.
  */
-static void subtriangle( struct edge *eleft,
-			 struct edge *eright,
-			 unsigned lines )
+static void
+subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 {
    const int minx = setup.cliprect_minx;
    const int maxx = setup.cliprect_maxx;
@@ -878,10 +812,9 @@ static void subtriangle( struct edge *eleft,
    eright->sy += lines;
 }
 
+
 static float
-determinant( const float *v0,
-             const float *v1,
-             const float *v2 )
+determinant(const float *v0, const float *v1, const float *v2)
 {
    /* edge vectors e = v0 - v2, f = v1 - v2 */
    const float ex = v0[0] - v2[0];
@@ -899,7 +832,8 @@ determinant( const float *v0,
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
+tri_draw(const float *v0, const float *v1, const float *v2,
+         uint tx, uint ty, uint front_winding)
 {
    setup.tx = tx;
    setup.ty = ty;
-- 
cgit v1.2.3


From fc562a7acd86bee4853d38961e29c8da3d56e548 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 20:19:51 -0600
Subject: cell: more clean-up in spu_tri.c

---
 src/gallium/drivers/cell/spu/spu_tri.c | 100 ++++++---------------------------
 1 file changed, 16 insertions(+), 84 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 438fae84a8..03f094373d 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -116,7 +116,7 @@ struct setup_stage {
    struct edge etop;
    struct edge emaj;
 
-   float oneoverarea;
+   float oneOverArea;
 
    uint facing;
 
@@ -507,13 +507,13 @@ setup_sort_vertices(const struct vertex_header *v0,
     * use the prim->det value because its sign is correct.
     */
    {
-      const float area = (setup.emaj.dx * setup.ebot.dy - 
-			    setup.ebot.dx * setup.emaj.dy);
+      const float area = (setup.emaj.dx * setup.ebot.dy -
+                          setup.ebot.dx * setup.emaj.dy);
 
-      setup.oneoverarea = 1.0f / area;
+      setup.oneOverArea = 1.0f / area;
       /*
       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup.oneoverarea, area, prim->det );
+                   __FUNCTION__, setup.oneOverArea, area, prim->det );
       */
    }
 
@@ -536,7 +536,7 @@ setup_sort_vertices(const struct vertex_header *v0,
  * \param slot  which attribute slot 
  */
 static INLINE void
-const_coeff(uint slot)
+const_coeff4(uint slot)
 {
    setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
    setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
@@ -544,58 +544,6 @@ const_coeff(uint slot)
 }
 
 
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static INLINE void
-tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
-{
-   uint i;
-   const float *vmin_d = (float *) &setup.vmin->data[slot];
-   const float *vmid_d = (float *) &setup.vmid->data[slot];
-   const float *vmax_d = (float *) &setup.vmax->data[slot];
-   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
-   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
-
-   for (i = firstComp; i < lastComp; i++) {
-      float botda = vmid_d[i] - vmin_d[i];
-      float majda = vmax_d[i] - vmin_d[i];
-      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
-      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-   
-      ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
-
-      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
-      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-
-      /* calculate a0 as the value which would be sampled for the
-       * fragment at (0,0), taking into account that we want to sample at
-       * pixel centers, in other words (0.5, 0.5).
-       *
-       * this is neat but unfortunately not a good way to do things for
-       * triangles with very large values of dadx or dady as it will
-       * result in the subtraction and re-addition from a0 of a very
-       * large number, which means we'll end up loosing a lot of the
-       * fractional bits and precision from a0.  the way to fix this is
-       * to define a0 as the sample at a pixel center somewhere near vmin
-       * instead - i'll switch to this later.
-       */
-      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
-                                 (setup.coef[slot].dadx.f[i] * x + 
-                                  setup.coef[slot].dady.f[i] * y));
-   }
-
-   /*
-   _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-		slot, "xyzw"[i], 
-		setup.coef[slot].a0[i],
-		setup.coef[slot].dadx.f[i],
-		setup.coef[slot].dady.f[i]);
-   */
-}
-
-
 /**
  * As above, but interp setup all four vector components.
  */
@@ -616,8 +564,8 @@ tri_linear_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
 
    vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
    vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
@@ -660,8 +608,8 @@ tri_persp_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
 
    vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
    vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
@@ -678,21 +626,17 @@ tri_persp_coeff4(uint slot)
 static void
 setup_tri_coefficients(void)
 {
-#if 1
    uint i;
 
    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
       switch (spu.vertex_info.attrib[i].interp_mode) {
       case INTERP_NONE:
          break;
-      case INTERP_POS:
-         /*tri_linear_coeff(i, 2, 3);*/
-         /* XXX interp W if PERSPECTIVE... */
-         tri_linear_coeff4(i);
-         break;
       case INTERP_CONSTANT:
-         const_coeff(i);
+         const_coeff4(i);
          break;
+      case INTERP_POS:
+         /* fall-through */
       case INTERP_LINEAR:
          tri_linear_coeff4(i);
          break;
@@ -703,13 +647,6 @@ setup_tri_coefficients(void)
          ASSERT(0);
       }
    }
-#else
-   ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
-   ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
-          spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
-   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
-   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
-#endif
 }
 
 
@@ -863,19 +800,14 @@ tri_draw(const float *v0, const float *v1, const float *v2,
    setup.span.y_flags = 0;
    setup.span.right[0] = 0;
    setup.span.right[1] = 0;
-   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
 
-   /*   init_constant_attribs( setup ); */
-      
-   if (setup.oneoverarea < 0.0) {
-      /* emaj on left:
-       */
+   if (setup.oneOverArea < 0.0) {
+      /* emaj on left */
       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
    }
    else {
-      /* emaj on right:
-       */
+      /* emaj on right */
       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
    }
-- 
cgit v1.2.3


From f8bddf698d523f597fea0f721b064daee81d8005 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:11:52 -0600
Subject: cell: basic mipmap filtering works now

Though, only GL_MIPMAP_NEAREST / GL_LINEAR works right now.
---
 src/gallium/drivers/cell/spu/spu_command.c |  21 ++++--
 src/gallium/drivers/cell/spu/spu_funcs.c   |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h    |   3 +-
 src/gallium/drivers/cell/spu/spu_texture.c | 106 +++++++++++++++--------------
 src/gallium/drivers/cell/spu/spu_texture.h |   8 +--
 5 files changed, 79 insertions(+), 61 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 089af22415..4e98eea338 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,16 +301,18 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-#if 0
+
    if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      /* use lambda/lod to determine min vs. mag filter */
       spu.sample_texture4[sampler->unit] = sample_texture4_lod;
    }
-   else
-#endif
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+   else if (spu.sampler[sampler->unit].min_img_filter
+            == PIPE_TEX_FILTER_LINEAR) {
+      /* min = mag = bilinear */
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
    else {
+      /* min = mag = inearest */
       spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
    }
 }
@@ -322,8 +324,12 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint unit = texture->unit;
    uint i;
 
+   //if (spu.init.id==0) Debug=1;
+
    DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
+   spu.texture[unit].max_level = 0;
+
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
       uint height = texture->height[i];
@@ -335,14 +341,19 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].width = width;
       spu.texture[unit].level[i].height = height;
 
-      spu.texture[unit].level[i].tiles_per_row = width / TILE_SIZE;
+      spu.texture[unit].level[i].tiles_per_row =
+         (width + TILE_SIZE - 1) / TILE_SIZE;
 
       spu.texture[unit].level[i].width4 = spu_splats((float) width);
       spu.texture[unit].level[i].height4 = spu_splats((float) height);
 
       spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
       spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+
+      if (texture->start[i])
+         spu.texture[unit].max_level = i;
    }
+   //Debug=0;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index f2946010bd..66b82f673d 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,7 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
+   spu.sample_texture4[unit](s, t, r, q, unit, 0, colors.v);
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 9515543efe..cfb645add0 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -68,7 +68,7 @@ typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
                                          vector float q,
-                                         uint unit,
+                                         uint unit, uint level,
                                          vector float colors[4]);
 
 
@@ -121,6 +121,7 @@ struct spu_texture_level
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+   uint max_level;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 96c09e3ccb..10036330c6 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 
-#include <transpose_matrix4x4.h>
 #include <math.h>
 
 #include "pipe/p_compiler.h"
@@ -43,12 +42,14 @@
 void
 invalidate_tex_cache(void)
 {
-   uint lvl = 0;
-   uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].level[lvl].width
-      * spu.texture[unit].level[lvl].height;
+   uint lvl;
+   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+      uint unit = 0;
+      uint bytes = 4 * spu.texture[unit].level[lvl].width
+         * spu.texture[unit].level[lvl].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+      spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+   }
 }
 
 
@@ -71,8 +72,8 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    const unsigned texture_ea = (uintptr_t) tlevel->start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
-   vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
@@ -106,20 +107,19 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, vector float colors[4])
+                        uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
-   vector float ss = spu_mul(s, spu.texture[unit].level[lvl].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].level[lvl].height4);
+   vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
    vector unsigned int is = spu_convtu(ss, 0);
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
 
-   get_four_texels(unit, lvl, is, it, texels);
+   get_four_texels(unit, level, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -133,11 +133,10 @@ sample_texture4_nearest(vector float s, vector float t,
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4])
+                         uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
-   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, spu_splats(-0.5f));
+   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
 
    vector unsigned int is0 = spu_convtu(ss, 0);
    vector unsigned int it0 = spu_convtu(tt, 0);
@@ -147,17 +146,17 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -273,14 +272,13 @@ transpose(vector unsigned int *mOut0,
  */
 void
 sample_texture4_bilinear_2(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, vector float colors[4])
+                           vector float r, vector float q,
+                           uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, half);
+   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
    vector unsigned int is = spu_convtu(ss, 8);
@@ -301,17 +299,17 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -379,9 +377,9 @@ sample_texture4_bilinear_2(vector float s, vector float t,
 static float
 compute_lambda(uint unit, vector float s, vector float t)
 {
-   uint lvl = 0;
-   float width = spu.texture[unit].level[lvl].width;
-   float height = spu.texture[unit].level[lvl].width;
+   uint baseLevel = 0;
+   float width = spu.texture[unit].level[baseLevel].width;
+   float height = spu.texture[unit].level[baseLevel].width;
    float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
    float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
    float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
@@ -402,22 +400,30 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, vector float colors[4])
+                    uint unit, uint level, vector float colors[4])
 {
+   /*
+    * Note that we're computing a lambda/lod here that's used for all
+    * four pixels in the quad.
+    */
    float lambda = compute_lambda(unit, s, t);
 
+   /* apply lod bias */
+   lambda += spu.sampler[unit].lod_bias;
+
+   /* clamp */
    if (lambda < spu.sampler[unit].min_lod)
       lambda = spu.sampler[unit].min_lod;
    else if (lambda > spu.sampler[unit].max_lod)
       lambda = spu.sampler[unit].max_lod;
 
-   /* hack for now */
-   int level = (int) lambda;
-   if (level > 3)
-      level = 3;
+   /* convert to int level */
+   level = (int) (lambda + 0.5f);
+   ASSERT(level >= 0);
+
+   if (level > spu.texture[unit].max_level)
+      level = spu.texture[unit].max_level;
 
-   /*
    sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
-   */
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 4802f7c47c..ec06a50b4a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -39,24 +39,24 @@ invalidate_tex_cache(void);
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, vector float colors[4]);
+                        uint unit, uint level, vector float colors[4]);
 
 
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4]);
+                         uint unit, uint level, vector float colors[4]);
 
 extern void
 sample_texture4_bilinear_2(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4]);
+                           uint unit, uint level, vector float colors[4]);
 
 
 extern void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, vector float colors[4]);
+                    uint unit, uint level, vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 6d2d5ceca21c87bea5e269e8099fb6f1d821b97a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:42:21 -0600
Subject: cell: use minify vs magnify filters

---
 src/gallium/drivers/cell/spu/spu_command.c | 50 +++++++++++++++++++++++-------
 src/gallium/drivers/cell/spu/spu_main.h    |  2 ++
 src/gallium/drivers/cell/spu/spu_texture.c | 22 +++++++------
 3 files changed, 53 insertions(+), 21 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 4e98eea338..fa78377c66 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -298,22 +298,48 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
-   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+   uint unit = sampler->unit;
 
-   spu.sampler[sampler->unit] = sampler->state;
+   DEBUG_PRINTF("SAMPLER [%u]\n", unit);
 
-   if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
-      /* use lambda/lod to determine min vs. mag filter */
-      spu.sample_texture4[sampler->unit] = sample_texture4_lod;
+   spu.sampler[unit] = sampler->state;
+
+   switch (spu.sampler[unit].min_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.min_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
    }
-   else if (spu.sampler[sampler->unit].min_img_filter
-            == PIPE_TEX_FILTER_LINEAR) {
-      /* min = mag = bilinear */
-      spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
+
+   switch (spu.sampler[sampler->unit].mag_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
    }
-   else {
-      /* min = mag = inearest */
-      spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
+
+   switch (spu.sampler[sampler->unit].min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NEAREST:
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      spu.sample_texture4[unit] = sample_texture4_lod;
+      break;
+   case PIPE_TEX_MIPFILTER_NONE:
+      spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+      break;
+   default:
+      ASSERT(0);
    }
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index cfb645add0..56aac655e9 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -170,6 +170,8 @@ struct spu_global
 
    /** Current texture sampler function */
    spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 10036330c6..267f2302f6 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -400,7 +400,7 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level, vector float colors[4])
+                    uint unit, uint level_ignored, vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -417,13 +417,17 @@ sample_texture4_lod(vector float s, vector float t,
    else if (lambda > spu.sampler[unit].max_lod)
       lambda = spu.sampler[unit].max_lod;
 
-   /* convert to int level */
-   level = (int) (lambda + 0.5f);
-   ASSERT(level >= 0);
-
-   if (level > spu.texture[unit].max_level)
-      level = spu.texture[unit].max_level;
-
-   sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
+   if (lambda <= 0.0f) {
+      /* magnify */
+      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
+   }
+   else {
+      /* minify */
+      int level = (int) (lambda + 0.5f);
+      if (level > (int) spu.texture[unit].max_level)
+         level = spu.texture[unit].max_level;
+      spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
+      /* XXX to do: mipmap level interpolation */
+   }
 }
 
-- 
cgit v1.2.3


From 4f56d5bbf2e52c815c820138eaad6c0fb93d47ba Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:52:16 -0600
Subject: cell: fix broken negative texcoord conversion

---
 src/gallium/drivers/cell/spu/spu_texture.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 267f2302f6..83cf7dc394 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -138,8 +138,8 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
    vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
 
-   vector unsigned int is0 = spu_convtu(ss, 0);
-   vector unsigned int it0 = spu_convtu(tt, 0);
+   vector unsigned int is0 = (vector unsigned int) spu_convts(ss, 0);
+   vector unsigned int it0 = (vector unsigned int) spu_convts(tt, 0);
 
    /* is + 1, it + 1 */
    vector unsigned int is1 = spu_add(is0, 1);
@@ -281,8 +281,8 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
-   vector unsigned int is = spu_convtu(ss, 8);
-   vector unsigned int it = spu_convtu(tt, 8);
+   vector unsigned int is = (vector unsigned int) spu_convts(ss, 8);
+   vector unsigned int it = (vector unsigned int) spu_convts(tt, 8);
 
    /* compute integer texel weights in [0, 255] */
    vector signed int sWeights0 = spu_and((vector signed int) is, 255);
-- 
cgit v1.2.3


From 85dc1aec9c5fc63a01bb8db07215b84790d15d8f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 15:19:01 -0600
Subject: cell: support NPOT textures, clamp/repeat mode, normalized/unorm
 texcoords

glDrawPixels works now.
---
 src/gallium/drivers/cell/spu/spu_command.c | 48 +++++++++++++--
 src/gallium/drivers/cell/spu/spu_main.h    | 12 ++--
 src/gallium/drivers/cell/spu/spu_texture.c | 99 ++++++++++++++++++++----------
 3 files changed, 117 insertions(+), 42 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index fa78377c66..b1efe97e76 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -295,6 +295,42 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+/**
+ * Tex texture mask_s/t and scale_s/t fields depend on the texture size and
+ * sampler wrap modes.
+ */
+static void
+update_tex_masks(struct spu_texture *texture,
+                 const struct pipe_sampler_state *sampler)
+{
+   uint i;
+
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      int width = texture->level[i].width;
+      int height = texture->level[i].height;
+
+      if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_s = spu_splats(width - 1);
+      else
+         texture->level[i].mask_s = spu_splats(~0);
+
+      if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_t = spu_splats(height - 1);
+      else
+         texture->level[i].mask_t = spu_splats(~0);
+
+      if (sampler->normalized_coords) {
+         texture->level[i].scale_s = spu_splats((float) width);
+         texture->level[i].scale_t = spu_splats((float) height);
+      }
+      else {
+         texture->level[i].scale_s = spu_splats(1.0f);
+         texture->level[i].scale_t = spu_splats(1.0f);
+      }
+   }
+}
+
+
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
@@ -341,6 +377,8 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    default:
       ASSERT(0);
    }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
@@ -370,15 +408,15 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
-      spu.texture[unit].level[i].width4 = spu_splats((float) width);
-      spu.texture[unit].level[i].height4 = spu_splats((float) height);
-
-      spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
-      spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+      spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
+      spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
 
       if (texture->start[i])
          spu.texture[unit].max_level = i;
    }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+
    //Debug=0;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 56aac655e9..45c6f4ced1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,17 +107,21 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
+/** per-texture level info */
 struct spu_texture_level
 {
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float width4;   /**< == {width, width, width, width} */
-   vector float height4;  /**< == {height, height, height, height} */
-   vector unsigned int tex_size_x_mask; /**< splat(width-1) */
-   vector unsigned int tex_size_y_mask; /**< splat(height-1) */
+   /** texcoord scale factors */
+   vector float scale_s, scale_t;
+   /** texcoord masks (if REPEAT then size-1, else ~0) */
+   vector signed int mask_s, mask_t;
+   /** texcoord clamp limits */
+   vector signed int max_s, max_t;
 } ALIGN16_ATTRIB;
 
+
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 83cf7dc394..b21c43a467 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -67,13 +67,13 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
+get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    const unsigned texture_ea = (uintptr_t) tlevel->start;
-   const vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
-   const vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
@@ -99,6 +99,20 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 }
 
 
+/** clamp vec to [0, max] */
+static INLINE vector signed int
+spu_clamp(vector signed int vec, vector signed int max)
+{
+   static const vector signed int zero = {0,0,0,0};
+   vector unsigned int c;
+   c = spu_cmpgt(vec, zero);    /* c = vec > zero ? ~0 : 0 */
+   vec = spu_sel(zero, vec, c);
+   c = spu_cmpgt(vec, max);    /* c = vec > max ? ~0 : 0 */
+   vec = spu_sel(vec, max, c);
+   return vec;
+}
+
+
 
 /**
  * Do nearest texture sampling for four pixels.
@@ -109,15 +123,20 @@ sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, uint level, vector float colors[4])
 {
-   vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
-   vector unsigned int is = spu_convtu(ss, 0);
-   vector unsigned int it = spu_convtu(tt, 0);
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_mul(s, tlevel->scale_s);
+   vector float tt = spu_mul(t, tlevel->scale_t);
+   vector signed int is = spu_convts(ss, 0);
+   vector signed int it = spu_convts(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
+   is = spu_and(is, tlevel->mask_s);
+   it = spu_and(it, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is = spu_clamp(is, tlevel->max_s);
+   it = spu_clamp(it, tlevel->max_t);
 
    get_four_texels(unit, level, is, it, texels);
 
@@ -135,21 +154,28 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, uint level, vector float colors[4])
 {
-   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_madd(s, tlevel->scale_s,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, tlevel->scale_t, spu_splats(-0.5f));
 
-   vector unsigned int is0 = (vector unsigned int) spu_convts(ss, 0);
-   vector unsigned int it0 = (vector unsigned int) spu_convts(tt, 0);
+   vector signed int is0 = spu_convts(ss, 0);
+   vector signed int it0 = spu_convts(tt, 0);
 
    /* is + 1, it + 1 */
-   vector unsigned int is1 = spu_add(is0, 1);
-   vector unsigned int it1 = spu_add(it0, 1);
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
    /* get packed int texels */
    vector unsigned int texels[16];
@@ -275,34 +301,41 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                            vector float r, vector float q,
                            uint unit, uint level, vector float colors[4])
 {
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
-   vector unsigned int is = (vector unsigned int) spu_convts(ss, 8);
-   vector unsigned int it = (vector unsigned int) spu_convts(tt, 8);
+   vector signed int is = spu_convts(ss, 8);
+   vector signed int it = spu_convts(tt, 8);
 
    /* compute integer texel weights in [0, 255] */
-   vector signed int sWeights0 = spu_and((vector signed int) is, 255);
-   vector signed int tWeights0 = spu_and((vector signed int) it, 255);
+   vector signed int sWeights0 = spu_and(is, 255);
+   vector signed int tWeights0 = spu_and(it, 255);
    vector signed int sWeights1 = spu_sub(255, sWeights0);
    vector signed int tWeights1 = spu_sub(255, tWeights0);
 
    /* texel coords: is0 = is / 256, it0 = is / 256 */
-   vector unsigned int is0 = spu_rlmask(is, -8);
-   vector unsigned int it0 = spu_rlmask(it, -8);
+   vector signed int is0 = spu_rlmask(is, -8);
+   vector signed int it0 = spu_rlmask(it, -8);
 
    /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
-   vector unsigned int is1 = spu_add(is0, 1);
-   vector unsigned int it1 = spu_add(it0, 1);
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-- 
cgit v1.2.3


From 8f7c6b55ae962e30f32cfec9a14a652d3b5b5943 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:11:29 -0600
Subject: cell: support for cubemaps

Though, progs/demos/cubemap.c doesn't quite work right...
---
 src/gallium/drivers/cell/common.h              |   1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c |   2 +
 src/gallium/drivers/cell/ppu/cell_texture.c    |  37 ++++--
 src/gallium/drivers/cell/spu/spu_command.c     |  17 ++-
 src/gallium/drivers/cell/spu/spu_funcs.c       |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h        |   4 +-
 src/gallium/drivers/cell/spu/spu_texture.c     | 171 ++++++++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_texture.h     |  21 ++-
 8 files changed, 214 insertions(+), 41 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index e4de9a551d..c1e78f4db3 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -251,6 +251,7 @@ struct cell_command_sampler
 struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint target;         /**< PIPE_TEXTURE_x */
    uint unit;
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
    ushort width[CELL_MAX_TEXTURE_LEVELS];
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index cae546b700..d4a867ffcf 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -217,6 +217,7 @@ cell_emit_state(struct cell_context *cell)
                texture->width[level] = cell->texture[i]->base.width[level];
                texture->height[level] = cell->texture[i]->base.height[level];
             }
+            texture->target = cell->texture[i]->base.target;
          }
          else {
             uint level;
@@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell)
                texture->width[level] = 0;
                texture->height[level] = 0;
             }
+            texture->target = 0;
          }
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 4fd66bdea0..4c92ef154f 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -137,6 +137,7 @@ cell_texture_release(struct pipe_screen *screen,
    */
    if (--(*pt)->refcount <= 0) {
       struct cell_texture *ct = cell_texture(*pt);
+      uint i;
 
       /*
       DBG("%s deleting %p\n", __FUNCTION__, (void *) ct);
@@ -144,6 +145,12 @@ cell_texture_release(struct pipe_screen *screen,
 
       pipe_buffer_reference(screen, &ct->buffer, NULL);
 
+      for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+         if (ct->tiled_data[i]) {
+            FREE(ct->tiled_data[i]);
+         }
+      }
+
       FREE(ct);
    }
    *pt = NULL;
@@ -204,27 +211,33 @@ static void
 cell_twiddle_texture(struct pipe_screen *screen,
                      struct pipe_surface *surface)
 {
-   struct cell_texture *texture = cell_texture(surface->texture);
+   struct cell_texture *ct = cell_texture(surface->texture);
    const uint level = surface->level;
-   const uint texWidth = texture->base.width[level];
-   const uint texHeight = texture->base.height[level];
+   const uint texWidth = ct->base.width[level];
+   const uint texHeight = ct->base.height[level];
    const uint bufWidth = align(texWidth, TILE_SIZE);
    const uint bufHeight = align(texHeight, TILE_SIZE);
    const void *map = pipe_buffer_map(screen, surface->buffer,
                                      PIPE_BUFFER_USAGE_CPU_READ);
    const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
 
-   switch (texture->base.format) {
+   switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      /* free old tiled data */
-      if (texture->tiled_data[level]) {
-         align_free(texture->tiled_data[level]);
+      {
+         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+         int offset = bufWidth * bufHeight * 4 * surface->face;
+         uint *dst;
+
+         if (!ct->tiled_data[level]) {
+            ct->tiled_data[level] =
+               align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+         }
+
+         dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+
+         twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+                            surface->stride, src);
       }
-      /* alloc new tiled data */
-      texture->tiled_data[level] = align_malloc(bufWidth * bufHeight * 4, 16);
-      twiddle_image_uint(texWidth, texHeight, TILE_SIZE,
-                         texture->tiled_data[level],
-                         surface->stride, src);
       break;
    default:
       printf("Cell: twiddle unsupported texture format\n");
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index b1efe97e76..c951fa6f31 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,7 +301,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
  */
 static void
 update_tex_masks(struct spu_texture *texture,
-                 const struct pipe_sampler_state *sampler)
+                 const struct pipe_sampler_state *sampler,
+                 uint unit)
 {
    uint i;
 
@@ -328,6 +329,11 @@ update_tex_masks(struct spu_texture *texture,
          texture->level[i].scale_t = spu_splats(1.0f);
       }
    }
+
+   /* XXX temporary hack */
+   if (texture->target == PIPE_TEXTURE_CUBE) {
+      spu.sample_texture4[unit] = sample_texture4_cube;
+   }
 }
 
 
@@ -378,7 +384,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
       ASSERT(0);
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 }
 
 
@@ -393,6 +399,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
    spu.texture[unit].max_level = 0;
+   spu.texture[unit].target = texture->target;
 
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
@@ -408,6 +415,10 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
+      spu.texture[unit].level[i].bytes_per_image =
+         4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1))
+         * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1));
+
       spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
       spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
 
@@ -415,7 +426,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
          spu.texture[unit].max_level = i;
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 
    //Debug=0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 66b82f673d..5c3ee305d4 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,7 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, 0, colors.v);
+   spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 45c6f4ced1..8781041bff 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -68,7 +68,7 @@ typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
                                          vector float q,
-                                         uint unit, uint level,
+                                         uint unit, uint level, uint face,
                                          vector float colors[4]);
 
 
@@ -113,6 +113,7 @@ struct spu_texture_level
    void *start;
    ushort width, height;
    ushort tiles_per_row;
+   uint bytes_per_image;
    /** texcoord scale factors */
    vector float scale_s, scale_t;
    /** texcoord masks (if REPEAT then size-1, else ~0) */
@@ -126,6 +127,7 @@ struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
    uint max_level;
+   uint target;  /**< PIPE_TEXTURE_x */
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index b21c43a467..2570f02c73 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -48,6 +48,9 @@ invalidate_tex_cache(void)
       uint bytes = 4 * spu.texture[unit].level[lvl].width
          * spu.texture[unit].level[lvl].height;
 
+      if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
+         bytes *= 6;
+
       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
    }
 }
@@ -67,11 +70,11 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
+get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
-   const unsigned texture_ea = (uintptr_t) tlevel->start;
+   unsigned texture_ea = (uintptr_t) tlevel->start;
    const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
@@ -88,6 +91,8 @@ get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
    
    vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
    
+   texture_ea = texture_ea + face * tlevel->bytes_per_image;
+
    spu_dcache_fetch_unaligned((qword *) & texels[0],
                               texture_ea + spu_extract(offset, 0), 4);
    spu_dcache_fetch_unaligned((qword *) & texels[1],
@@ -121,7 +126,8 @@ spu_clamp(vector signed int vec, vector signed int max)
 void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, uint level, vector float colors[4])
+                        uint unit, uint level, uint face,
+                        vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    vector float ss = spu_mul(s, tlevel->scale_s);
@@ -138,7 +144,7 @@ sample_texture4_nearest(vector float s, vector float t,
    is = spu_clamp(is, tlevel->max_s);
    it = spu_clamp(it, tlevel->max_t);
 
-   get_four_texels(unit, level, is, it, texels);
+   get_four_texels(unit, level, face, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -152,11 +158,14 @@ sample_texture4_nearest(vector float s, vector float t,
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, uint level, vector float colors[4])
+                         uint unit, uint level, uint face,
+                         vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
-   vector float ss = spu_madd(s, tlevel->scale_s,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, tlevel->scale_t, spu_splats(-0.5f));
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
 
    vector signed int is0 = spu_convts(ss, 0);
    vector signed int it0 = spu_convts(tt, 0);
@@ -179,10 +188,10 @@ sample_texture4_bilinear(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -299,10 +308,12 @@ transpose(vector unsigned int *mOut0,
 void
 sample_texture4_bilinear_2(vector float s, vector float t,
                            vector float r, vector float q,
-                           uint unit, uint level, vector float colors[4])
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
    /* Scale texcoords by size of texture, and add half pixel bias */
    vector float ss = spu_madd(s, tlevel->scale_s, half);
    vector float tt = spu_madd(t, tlevel->scale_t, half);
@@ -339,10 +350,10 @@ sample_texture4_bilinear_2(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -433,7 +444,8 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level_ignored, vector float colors[4])
+                    uint unit, uint level_ignored, uint face,
+                    vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -452,15 +464,136 @@ sample_texture4_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
+      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
    }
    else {
       /* minify */
       int level = (int) (lambda + 0.5f);
       if (level > (int) spu.texture[unit].max_level)
          level = spu.texture[unit].max_level;
-      spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
+      spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
       /* XXX to do: mipmap level interpolation */
    }
 }
 
+
+/** XXX need a SIMD version of this */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+   /*
+      major axis
+      direction     target                             sc     tc    ma
+      ----------    -------------------------------    ---    ---   ---
+       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   const float arx = fabsf(rx);
+   const float ary = fabsf(ry);
+   const float arz = fabsf(rz);
+   unsigned face;
+   float sc, tc, ma;
+
+   if (arx > ary && arx > arz) {
+      if (rx >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_X;
+         sc = -rz;
+         tc = -ry;
+         ma = arx;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_X;
+         sc = rz;
+         tc = -ry;
+         ma = arx;
+      }
+   }
+   else if (ary > arx && ary > arz) {
+      if (ry >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_Y;
+         sc = rx;
+         tc = rz;
+         ma = ary;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Y;
+         sc = rx;
+         tc = -rz;
+         ma = ary;
+      }
+   }
+   else {
+      if (rz > 0.0F) {
+         face = PIPE_TEX_FACE_POS_Z;
+         sc = rx;
+         tc = -ry;
+         ma = arz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Z;
+         sc = -rx;
+         tc = -ry;
+         ma = arz;
+      }
+   }
+
+   *newS = (sc / ma + 1.0F) * 0.5F;
+   *newT = (tc / ma + 1.0F) * 0.5F;
+
+   return face;
+}
+
+
+
+void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level, int face_ignored,
+                     vector float colors[4])
+{
+   static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
+   uint p, faces[4];
+   float newS[4], newT[4];
+
+   /* Compute cube face referenced by the four sets of texcoords.
+    * XXX we should SIMD-ize this.
+    */
+   for (p = 0; p < 4; p++) {      
+      float rx = spu_extract(s, p);
+      float ry = spu_extract(t, p);
+      float rz = spu_extract(r, p);
+      faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
+   }
+
+   if (faces[0] == faces[1] &&
+       faces[0] == faces[2] &&
+       faces[0] == faces[3]) {
+      /* GOOD!  All four texcoords refer to the same cube face */
+      s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
+      t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
+      sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+   }
+   else {
+      /* BAD!  The four texcoords refer to different faces */
+      for (p = 0; p < 4; p++) {      
+         vector float c[4];
+
+         sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+                                 zero, zero, unit, level, faces[p], c);
+
+         float red = spu_extract(c[0], p);
+         float green = spu_extract(c[1], p);
+         float blue = spu_extract(c[2], p);
+         float alpha = spu_extract(c[3], p);
+
+         colors[0] = spu_insert(red,   colors[0], p);
+         colors[1] = spu_insert(green, colors[1], p);
+         colors[2] = spu_insert(blue,  colors[2], p);
+         colors[3] = spu_insert(alpha, colors[3], p);
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index ec06a50b4a..08b891a4a8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -39,24 +39,35 @@ invalidate_tex_cache(void);
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, uint level, vector float colors[4]);
+                        uint unit, uint level, uint face,
+                        vector float colors[4]);
 
 
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, uint level, vector float colors[4]);
+                         uint unit, uint level, uint face,
+                         vector float colors[4]);
 
 extern void
 sample_texture4_bilinear_2(vector float s, vector float t,
-                         vector float r, vector float q,
-                           uint unit, uint level, vector float colors[4]);
+                           vector float r, vector float q,
+                           uint unit, uint level, uint face,
+                           vector float colors[4]);
 
 
 extern void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level, vector float colors[4]);
+                    uint unit, uint level, uint face,
+                    vector float colors[4]);
+
+
+extern void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level_ignored, int face_ignored,
+                     vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From e42a394ed5ca00a9d0a51a0c26d4fef9959ba43c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:19:57 -0600
Subject: cell: fix incorrect parameter type

---
 src/gallium/drivers/cell/spu/spu_texture.c | 2 +-
 src/gallium/drivers/cell/spu/spu_texture.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 2570f02c73..9e25094d13 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -552,7 +552,7 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
 void
 sample_texture4_cube(vector float s, vector float t,
                      vector float r, vector float q,
-                     uint unit, uint level, int face_ignored,
+                     uint unit, uint level, uint face_ignored,
                      vector float colors[4])
 {
    static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 08b891a4a8..387484c3ad 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -66,7 +66,7 @@ sample_texture4_lod(vector float s, vector float t,
 extern void
 sample_texture4_cube(vector float s, vector float t,
                      vector float r, vector float q,
-                     uint unit, uint level_ignored, int face_ignored,
+                     uint unit, uint level_ignored, uint face_ignored,
                      vector float colors[4]);
 
 
-- 
cgit v1.2.3


From 41ccdde767e7aba6e8e6a9a035eacd6338c03a95 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:22:40 -0600
Subject: cell: initial bits for 3D texture support

---
 src/gallium/drivers/cell/common.h              |  1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  2 ++
 src/gallium/drivers/cell/spu/spu_command.c     | 13 +++++++++++--
 src/gallium/drivers/cell/spu/spu_main.h        |  8 ++++----
 src/gallium/drivers/cell/spu/spu_texture.c     |  2 ++
 5 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c1e78f4db3..b0169b8e32 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -256,6 +256,7 @@ struct cell_command_texture
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
    ushort width[CELL_MAX_TEXTURE_LEVELS];
    ushort height[CELL_MAX_TEXTURE_LEVELS];
+   ushort depth[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index d4a867ffcf..bb694aa107 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -216,6 +216,7 @@ cell_emit_state(struct cell_context *cell)
                texture->start[level] = cell->texture[i]->tiled_data[level];
                texture->width[level] = cell->texture[i]->base.width[level];
                texture->height[level] = cell->texture[i]->base.height[level];
+               texture->depth[level] = cell->texture[i]->base.depth[level];
             }
             texture->target = cell->texture[i]->base.target;
          }
@@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell)
                texture->start[level] = NULL;
                texture->width[level] = 0;
                texture->height[level] = 0;
+               texture->depth[level] = 0;
             }
             texture->target = 0;
          }
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c951fa6f31..c28677ebf8 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -59,6 +59,14 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
 
 
+static INLINE int
+align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1);
+}
+
+
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -404,6 +412,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
       uint height = texture->height[i];
+      uint depth = texture->depth[i];
 
       DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
              texture->start[i], texture->width[i], texture->height[i]);
@@ -411,13 +420,13 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].start = texture->start[i];
       spu.texture[unit].level[i].width = width;
       spu.texture[unit].level[i].height = height;
+      spu.texture[unit].level[i].depth = depth;
 
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
       spu.texture[unit].level[i].bytes_per_image =
-         4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1))
-         * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1));
+         4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth;
 
       spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
       spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 8781041bff..eff43b870c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -111,15 +111,15 @@ struct spu_framebuffer
 struct spu_texture_level
 {
    void *start;
-   ushort width, height;
+   ushort width, height, depth;
    ushort tiles_per_row;
    uint bytes_per_image;
    /** texcoord scale factors */
-   vector float scale_s, scale_t;
+   vector float scale_s, scale_t, scale_r;
    /** texcoord masks (if REPEAT then size-1, else ~0) */
-   vector signed int mask_s, mask_t;
+   vector signed int mask_s, mask_t, mask_r;
    /** texcoord clamp limits */
-   vector signed int max_s, max_t;
+   vector signed int max_s, max_t, max_r;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 9e25094d13..42eb06a362 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -50,6 +50,8 @@ invalidate_tex_cache(void)
 
       if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
          bytes *= 6;
+      else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
+         bytes *= spu.texture[unit].level[lvl].depth;
 
       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
    }
-- 
cgit v1.2.3


From 4e506f422a13b20fcc95edb6c7048a9de6e32efa Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 10:53:48 -0600
Subject: cell: fix/add some fallback blend cases

---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 49 ++++++++++++++++++++--
 1 file changed, 46 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index d252fa6dc1..9404704abf 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -40,6 +40,24 @@
 #define LINEAR_QUAD_LAYOUT 1
 
 
+static INLINE vector float
+spu_min(vector float a, vector float b)
+{
+   vector unsigned int m;
+   m = spu_cmpgt(a, b);    /* m = a > b ? ~0 : 0 */
+   return spu_sel(a, b, m);
+}
+
+
+static INLINE vector float
+spu_max(vector float a, vector float b)
+{
+   vector unsigned int m;
+   m = spu_cmpgt(a, b);    /* m = a > b ? ~0 : 0 */
+   return spu_sel(b, a, m);
+}
+
+
 /**
  * Called by rasterizer for each quad after the shader has run.  Do
  * all the per-fragment operations including alpha test, z test,
@@ -293,9 +311,9 @@ spu_fallback_fragment_ops(uint x, uint y,
        */
       switch (spu.blend.rgb_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
-         term2r = fragR;
-         term2g = fragG;
-         term2b = fragB;
+         term2r = fbRGBA[0];
+         term2g = fbRGBA[1];
+         term2b = fbRGBA[2];
          break;
       case PIPE_BLENDFACTOR_ZERO:
          term2r =
@@ -361,8 +379,24 @@ spu_fallback_fragment_ops(uint x, uint y,
          fragG = spu_sub(term1g, term2g);
          fragB = spu_sub(term1b, term2b);
          break;
+      case PIPE_BLEND_REVERSE_SUBTRACT:
+         fragR = spu_sub(term2r, term1r);
+         fragG = spu_sub(term2g, term1g);
+         fragB = spu_sub(term2b, term1b);
+         break;
+      case PIPE_BLEND_MIN:
+         fragR = spu_min(term1r, term2r);
+         fragG = spu_min(term1g, term2g);
+         fragB = spu_min(term1b, term2b);
+         break;
+      case PIPE_BLEND_MAX:
+         fragR = spu_max(term1r, term2r);
+         fragG = spu_max(term1g, term2g);
+         fragB = spu_max(term1b, term2b);
+         break;
       /* XXX more cases */
       default:
+         printf("unsup 0x%x\n", spu.blend.rgb_func);
          ASSERT(0);
       }
 
@@ -376,6 +410,15 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLEND_SUBTRACT:
          fragA = spu_sub(term1a, term2a);
          break;
+      case PIPE_BLEND_REVERSE_SUBTRACT:
+         fragA = spu_sub(term2a, term1a);
+         break;
+      case PIPE_BLEND_MIN:
+         fragA = spu_min(term1a, term2a);
+         break;
+      case PIPE_BLEND_MAX:
+         fragA = spu_max(term1a, term2a);
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
-- 
cgit v1.2.3


From f60c756ed14f25731ff2a52d6b695ceb5b7a6f6b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 10:54:06 -0600
Subject: cell: additional debug

---
 src/gallium/drivers/cell/spu/spu_command.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c28677ebf8..a07b312111 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -250,6 +250,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
 
    /* Expand each float to float[4] for SOA execution */
    for (i = 0; i < num_const; i++) {
+      DEBUG_PRINTF("  const[%u] = %f\n", i, constants[i]);
       spu.constants[i] = spu_splats(constants[i]);
    }
 
-- 
cgit v1.2.3


From 9382a7100fd6de6e615dc661ed813bf43e24ec15 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 10:54:36 -0600
Subject: cell: updated vertex dump/debug code

---
 src/gallium/drivers/cell/spu/spu_tri.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 03f094373d..2417db8960 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -404,11 +404,14 @@ flush_spans(void)
 static void
 print_vertex(const struct vertex_header *v)
 {
-   int i;
-   fprintf(stderr, "Vertex: (%p)\n", v);
-   for (i = 0; i < setup.quad.nr_attrs; i++) {
-      fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
-              v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
+   uint i;
+   fprintf(stderr, "  Vertex: (%p)\n", v);
+   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+      fprintf(stderr, "    %d: %f %f %f %f\n",  i, 
+              spu_extract(v->data[i], 0),
+              spu_extract(v->data[i], 1),
+              spu_extract(v->data[i], 2),
+              spu_extract(v->data[i], 3));
    }
 }
 #endif
@@ -420,10 +423,12 @@ setup_sort_vertices(const struct vertex_header *v0,
                     const struct vertex_header *v2)
 {
 #if DEBUG_VERTS
-   fprintf(stderr, "Triangle:\n");
-   print_vertex(v0);
-   print_vertex(v1);
-   print_vertex(v2);
+   if (spu.init.id==0) {
+      fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
+      print_vertex(v0);
+      print_vertex(v1);
+      print_vertex(v2);
+   }
 #endif
 
    setup.vprovoke = v2;
-- 
cgit v1.2.3


From 53951531ae7bfd64afae1ae55aac7f6ebd3fe4f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 12:35:51 -0600
Subject: cell: propogate blend color to SPUs for the fallback fragment ops
 code

---
 src/gallium/drivers/cell/common.h                  |  4 ++
 src/gallium/drivers/cell/ppu/cell_context.h        |  1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  1 +
 src/gallium/drivers/cell/spu/spu_command.c         |  1 +
 src/gallium/drivers/cell/spu/spu_main.h            |  1 +
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 75 +++++++++++++++++++---
 6 files changed, 74 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index b0169b8e32..3b5a25e165 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -118,12 +118,16 @@
 
 /**
  * Command to specify per-fragment operations state and generated code.
+ * Note that the dsa, blend, blend_color fields are really only needed
+ * for the fallback/C per-pixel code.  They're not used when we generate
+ * dynamic SPU fragment code (which is the normal case).
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 80a9b3d7e1..1fcf03c2b8 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -74,6 +74,7 @@ struct cell_fragment_shader_state
 struct cell_fragment_ops_key
 {
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    struct pipe_depth_stencil_alpha_state dsa;
    enum pipe_format color_format;
    enum pipe_format zs_format;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index bb694aa107..d2427584ba 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -52,6 +52,7 @@ lookup_fragment_ops(struct cell_context *cell)
     */
    memset(&key, 0, sizeof(key));
    key.blend = *cell->blend;
+   key.blend_color = cell->blend_color;
    key.dsa = *cell->depth_stencil;
 
    if (cell->framebuffer.cbufs[0])
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a07b312111..b521c3aecf 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -195,6 +195,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+   memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
 
    /* Parity twist!  For now, always use the fallback code by default,
     * only switching to codegen when specifically requested.  This
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index eff43b870c..ca72baea8b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -145,6 +145,7 @@ struct spu_global
    struct spu_framebuffer fb;
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
    struct vertex_info vertex_info;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 9404704abf..f8ffc70492 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -260,7 +260,7 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
 
       /*
-       * Compute Src RGB terms
+       * Compute Src RGB terms (fragment color * factor)
        */
       switch (spu.blend.rgb_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -283,13 +283,33 @@ spu_fallback_fragment_ops(uint x, uint y,
          term1g = spu_mul(fragG, fragA);
          term1b = spu_mul(fragB, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term1r = spu_mul(fragR, fbRGBA[0]);
+         term1g = spu_mul(fragG, fbRGBA[1]);
+         term1b = spu_mul(fragB, fbRGBA[1]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1r = spu_mul(fragR, fbRGBA[3]);
+         term1g = spu_mul(fragG, fbRGBA[3]);
+         term1b = spu_mul(fragB, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Src Alpha term
+       * Compute Src Alpha term (fragment alpha * factor)
        */
       switch (spu.blend.alpha_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -301,13 +321,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLENDFACTOR_SRC_ALPHA:
          term1a = spu_mul(fragA, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1a = spu_mul(fragA, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest RGB terms
+       * Compute Dest RGB terms (framebuffer color * factor)
        */
       switch (spu.blend.rgb_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -337,17 +367,37 @@ spu_fallback_fragment_ops(uint x, uint y,
          term2g = spu_mul(fbRGBA[1], tmp);
          term2b = spu_mul(fbRGBA[2], tmp);
          break;
-      /* XXX more cases */
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[0]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[1]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[2]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[3]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[3]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3]));
+         break;
+       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest Alpha term
+       * Compute Dest Alpha term (framebuffer alpha * factor)
        */
       switch (spu.blend.alpha_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
-         term2a = fragA;
+         term2a = fbRGBA[3];
          break;
       case PIPE_BLENDFACTOR_SRC_COLOR:
          term2a = spu_splats(0.0f);
@@ -360,6 +410,16 @@ spu_fallback_fragment_ops(uint x, uint y,
          tmp = spu_sub(one, fragA);
          term2a = spu_mul(fbRGBA[3], tmp);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
@@ -394,9 +454,7 @@ spu_fallback_fragment_ops(uint x, uint y,
          fragG = spu_max(term1g, term2g);
          fragB = spu_max(term1b, term2b);
          break;
-      /* XXX more cases */
       default:
-         printf("unsup 0x%x\n", spu.blend.rgb_func);
          ASSERT(0);
       }
 
@@ -419,7 +477,6 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLEND_MAX:
          fragA = spu_max(term1a, term2a);
          break;
-      /* XXX more cases */
       default:
          ASSERT(0);
       }
-- 
cgit v1.2.3


From ddeec1ed10d6c12403fe8d30c072ea68f044db99 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 13:55:18 -0600
Subject: cell: simplify spu debug code

---
 src/gallium/drivers/cell/common.h           |  1 +
 src/gallium/drivers/cell/ppu/cell_context.c |  1 +
 src/gallium/drivers/cell/spu/spu_command.c  | 47 +++++++++++++----------------
 src/gallium/drivers/cell/spu/spu_debug.h    |  9 ------
 src/gallium/drivers/cell/spu/spu_main.c     |  9 +-----
 src/gallium/drivers/cell/spu/spu_main.h     | 15 +++++++--
 src/gallium/drivers/cell/spu/spu_render.c   |  7 +++--
 7 files changed, 41 insertions(+), 48 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 3b5a25e165..8ae78265f2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -111,6 +111,7 @@
 #define CELL_DEBUG_SYNC                 (1 << 2)
 #define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
 #define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
+#define CELL_DEBUG_CMD                  (1 << 5)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index b66aa9c9d9..f8d5eef3ac 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -93,6 +93,7 @@ static const struct debug_named_value cell_debug_flags[] = {
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
    {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
    {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
+   {"cmd", CELL_DEBUG_CMD},       /**< SPUs dump command buffer info */
    {NULL, 0}
 };
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index b521c3aecf..ebbed3d1dc 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -44,7 +44,6 @@
 #include "spu_tile.h"
 #include "spu_vertex_shader.h"
 #include "spu_dcache.h"
-#include "spu_debug.h"
 #include "cell/common.h"
 
 
@@ -97,7 +96,7 @@ release_buffer(uint buffer)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
@@ -165,14 +164,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
 #endif /* CLEAR_OPT */
 
-   DEBUG_PRINTF("CLEAR SURF done\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n");
 }
 
 
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
-   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+   D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf);
    ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
@@ -189,7 +188,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
    static int warned = 0;
 
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
@@ -229,7 +228,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 {
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_program_code, fp->code,
           SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
@@ -247,11 +246,11 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
    const float *constants = (const float *) &buffer[pos + 2];
    uint i;
 
-   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
 
    /* Expand each float to float[4] for SOA execution */
    for (i = 0; i < num_const; i++) {
-      DEBUG_PRINTF("  const[%u] = %f\n", i, constants[i]);
+      D_PRINTF(CELL_DEBUG_CMD, "  const[%u] = %f\n", i, constants[i]);
       spu.constants[i] = spu_splats(constants[i]);
    }
 
@@ -263,7 +262,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
-   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+   D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
              cmd->width,
              cmd->height,
              cmd->color_start,
@@ -352,7 +351,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
    uint unit = sampler->unit;
 
-   DEBUG_PRINTF("SAMPLER [%u]\n", unit);
+   D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit);
 
    spu.sampler[unit] = sampler->state;
 
@@ -404,9 +403,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint unit = texture->unit;
    uint i;
 
-   //if (spu.init.id==0) Debug=1;
-
-   DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
+   D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit);
 
    spu.texture[unit].max_level = 0;
    spu.texture[unit].target = texture->target;
@@ -416,7 +413,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
       uint height = texture->height[i];
       uint depth = texture->depth[i];
 
-      DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
+      D_PRINTF(CELL_DEBUG_CMD, "  LEVEL %u: at %p  size[0] %u x %u\n", i,
              texture->start[i], texture->width[i], texture->height[i]);
 
       spu.texture[unit].level[i].start = texture->start[i];
@@ -438,15 +435,13 @@ cmd_state_texture(const struct cell_command_texture *texture)
    }
 
    update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
-
-   //Debug=0;
 }
 
 
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
    ASSERT(vinfo->num_attribs >= 1);
    ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
@@ -485,7 +480,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
 static void
 cmd_finish(void)
 {
-   DEBUG_PRINTF("FINISH\n");
+   D_PRINTF(CELL_DEBUG_CMD, "FINISH\n");
    really_clear_tiles(0);
    /* wait for all outstanding DMAs to finish */
    mfc_write_tag_mask(~0);
@@ -510,7 +505,7 @@ cmd_batch(uint opcode)
    const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
-   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
              buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
@@ -530,7 +525,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   DEBUG_PRINTF("release batch buf %u\n", buf);
+   D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf);
    release_buffer(buf);
 
    /*
@@ -663,7 +658,7 @@ cmd_batch(uint opcode)
       }
    }
 
-   DEBUG_PRINTF("BATCH complete\n");
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n");
 }
 
 
@@ -677,7 +672,7 @@ command_loop(void)
    struct cell_command cmd;
    int exitFlag = 0;
 
-   DEBUG_PRINTF("Enter command loop\n");
+   D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
    ASSERT((sizeof(struct cell_command) & 0xf) == 0);
    ASSERT_ALIGN16(&cmd);
@@ -686,12 +681,12 @@ command_loop(void)
       unsigned opcode;
       int tag = 0;
 
-      DEBUG_PRINTF("Wait for cmd...\n");
+      D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
 
-      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
+      D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
       /* command payload */
       mfc_get(&cmd,  /* dest */
@@ -708,7 +703,7 @@ command_loop(void)
 
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
-         DEBUG_PRINTF("EXIT\n");
+         D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
@@ -725,7 +720,7 @@ command_loop(void)
 
    }
 
-   DEBUG_PRINTF("Exit command loop\n");
+   D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
 
    spu_dcache_report();
 }
diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
index eeec052655..25653dcdcd 100644
--- a/src/gallium/drivers/cell/spu/spu_debug.h
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -30,28 +30,19 @@
 #define SPU_DEBUG_H
 
 
-/* Set to 0 to disable all extraneous debugging code */
-#define DEBUG 1
-
 #if DEBUG
-extern boolean Debug;
-extern boolean force_fragment_ops_fallback;
 
 /* These debug macros use the unusual construction ", ##__VA_ARGS__"
  * which expands to the expected comma + args if variadic arguments
  * are supplied, but swallows the comma if there are no variadic
  * arguments (which avoids syntax errors that would otherwise occur).
  */
-#define DEBUG_PRINTF(format,...) \
-   if (Debug) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
 #define D_PRINTF(flag, format,...) \
    if (spu.init.debug_flags & (flag)) \
       printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
 
 #else
 
-#define DEBUG_PRINTF(...)
 #define D_PRINTF(...)
 
 #endif
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 4becd0f92a..c8bb251905 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -40,7 +40,6 @@
 #include "spu_per_fragment_op.h"
 #include "spu_texture.h"
 //#include "spu_test.h"
-#include "spu_debug.h"
 #include "cell/common.h"
 
 
@@ -53,12 +52,6 @@ helpful headers:
 struct spu_global spu;
 
 
-#if DEBUG
-boolean Debug = FALSE;
-boolean force_fragment_ops_fallback = TRUE;
-#endif
-
-
 static void
 one_time_init(void)
 {
@@ -102,7 +95,7 @@ main(main_param_t speid, main_param_t argp)
 
    one_time_init();
 
-   DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
    /* get initialization data */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index ca72baea8b..569b9e45d4 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -36,6 +36,19 @@
 #include "pipe/p_state.h"
 
 
+#if DEBUG
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#else
+#define D_PRINTF(...)
+#endif
+
 
 #define MAX_WIDTH 1024
 #define MAX_HEIGHT 1024
@@ -187,8 +200,6 @@ struct spu_global
 
 
 extern struct spu_global spu;
-extern boolean Debug;
-
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 82dbeb26b7..cfff19b6c0 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -177,7 +177,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    uint i, j;
 
 
-   if (Debug) {
+#if 0
       printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
              "inline_vert=%u\n",
              spu.init.id,
@@ -190,7 +190,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
-   }
+#endif
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
@@ -293,7 +293,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-   if (Debug)
+#if 0
       printf("SPU %u: RENDER done\n",
              spu.init.id);
+#endif
 }
-- 
cgit v1.2.3


From 708f046c215d070e82f40eee895a8d312b1a64c7 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 13:56:00 -0600
Subject: cell: remove obsolete spu_debug.h file

---
 src/gallium/drivers/cell/spu/spu_debug.h | 51 --------------------------------
 1 file changed, 51 deletions(-)
 delete mode 100644 src/gallium/drivers/cell/spu/spu_debug.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
deleted file mode 100644
index 25653dcdcd..0000000000
--- a/src/gallium/drivers/cell/spu/spu_debug.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef SPU_DEBUG_H
-#define SPU_DEBUG_H
-
-
-#if DEBUG
-
-/* These debug macros use the unusual construction ", ##__VA_ARGS__"
- * which expands to the expected comma + args if variadic arguments
- * are supplied, but swallows the comma if there are no variadic
- * arguments (which avoids syntax errors that would otherwise occur).
- */
-#define D_PRINTF(flag, format,...) \
-   if (spu.init.debug_flags & (flag)) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
-
-#else
-
-#define D_PRINTF(...)
-
-#endif
-
-
-#endif /* SPU_DEBUG_H */
-- 
cgit v1.2.3


From 79e96b3a77f7d5c7136b380abcc675c7242d0ffe Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 13:58:58 -0600
Subject: cell: move some CELL_MAX constants

---
 src/gallium/drivers/cell/common.h       |  6 +++++-
 src/gallium/drivers/cell/spu/spu_main.h | 11 ++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 8ae78265f2..d716a26175 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -68,6 +68,9 @@
 
 #define CELL_MAX_SAMPLERS 4
 #define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+#define CELL_MAX_WIDTH 1024    /**< max framebuffer width */
+#define CELL_MAX_HEIGHT 1024   /**< max framebuffer width */
 
 #define TILE_SIZE 32
 
@@ -99,13 +102,14 @@
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
-
+/** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
 #define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */
 
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
 
+/** Debug flags */
 #define CELL_DEBUG_CHECKER              (1 << 0)
 #define CELL_DEBUG_ASM                  (1 << 1)
 #define CELL_DEBUG_SYNC                 (1 << 2)
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 569b9e45d4..f87495b72d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -50,13 +50,6 @@
 #endif
 
 
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
-
-
-#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
-
-
 /**
  * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
  * The data may be addressed through several different types.
@@ -175,8 +168,8 @@ struct spu_global
    ubyte cur_ctile_status, cur_ztile_status;
 
    /** Status of all tiles in framebuffer */
-   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
    uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-- 
cgit v1.2.3


From 0eb0b0a816764a323af7a8d2b5cb6792f886ce04 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:12:55 -0600
Subject: cell: remove some old, pre-batchbuffer stuff

---
 src/gallium/drivers/cell/common.h          | 14 --------------
 src/gallium/drivers/cell/ppu/cell_spu.c    |  5 +----
 src/gallium/drivers/cell/ppu/cell_spu.h    |  3 +--
 src/gallium/drivers/cell/spu/spu_command.c | 19 -------------------
 4 files changed, 2 insertions(+), 39 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d716a26175..600f1b37a2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -269,19 +269,6 @@ struct cell_command_texture
 };
 
 
-/** XXX unions don't seem to work */
-/* XXX this should go away; all commands should be placed in batch buffers */
-struct cell_command
-{
-#if 0
-   struct cell_command_framebuffer fb;
-   struct cell_command_clear_surface clear;
-   struct cell_command_render render;
-#endif
-   struct cell_command_vs vs;
-} ALIGN16_ATTRIB;
-
-
 #define MAX_SPU_FUNCTIONS 12
 /**
  * Used to tell the PPU about the address of particular functions in the
@@ -302,7 +289,6 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
-   struct cell_command *cmd;
 
    /** Buffers for command batches, vertex/index data */
    ubyte *buffers[CELL_NUM_BUFFERS];
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index df020c4146..90745da3d2 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -126,9 +126,6 @@ cell_start_spus(struct cell_context *cell)
 
    assert(cell->num_spus <= MAX_SPUS);
 
-   ASSERT_ALIGN16(&cell_global.command[0]);
-   ASSERT_ALIGN16(&cell_global.command[1]);
-
    ASSERT_ALIGN16(&cell_global.inits[0]);
    ASSERT_ALIGN16(&cell_global.inits[1]);
 
@@ -141,7 +138,7 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].debug_flags = cell->debug_flags;
-      cell_global.inits[i].cmd = &cell_global.command[i];
+
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
       }
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 137f26612e..3443331b01 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -50,10 +50,9 @@ struct cell_global_info
    pthread_t spe_threads[MAX_SPUS];
 
    /**
-    * Data sent to SPUs
+    * Data sent to SPUs at start-up
     */
    struct cell_init_info inits[MAX_SPUS];
-   struct cell_command command[MAX_SPUS];
 };
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ebbed3d1dc..4febd5385b 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -669,38 +669,19 @@ cmd_batch(uint opcode)
 void
 command_loop(void)
 {
-   struct cell_command cmd;
    int exitFlag = 0;
 
    D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
-   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
-   ASSERT_ALIGN16(&cmd);
-
    while (!exitFlag) {
       unsigned opcode;
-      int tag = 0;
 
       D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
-
       D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
-      /* command payload */
-      mfc_get(&cmd,  /* dest */
-              (unsigned int) spu.init.cmd, /* src */
-              sizeof(struct cell_command), /* bytes */
-              tag,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask( 1 << tag );
-
-      /*
-       * NOTE: most commands should be contained in a batch buffer
-       */
-
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
-- 
cgit v1.2.3


From ec7d6c656178babdf143faa242f7a3df9d0bc22c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:39:16 -0600
Subject: cell: send rasterizer state to SPUs in proper way, remove
 front_winding hack

---
 src/gallium/drivers/cell/common.h              | 18 ++++++++++++++----
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  7 +++++++
 src/gallium/drivers/cell/ppu/cell_vbuf.c       |  1 -
 src/gallium/drivers/cell/spu/spu_command.c     |  8 ++++++++
 src/gallium/drivers/cell/spu/spu_main.h        |  1 +
 src/gallium/drivers/cell/spu/spu_render.c      |  2 +-
 src/gallium/drivers/cell/spu/spu_tri.c         |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.h         |  2 +-
 8 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 1f6f2d494b..0ff2c491fb 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -99,8 +99,9 @@
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
 #define CELL_CMD_STATE_FS_CONSTANTS  21
-#define CELL_CMD_VS_EXECUTE          22
-#define CELL_CMD_FLUSH_BUFFER_RANGE  23
+#define CELL_CMD_STATE_RASTERIZER    22
+#define CELL_CMD_VS_EXECUTE          23
+#define CELL_CMD_FLUSH_BUFFER_RANGE  24
 
 /** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
@@ -156,13 +157,23 @@ struct cell_command_fragment_program
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
+   uint64_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
 };
 
 
+/**
+ * Tell SPUs about rasterizer state.
+ */
+struct cell_command_rasterizer
+{
+   uint64_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
+   struct pipe_rasterizer_state rasterizer;
+};
+
+
 /**
  * Clear framebuffer to the given value/color.
  */
@@ -238,7 +249,6 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
-   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index d2427584ba..e6387382f2 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -147,6 +147,13 @@ cell_emit_state(struct cell_context *cell)
 #endif
    }
 
+   if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+      struct cell_command_rasterizer *rast =
+         cell_batch_alloc(cell, sizeof(*rast));
+      rast->opcode = CELL_CMD_STATE_RASTERIZER;
+      rast->rasterizer = *cell->rasterizer;
+   }
+
    if (cell->dirty & (CELL_NEW_FS)) {
       /* Send new fragment program to SPUs */
       struct cell_command_fragment_program *fp
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index 578ddf62dc..aa63435b93 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,7 +214,6 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
-      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 4febd5385b..d2c282a022 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -583,6 +583,14 @@ cmd_batch(uint opcode)
       case CELL_CMD_STATE_FS_CONSTANTS:
          pos = cmd_state_fs_constants(buffer, pos);
          break;
+      case CELL_CMD_STATE_RASTERIZER:
+         {
+            struct cell_command_rasterizer *rast =
+               (struct cell_command_rasterizer *) &buffer[pos];
+            spu.rasterizer = rast->rasterizer;
+            pos += sizeof(*rast) / 8;
+         }
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index f87495b72d..4099e52699 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -153,6 +153,7 @@ struct spu_global
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_rasterizer_state rasterizer;
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
    struct vertex_info vertex_info;
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index cfff19b6c0..75a7f75abc 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
+         drawn += tri_draw(v0, v1, v2, tx, ty);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 2417db8960..1519b8cd7e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -775,7 +775,7 @@ determinant(const float *v0, const float *v1, const float *v2)
  */
 boolean
 tri_draw(const float *v0, const float *v1, const float *v2,
-         uint tx, uint ty, uint front_winding)
+         uint tx, uint ty)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -790,7 +790,7 @@ tri_draw(const float *v0, const float *v1, const float *v2,
     * which will be needed for front/back-face stencil application
     */
    float det = determinant(v0, v1, v2);
-   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+   setup.facing = (det > 0.0) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index abc3d35160..aa694dd7c9 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
 
 
 #endif /* SPU_TRI_H */
-- 
cgit v1.2.3


From 30d3b581124a9fa5fbc7aa8404f717c5c2a6ab15 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 15:20:09 -0600
Subject: cell: simplify triangle front/back face determination

---
 src/gallium/drivers/cell/spu/spu_tri.c | 69 ++++++++++++----------------------
 1 file changed, 23 insertions(+), 46 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 1519b8cd7e..bd7547353d 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -116,7 +116,7 @@ struct setup_stage {
    struct edge etop;
    struct edge emaj;
 
-   float oneOverArea;
+   float oneOverArea;  /* XXX maybe make into vector? */
 
    uint facing;
 
@@ -417,11 +417,19 @@ print_vertex(const struct vertex_header *v)
 #endif
 
 
+/**
+ * Sort vertices from top to bottom.
+ * Compute area and determine front vs. back facing.
+ * Do coarse clip test against tile bounds
+ * \return  FALSE if tri is totally outside tile, TRUE otherwise
+ */
 static boolean
 setup_sort_vertices(const struct vertex_header *v0,
                     const struct vertex_header *v1,
                     const struct vertex_header *v2)
 {
+   float area, sign;
+
 #if DEBUG_VERTS
    if (spu.init.id==0) {
       fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
@@ -431,8 +439,6 @@ setup_sort_vertices(const struct vertex_header *v0,
    }
 #endif
 
-   setup.vprovoke = v2;
-
    /* determine bottom to top order of vertices */
    {
       float y0 = spu_extract(v0->data[0], 1);
@@ -444,18 +450,21 @@ setup_sort_vertices(const struct vertex_header *v0,
 	    setup.vmin = v0;   
 	    setup.vmid = v1;   
 	    setup.vmax = v2;
+            sign = -1.0f;
 	 }
 	 else if (y2 <= y0) {
 	    /* y2<=y0<=y1 */
 	    setup.vmin = v2;   
 	    setup.vmid = v0;   
 	    setup.vmax = v1;   
+            sign = -1.0f;
 	 }
 	 else {
 	    /* y0<=y2<=y1 */
 	    setup.vmin = v0;   
 	    setup.vmid = v2;   
 	    setup.vmax = v1;  
+            sign = 1.0f;
 	 }
       }
       else {
@@ -464,18 +473,21 @@ setup_sort_vertices(const struct vertex_header *v0,
 	    setup.vmin = v1;   
 	    setup.vmid = v0;   
 	    setup.vmax = v2;  
+            sign = 1.0f;
 	 }
 	 else if (y2 <= y1) {
 	    /* y2<=y1<=y0 */
 	    setup.vmin = v2;   
 	    setup.vmid = v1;   
 	    setup.vmax = v0;  
+            sign = 1.0f;
 	 }
 	 else {
 	    /* y1<=y2<=y0 */
 	    setup.vmin = v1;   
 	    setup.vmid = v2;   
 	    setup.vmax = v0;
+            sign = -1.0f;
 	 }
       }
    }
@@ -504,31 +516,16 @@ setup_sort_vertices(const struct vertex_header *v0,
    /*
     * Compute triangle's area.  Use 1/area to compute partial
     * derivatives of attributes later.
-    *
-    * The area will be the same as prim->det, but the sign may be
-    * different depending on how the vertices get sorted above.
-    *
-    * To determine whether the primitive is front or back facing we
-    * use the prim->det value because its sign is correct.
     */
-   {
-      const float area = (setup.emaj.dx * setup.ebot.dy -
-                          setup.ebot.dx * setup.emaj.dy);
-
-      setup.oneOverArea = 1.0f / area;
-      /*
-      _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup.oneOverArea, area, prim->det );
-      */
-   }
+   area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
 
-#if 0
-   /* We need to know if this is a front or back-facing triangle for:
-    *  - the GLSL gl_FrontFacing fragment attribute (bool)
-    *  - two-sided stencil test
-    */
-   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
-#endif
+   setup.oneOverArea = 1.0f / area;
+
+   /* The product of area * sign indicates front/back orientation (0/1) */
+   setup.facing = (area * sign > 0.0f)
+      ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
+
+   setup.vprovoke = v2;
 
    return TRUE;
 }
@@ -755,20 +752,6 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 }
 
 
-static float
-determinant(const float *v0, const float *v1, const float *v2)
-{
-   /* edge vectors e = v0 - v2, f = v1 - v2 */
-   const float ex = v0[0] - v2[0];
-   const float ey = v0[1] - v2[1];
-   const float fx = v1[0] - v2[0];
-   const float fy = v1[1] - v2[1];
-
-   /* det = cross(e,f).z */
-   return ex * fy - ey * fx;
-}
-
-
 /**
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
@@ -786,12 +769,6 @@ tri_draw(const float *v0, const float *v1, const float *v2,
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
-   /* Before we sort vertices, determine the facing of the triangle,
-    * which will be needed for front/back-face stencil application
-    */
-   float det = determinant(v0, v1, v2);
-   setup.facing = (det > 0.0) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
-
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
-- 
cgit v1.2.3


From 224c19a758466cdfb821e1a40db4928311278e90 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 15:34:02 -0600
Subject: cell: get rid of last usage of float4 union/typedef

Results in slightly tighter code.
---
 src/gallium/drivers/cell/spu/spu_tri.c | 63 ++++++++++++++++------------------
 1 file changed, 29 insertions(+), 34 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index bd7547353d..d83085d0f9 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -43,11 +43,6 @@
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
 
-typedef union
-{
-   vector float v;
-   float f[4];
-} float4;
 
 
 /**
@@ -91,9 +86,9 @@ struct edge {
 
 struct interp_coef
 {
-   float4 a0;
-   float4 dadx;
-   float4 dady;
+   vector float a0;
+   vector float dadx;
+   vector float dady;
 };
 
 
@@ -152,14 +147,14 @@ eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
       result[QUAD_TOP_LEFT] =
       result[QUAD_TOP_RIGHT] =
       result[QUAD_BOTTOM_LEFT] =
-      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
       break;
    case INTERP_LINEAR:
       {
-         vector float dadx = setup.coef[slot].dadx.v;
-         vector float dady = setup.coef[slot].dady.v;
+         vector float dadx = setup.coef[slot].dadx;
+         vector float dady = setup.coef[slot].dady;
          vector float topLeft =
-            spu_add(setup.coef[slot].a0.v,
+            spu_add(setup.coef[slot].a0,
                     spu_add(spu_mul(spu_splats(x), dadx),
                             spu_mul(spu_splats(y), dady)));
 
@@ -171,10 +166,10 @@ eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
       break;
    case INTERP_PERSPECTIVE:
       {
-         vector float dadx = setup.coef[slot].dadx.v;
-         vector float dady = setup.coef[slot].dady.v;
+         vector float dadx = setup.coef[slot].dadx;
+         vector float dady = setup.coef[slot].dady;
          vector float topLeft =
-            spu_add(setup.coef[slot].a0.v,
+            spu_add(setup.coef[slot].a0,
                     spu_add(spu_mul(spu_splats(x), dadx),
                             spu_mul(spu_splats(y), dady)));
 
@@ -212,9 +207,9 @@ static INLINE vector float
 eval_z(float x, float y)
 {
    const uint slot = 0;
-   const float dzdx = setup.coef[slot].dadx.f[2];
-   const float dzdy = setup.coef[slot].dady.f[2];
-   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+   const float dzdx = spu_extract(setup.coef[slot].dadx, 2);
+   const float dzdy = spu_extract(setup.coef[slot].dady, 2);
+   const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy;
    const vector float topLeftv = spu_splats(topLeft);
    const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
    return spu_add(topLeftv, derivs);
@@ -226,9 +221,9 @@ static INLINE vector float
 eval_w(float x, float y)
 {
    const uint slot = 0;
-   const float dwdx = setup.coef[slot].dadx.f[3];
-   const float dwdy = setup.coef[slot].dady.f[3];
-   const float topLeft = setup.coef[slot].a0.f[3] + x * dwdx + y * dwdy;
+   const float dwdx = spu_extract(setup.coef[slot].dadx, 3);
+   const float dwdy = spu_extract(setup.coef[slot].dady, 3);
+   const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy;
    const vector float topLeftv = spu_splats(topLeft);
    const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
    return spu_add(topLeftv, derivs);
@@ -540,9 +535,9 @@ setup_sort_vertices(const struct vertex_header *v0,
 static INLINE void
 const_coeff4(uint slot)
 {
-   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
+   setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0 = setup.vprovoke->data[slot];
 }
 
 
@@ -566,13 +561,13 @@ tri_linear_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 
-   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
-   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
                          
-   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 }
 
 
@@ -610,13 +605,13 @@ tri_persp_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 
-   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
-   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
                          
-   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 }
 
 
-- 
cgit v1.2.3


From 1c915b14a545ffb10cc1c98cc69f997b6471617f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 19:40:51 -0600
Subject: cell: updated debug code

---
 src/gallium/drivers/cell/spu/spu_render.c | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 75a7f75abc..802455bf79 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -176,21 +176,12 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    const ushort *indexes;
    uint i, j;
 
-
-#if 0
-      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u\n",
-             spu.init.id,
-             render->prim_type,
-             render->num_verts,
-             render->num_indexes,
-             render->inline_verts);
-
-      /*
-      printf("       bound: %g, %g .. %g, %g\n",
-             render->xmin, render->ymin, render->xmax, render->ymax);
-      */
-#endif
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
+            render->prim_type,
+            render->num_verts,
+            render->num_indexes,
+            render->inline_verts);
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
@@ -293,8 +284,5 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-#if 0
-      printf("SPU %u: RENDER done\n",
-             spu.init.id);
-#endif
+   D_PRINTF(CELL_DEBUG_CMD, "RENDER done\n");
 }
-- 
cgit v1.2.3


From 0116ee1d1c341726b6ed23c2dddc4515e8a34385 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 20:46:43 -0600
Subject: cell: start some performance measurements

Use the spu_write_decrementer() and spu_read_decrementer() functions to
measure time.  Convert to milliseconds according to the system timebase value.
---
 src/gallium/drivers/cell/common.h          |  1 +
 src/gallium/drivers/cell/ppu/cell_spu.c    | 31 ++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_command.c | 15 +++++++++++++++
 src/gallium/drivers/cell/spu/spu_render.c  |  9 ++++++++-
 4 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 0ff2c491fb..469d56cda8 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -299,6 +299,7 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
+   float inv_timebase;    /**< 1.0/timebase, for perf measurement */
 
    /** Buffers for command batches, vertex/index data */
    ubyte *buffers[CELL_NUM_BUFFERS];
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index a6e268b362..28e5e6d706 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -52,6 +52,35 @@ helpful headers:
 struct cell_global_info cell_global;
 
 
+/**
+ * Scan /proc/cpuinfo to determine the timebase for the system.
+ * This is used by the SPUs to convert 'decrementer' ticks to seconds.
+ * There may be a better way to get this value...
+ */
+static unsigned
+get_timebase(void)
+{
+   FILE *f = fopen("/proc/cpuinfo", "r");
+   unsigned timebase;
+
+   assert(f);
+   while (!feof(f)) {
+      char line[80];
+      fgets(line, sizeof(line), f);
+      if (strncmp(line, "timebase", 8) == 0) {
+         char *colon = strchr(line, ':');
+         if (colon) {
+            timebase = atoi(colon + 2);
+            break;
+         }
+      }
+   }
+   fclose(f);
+
+   return timebase;
+}
+
+
 /**
  * Write a 1-word message to the given SPE mailbox.
  */
@@ -115,6 +144,7 @@ cell_start_spus(struct cell_context *cell)
 {
    static boolean one_time_init = FALSE;
    uint i, j;
+   uint timebase = get_timebase();
 
    if (one_time_init) {
       fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
@@ -138,6 +168,7 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].debug_flags = cell->debug_flags;
+      cell_global.inits[i].inv_timebase = 1000.0f / timebase;
 
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d2c282a022..57d265fef7 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -670,6 +670,8 @@ cmd_batch(uint opcode)
 }
 
 
+#define PERF 0
+
 
 /**
  * Main loop for SPEs: Get a command, execute it, repeat.
@@ -678,6 +680,7 @@ void
 command_loop(void)
 {
    int exitFlag = 0;
+   uint t0, t1;
 
    D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
@@ -686,10 +689,16 @@ command_loop(void)
 
       D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
+      if (PERF)
+         spu_write_decrementer(~0);
+
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
       D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
+      if (PERF)
+         t0 = spu_read_decrementer();
+
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
@@ -707,6 +716,12 @@ command_loop(void)
          printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
       }
 
+      if (PERF) {
+         t1 = spu_read_decrementer();
+         printf("wait mbox time: %gms   batch time: %gms\n",
+                (~0u - t0) * spu.init.inv_timebase,
+                (t0 - t1) * spu.init.inv_timebase);
+      }
    }
 
    D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 802455bf79..5515bb55c9 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -175,6 +175,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
+   uint num_tiles;
 
    D_PRINTF(CELL_DEBUG_CMD,
             "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
@@ -242,6 +243,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
 
 
+   num_tiles = 0;
+
    /**
     ** loop over tiles, rendering tris
     **/
@@ -255,6 +258,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
+      num_tiles++;
+
       spu.cur_ctile_status = spu.ctile_status[ty][tx];
       spu.cur_ztile_status = spu.ztile_status[ty][tx];
 
@@ -284,5 +289,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-   D_PRINTF(CELL_DEBUG_CMD, "RENDER done\n");
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER done (%u tiles hit)\n",
+            num_tiles);
 }
-- 
cgit v1.2.3


From 926b8dbb3e86360e5968882df94785ae84d0ad43 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 09:00:05 -0600
Subject: cell: clean up various texture-related things

Distinguish among texture targets in codegen.
progs/demos/cubemap.c runs correctly now too.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 29 ++++++++++++++---
 src/gallium/drivers/cell/spu/spu_command.c | 24 ++++++--------
 src/gallium/drivers/cell/spu/spu_funcs.c   | 34 +++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_main.h    | 16 +++++-----
 src/gallium/drivers/cell/spu/spu_texture.c | 50 ++++++++++++++----------------
 src/gallium/drivers/cell/spu/spu_texture.h | 34 +++++++++-----------
 6 files changed, 107 insertions(+), 80 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3dfd5f673d..2b34cf1e23 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1337,16 +1337,33 @@ emit_function_call(struct codegen *gen,
 
 
 static boolean
-emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   const uint addr = lookup_function(gen->cell, "spu_txp");
+   const uint target = inst->InstructionExtTexture.Texture;
    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   uint addr;
    int ch;
    int coord_regs[4], d_regs[4];
 
+   switch (target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_2D:
+      addr = lookup_function(gen->cell, "spu_tex_2d");
+      break;
+   case TGSI_TEXTURE_3D:
+      addr = lookup_function(gen->cell, "spu_tex_3d");
+      break;
+   case TGSI_TEXTURE_CUBE:
+      addr = lookup_function(gen->cell, "spu_tex_cube");
+      break;
+   default:
+      ASSERT(0 && "unsupported texture target");
+      return FALSE;
+   }
+
    assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
 
-   spe_comment(gen->f, -4, "CALL txp:");
+   spe_comment(gen->f, -4, "CALL tex:");
 
    /* get src/dst reg info */
    for (ch = 0; ch < 4; ch++) {
@@ -1368,7 +1385,7 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
          spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
       }
 
-      /* setup function arguments */
+      /* setup function arguments (XXX depends on target) */
       for (i = 0; i < 4; i++) {
          spe_move(gen->f, 3 + i, coord_regs[i]);
       }
@@ -1674,8 +1691,10 @@ emit_instruction(struct codegen *gen,
       /* fall-through for now */
    case TGSI_OPCODE_TXB:
       /* fall-through for now */
+   case TGSI_OPCODE_TXL:
+      /* fall-through for now */
    case TGSI_OPCODE_TXP:
-      return emit_TXP(gen, inst);
+      return emit_TEX(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 57d265fef7..ff4a52d79a 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -310,8 +310,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
  */
 static void
 update_tex_masks(struct spu_texture *texture,
-                 const struct pipe_sampler_state *sampler,
-                 uint unit)
+                 const struct pipe_sampler_state *sampler)
 {
    uint i;
 
@@ -338,11 +337,6 @@ update_tex_masks(struct spu_texture *texture,
          texture->level[i].scale_t = spu_splats(1.0f);
       }
    }
-
-   /* XXX temporary hack */
-   if (texture->target == PIPE_TEXTURE_CUBE) {
-      spu.sample_texture4[unit] = sample_texture4_cube;
-   }
 }
 
 
@@ -357,12 +351,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    switch (spu.sampler[unit].min_img_filter) {
    case PIPE_TEX_FILTER_LINEAR:
-      spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
    case PIPE_TEX_FILTER_ANISO:
       /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
-      spu.min_sample_texture4[unit] = sample_texture4_nearest;
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
    default:
       ASSERT(0);
@@ -370,12 +364,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    switch (spu.sampler[sampler->unit].mag_img_filter) {
    case PIPE_TEX_FILTER_LINEAR:
-      spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
    case PIPE_TEX_FILTER_ANISO:
       /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
-      spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
    default:
       ASSERT(0);
@@ -384,16 +378,16 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    switch (spu.sampler[sampler->unit].min_mip_filter) {
    case PIPE_TEX_MIPFILTER_NEAREST:
    case PIPE_TEX_MIPFILTER_LINEAR:
-      spu.sample_texture4[unit] = sample_texture4_lod;
+      spu.sample_texture_2d[unit] = sample_texture_2d_lod;
       break;
    case PIPE_TEX_MIPFILTER_NONE:
-      spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+      spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit];
       break;
    default:
       ASSERT(0);
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
@@ -434,7 +428,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
          spu.texture[unit].max_level = i;
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 5c3ee305d4..3534b35000 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -43,6 +43,7 @@
 #include "cell/common.h"
 #include "spu_main.h"
 #include "spu_funcs.h"
+#include "spu_texture.h"
 
 
 /** For "return"-ing four vectors */
@@ -102,11 +103,34 @@ spu_log2(vector float x)
 
 
 static struct vec_4x4
-spu_txp(vector float s, vector float t, vector float r, vector float q,
-        unsigned unit)
+spu_tex_2d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
+   (void) r;
+   (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_3d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) r;
+   (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_cube(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) q;
+   sample_texture_cube(s, t, r, unit, colors.v);
    return colors;
 }
 
@@ -147,7 +171,9 @@ return_function_info(void)
    export_func(&funcs, "spu_pow", &spu_pow);
    export_func(&funcs, "spu_exp2", &spu_exp2);
    export_func(&funcs, "spu_log2", &spu_log2);
-   export_func(&funcs, "spu_txp", &spu_txp);
+   export_func(&funcs, "spu_tex_2d", &spu_tex_2d);
+   export_func(&funcs, "spu_tex_3d", &spu_tex_3d);
+   export_func(&funcs, "spu_tex_cube", &spu_tex_cube);
 
    /* Send the function info back to the PPU / main memory */
    mfc_put((void *) &funcs,  /* src in local store */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 4099e52699..80e9c696f8 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -70,12 +70,10 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef void (*spu_sample_texture4_func)(vector float s,
-                                         vector float t,
-                                         vector float r,
-                                         vector float q,
-                                         uint unit, uint level, uint face,
-                                         vector float colors[4]);
+typedef void (*spu_sample_texture_2d_func)(vector float s,
+                                           vector float t,
+                                           uint unit, uint level, uint face,
+                                           vector float colors[4]);
 
 
 /** Function for performing per-fragment ops */
@@ -183,9 +181,9 @@ struct spu_global
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
-   spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
-   spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 42eb06a362..04202a7657 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -126,10 +126,9 @@ spu_clamp(vector signed int vec, vector signed int max)
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
-sample_texture4_nearest(vector float s, vector float t,
-                        vector float r, vector float q,
-                        uint unit, uint level, uint face,
-                        vector float colors[4])
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    vector float ss = spu_mul(s, tlevel->scale_s);
@@ -158,10 +157,9 @@ sample_texture4_nearest(vector float s, vector float t,
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
-sample_texture4_bilinear(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, uint level, uint face,
-                         vector float colors[4])
+sample_texture_2d_bilinear(vector float s, vector float t,
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -308,10 +306,9 @@ transpose(vector unsigned int *mOut0,
  * Bilinear filtering, using int intead of float arithmetic
  */
 void
-sample_texture4_bilinear_2(vector float s, vector float t,
-                           vector float r, vector float q,
-                           uint unit, uint level, uint face,
-                           vector float colors[4])
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -444,10 +441,9 @@ compute_lambda(uint unit, vector float s, vector float t)
  * Texture sampling with level of detail selection.
  */
 void
-sample_texture4_lod(vector float s, vector float t,
-                    vector float r, vector float q,
-                    uint unit, uint level_ignored, uint face,
-                    vector float colors[4])
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level_ignored, uint face,
+                      vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -455,6 +451,9 @@ sample_texture4_lod(vector float s, vector float t,
     */
    float lambda = compute_lambda(unit, s, t);
 
+   (void) face;
+   (void) level_ignored;
+
    /* apply lod bias */
    lambda += spu.sampler[unit].lod_bias;
 
@@ -466,14 +465,14 @@ sample_texture4_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
+      spu.mag_sample_texture_2d[unit](s, t, unit, 0, 0, colors);
    }
    else {
       /* minify */
       int level = (int) (lambda + 0.5f);
       if (level > (int) spu.texture[unit].max_level)
          level = spu.texture[unit].max_level;
-      spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
+      spu.min_sample_texture_2d[unit](s, t, unit, level, 0, colors);
       /* XXX to do: mipmap level interpolation */
    }
 }
@@ -552,13 +551,10 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
 
 
 void
-sample_texture4_cube(vector float s, vector float t,
-                     vector float r, vector float q,
-                     uint unit, uint level, uint face_ignored,
-                     vector float colors[4])
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4])
 {
-   static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
-   uint p, faces[4];
+   uint p, faces[4], level = 0;
    float newS[4], newT[4];
 
    /* Compute cube face referenced by the four sets of texcoords.
@@ -577,15 +573,15 @@ sample_texture4_cube(vector float s, vector float t,
       /* GOOD!  All four texcoords refer to the same cube face */
       s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
       t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
-      sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+      sample_texture_2d_nearest(s, t, unit, level, faces[0], colors);
    }
    else {
       /* BAD!  The four texcoords refer to different faces */
       for (p = 0; p < 4; p++) {      
          vector float c[4];
 
-         sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
-                                 zero, zero, unit, level, faces[p], c);
+         sample_texture_2d_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+                                   unit, level, faces[p], c);
 
          float red = spu_extract(c[0], p);
          float green = spu_extract(c[1], p);
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 387484c3ad..7b75b007b5 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -37,37 +37,31 @@ invalidate_tex_cache(void);
 
 
 extern void
-sample_texture4_nearest(vector float s, vector float t,
-                        vector float r, vector float q,
-                        uint unit, uint level, uint face,
-                        vector float colors[4]);
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4]);
 
 
 extern void
-sample_texture4_bilinear(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, uint level, uint face,
-                         vector float colors[4]);
-
-extern void
-sample_texture4_bilinear_2(vector float s, vector float t,
-                           vector float r, vector float q,
+sample_texture_2d_bilinear(vector float s, vector float t,
                            uint unit, uint level, uint face,
                            vector float colors[4]);
 
+extern void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4]);
+
 
 extern void
-sample_texture4_lod(vector float s, vector float t,
-                    vector float r, vector float q,
-                    uint unit, uint level, uint face,
-                    vector float colors[4]);
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level, uint face,
+                      vector float colors[4]);
 
 
 extern void
-sample_texture4_cube(vector float s, vector float t,
-                     vector float r, vector float q,
-                     uint unit, uint level_ignored, uint face_ignored,
-                     vector float colors[4]);
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 1da8f9b005a197214532e124c764a4e04e835519 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 09:33:45 -0600
Subject: cell: call proper sampler function in sample_texture_cube()

---
 src/gallium/drivers/cell/spu/spu_texture.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 04202a7657..b2d5d4aef8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -557,7 +557,7 @@ sample_texture_cube(vector float s, vector float t, vector float r,
    uint p, faces[4], level = 0;
    float newS[4], newT[4];
 
-   /* Compute cube face referenced by the four sets of texcoords.
+   /* Compute cube faces referenced by the four sets of texcoords.
     * XXX we should SIMD-ize this.
     */
    for (p = 0; p < 4; p++) {      
@@ -573,15 +573,15 @@ sample_texture_cube(vector float s, vector float t, vector float r,
       /* GOOD!  All four texcoords refer to the same cube face */
       s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
       t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
-      sample_texture_2d_nearest(s, t, unit, level, faces[0], colors);
+      spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors);
    }
    else {
       /* BAD!  The four texcoords refer to different faces */
       for (p = 0; p < 4; p++) {      
          vector float c[4];
 
-         sample_texture_2d_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
-                                   unit, level, faces[p], c);
+         spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]),
+                                     unit, level, faces[p], c);
 
          float red = spu_extract(c[0], p);
          float green = spu_extract(c[1], p);
-- 
cgit v1.2.3


From f0c70f9aabcb8e7c57c71eac2bd4dc86a2f86a0e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 09:52:02 -0600
Subject: cell: update comments

---
 src/gallium/drivers/cell/spu/spu_texture.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index b2d5d4aef8..19c17c9118 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -193,10 +193,6 @@ sample_texture_2d_bilinear(vector float s, vector float t,
    get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
    get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
-   /* XXX possibly rework following code to compute the weighted sample
-    * colors with integer arithmetic for fewer int->float conversions.
-    */
-
    /* convert packed int texels to float colors */
    vector float ftexels[16];
    spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
@@ -303,7 +299,8 @@ transpose(vector unsigned int *mOut0,
 
 
 /**
- * Bilinear filtering, using int intead of float arithmetic
+ * Bilinear filtering, using int instead of float arithmetic for computing
+ * sample weights.
  */
 void
 sample_texture_2d_bilinear_int(vector float s, vector float t,
-- 
cgit v1.2.3


From 5191429b15a3e7a7ef7cda499de8074c2c0df94f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 11:19:22 -0600
Subject: cell: trilinear mipmap interpolation

---
 src/gallium/drivers/cell/spu/spu_texture.c | 55 +++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 19c17c9118..e3d9a49dc4 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -415,7 +415,7 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
  * Compute level of detail factor from texcoords.
  */
 static float
-compute_lambda(uint unit, vector float s, vector float t)
+compute_lambda_2d(uint unit, vector float s, vector float t)
 {
    uint baseLevel = 0;
    float width = spu.texture[unit].level[baseLevel].width;
@@ -433,9 +433,27 @@ compute_lambda(uint unit, vector float s, vector float t)
 }
 
 
+/**
+ * Blend two sets of colors according to weight.
+ */
+static void
+blend_colors(vector float c0[4], const vector float c1[4], float weight)
+{
+   vector float t = spu_splats(weight);
+   vector float dc0 = spu_sub(c1[0], c0[0]);
+   vector float dc1 = spu_sub(c1[1], c0[1]);
+   vector float dc2 = spu_sub(c1[2], c0[2]);
+   vector float dc3 = spu_sub(c1[3], c0[3]);
+   c0[0] = spu_madd(dc0, t, c0[0]);
+   c0[1] = spu_madd(dc1, t, c0[1]);
+   c0[2] = spu_madd(dc2, t, c0[2]);
+   c0[3] = spu_madd(dc3, t, c0[3]);
+}
+
 
 /**
- * Texture sampling with level of detail selection.
+ * Texture sampling with level of detail selection and possibly mipmap
+ * interpolation.
  */
 void
 sample_texture_2d_lod(vector float s, vector float t,
@@ -446,7 +464,7 @@ sample_texture_2d_lod(vector float s, vector float t,
     * Note that we're computing a lambda/lod here that's used for all
     * four pixels in the quad.
     */
-   float lambda = compute_lambda(unit, s, t);
+   float lambda = compute_lambda_2d(unit, s, t);
 
    (void) face;
    (void) level_ignored;
@@ -462,15 +480,34 @@ sample_texture_2d_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture_2d[unit](s, t, unit, 0, 0, colors);
+      spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors);
    }
    else {
       /* minify */
-      int level = (int) (lambda + 0.5f);
-      if (level > (int) spu.texture[unit].max_level)
-         level = spu.texture[unit].max_level;
-      spu.min_sample_texture_2d[unit](s, t, unit, level, 0, colors);
-      /* XXX to do: mipmap level interpolation */
+      if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+         /* sample two mipmap levels and interpolate */
+         int level = (int) lambda;
+         if (level > (int) spu.texture[unit].max_level)
+            level = spu.texture[unit].max_level;
+         spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+         if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+            /* sample second mipmap level */
+            float weight = lambda - (float) level;
+            level++;
+            if (level <= (int) spu.texture[unit].max_level) {
+               vector float colors2[4];
+               spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2);
+               blend_colors(colors, colors2, weight);
+            }
+         }
+      }
+      else {
+         /* sample one mipmap level */
+         int level = (int) (lambda + 0.5f);
+         if (level > (int) spu.texture[unit].max_level)
+            level = spu.texture[unit].max_level;
+         spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+      }
    }
 }
 
-- 
cgit v1.2.3


From 033c90f4c16c1da517d676282508208319bd5ec5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 13:49:42 -0600
Subject: cell: implement KIL instruction

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 80 ++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_main.h    |  6 +--
 src/gallium/drivers/cell/spu/spu_tri.c     |  5 +-
 3 files changed, 87 insertions(+), 4 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 2b34cf1e23..493ee1a0c9 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -84,6 +84,9 @@ struct codegen
    /** Index of execution mask register */
    int exec_mask_reg;
 
+   /** KIL mask: indicates which fragments have been killed */
+   int kill_mask_reg;
+
    int frame_size;  /**< Stack frame size, in words */
 
    struct spe_function *f;
@@ -431,8 +434,21 @@ emit_prologue(struct codegen *gen)
 static void
 emit_epilogue(struct codegen *gen)
 {
+   const int return_reg = 3;
+
    spe_comment(gen->f, -4, "Function epilogue:");
 
+   spe_comment(gen->f, 0, "return the killed mask");
+   if (gen->kill_mask_reg > 0) {
+      /* shader called KIL, return the "alive" mask */
+      spe_move(gen->f, return_reg, gen->kill_mask_reg);
+   }
+   else {
+      /* return {0,0,0,0} */
+      spe_load_uint(gen->f, return_reg, 0);
+   }
+
+   spe_comment(gen->f, 0, "restore stack and return");
    if (gen->frame_size >= 512) {
       /* offset is too large for ai instruction */
       int offset_reg = spe_allocate_available_register(gen->f);
@@ -1423,6 +1439,68 @@ emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+/**
+ * KILL if any of src reg values are less than zero.
+ */
+static boolean
+emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int s_regs[4], kil_reg = -1, cmp_reg, zero_reg;
+
+   spe_comment(gen->f, -4, "CALL kil:");
+
+   /* zero = {0,0,0,0} */
+   zero_reg = get_itemp(gen);
+   spe_load_uint(gen->f, zero_reg, 0);
+
+   cmp_reg = get_itemp(gen);
+
+   /* get src regs */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      }
+   }
+
+   /* test if any src regs are < 0 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         if (kil_reg >= 0) {
+            /* cmp = 0 > src ? : ~0 : 0 */
+            spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]);
+            /* kil = kil | cmp */
+            spe_or(gen->f, kil_reg, kil_reg, cmp_reg);
+         }
+         else {
+            kil_reg = get_itemp(gen);
+            /* kil = 0 > src ? : ~0 : 0 */
+            spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]);
+         }
+      }
+   }
+
+   if (gen->if_nesting) {
+      /* may have been a conditional kil */
+      spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg);
+   }
+
+   /* allocate the kill mask reg if needed */
+   if (gen->kill_mask_reg <= 0) {
+      gen->kill_mask_reg = spe_allocate_available_register(gen->f);
+      spe_move(gen->f, gen->kill_mask_reg, kil_reg);
+   }
+   else {
+      spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg);
+   }
+
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
@@ -1695,6 +1773,8 @@ emit_instruction(struct codegen *gen,
       /* fall-through for now */
    case TGSI_OPCODE_TXP:
       return emit_TEX(gen, inst);
+   case TGSI_OPCODE_KIL:
+      return emit_KIL(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 80e9c696f8..95ef4c9244 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -89,9 +89,9 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       uint facing);
 
 /** Function for running fragment program */
-typedef void (*spu_fragment_program_func)(vector float *inputs,
-                                          vector float *outputs,
-                                          vector float *constants);
+typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
+                                                         vector float *outputs,
+                                                         vector float *constants);
 
 
 struct spu_framebuffer
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index d83085d0f9..4caf7d6b61 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -254,6 +254,7 @@ emit_quad( int x, int y, mask_t mask)
          vector float inputs[4*4], outputs[2*4];
          vector float fragZ = eval_z((float) x, (float) y);
          vector float fragW = eval_w((float) x, (float) y);
+         vector unsigned int kill_mask;
 
          /* setup inputs */
 #if 0
@@ -268,7 +269,9 @@ emit_quad( int x, int y, mask_t mask)
          ASSERT(spu.fragment_ops);
 
          /* Execute the current fragment program */
-         spu.fragment_program(inputs, outputs, spu.constants);
+         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
+
+         mask = spu_andc(mask, kill_mask);
 
          /* Execute per-fragment/quad operations, including:
           * alpha test, z test, stencil test, blend and framebuffer writing.
-- 
cgit v1.2.3


From 51ffab362b27997f9c6c60bf9bace1b1854817db Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 13:54:17 -0600
Subject: cell: pass spu_texture_level ptr to get_four_texels()

---
 src/gallium/drivers/cell/spu/spu_texture.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index e3d9a49dc4..c0af05e46e 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -72,10 +72,10 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
+get_four_texels(const struct spu_texture_level *tlevel, uint face,
+                vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
-   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    unsigned texture_ea = (uintptr_t) tlevel->start;
    const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
@@ -145,7 +145,7 @@ sample_texture_2d_nearest(vector float s, vector float t,
    is = spu_clamp(is, tlevel->max_s);
    it = spu_clamp(it, tlevel->max_t);
 
-   get_four_texels(unit, level, face, is, it, texels);
+   get_four_texels(tlevel, face, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -188,10 +188,10 @@ sample_texture_2d_bilinear(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
 
    /* convert packed int texels to float colors */
    vector float ftexels[16];
@@ -346,10 +346,10 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
-- 
cgit v1.2.3


From fa7b8388066651c5cfafd4ce6461fc43c982d8c7 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 15:48:04 -0600
Subject: cell: use 7-bit weights in sample_texture_2d_bilinear_int()

This allows us to use 16-bit signed mul/add instructions.  Had to
used unsigned mul before and there's no unsigned mul/add instruction.
---
 src/gallium/drivers/cell/spu/spu_texture.c | 62 +++++++++++++++---------------
 1 file changed, 31 insertions(+), 31 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index c0af05e46e..4e12a116cd 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -314,19 +314,19 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
    vector float ss = spu_madd(s, tlevel->scale_s, half);
    vector float tt = spu_madd(t, tlevel->scale_t, half);
 
-   /* convert float coords to fixed-pt coords with 8 fraction bits */
-   vector signed int is = spu_convts(ss, 8);
-   vector signed int it = spu_convts(tt, 8);
+   /* convert float coords to fixed-pt coords with 7 fraction bits */
+   vector signed int is = spu_convts(ss, 7);  /* XXX really need floor() here */
+   vector signed int it = spu_convts(tt, 7);  /* XXX really need floor() here */
 
-   /* compute integer texel weights in [0, 255] */
-   vector signed int sWeights0 = spu_and(is, 255);
-   vector signed int tWeights0 = spu_and(it, 255);
-   vector signed int sWeights1 = spu_sub(255, sWeights0);
-   vector signed int tWeights1 = spu_sub(255, tWeights0);
+   /* compute integer texel weights in [0, 127] */
+   vector signed int sWeights0 = spu_and(is, 127);
+   vector signed int tWeights0 = spu_and(it, 127);
+   vector signed int sWeights1 = spu_sub(127, sWeights0);
+   vector signed int tWeights1 = spu_sub(127, tWeights0);
 
-   /* texel coords: is0 = is / 256, it0 = is / 256 */
-   vector signed int is0 = spu_rlmask(is, -8);
-   vector signed int it0 = spu_rlmask(it, -8);
+   /* texel coords: is0 = is / 128, it0 = is / 128 */
+   vector signed int is0 = spu_rlmask(is, -7);
+   vector signed int it0 = spu_rlmask(it, -7);
 
    /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
    vector signed int is1 = spu_add(is0, 1);
@@ -377,36 +377,36 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
    vector unsigned int c0, c1, c2, c3, cSum;
 
    /* red */
-   c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
-   colors[0] = spu_convtf(cSum, 24);
+   colors[0] = spu_convtf(cSum, 22);
 
    /* green */
-   c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
-   colors[1] = spu_convtf(cSum, 24);
+   colors[1] = spu_convtf(cSum, 22);
 
    /* blue */
-   c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
-   colors[2] = spu_convtf(cSum, 24);
+   colors[2] = spu_convtf(cSum, 22);
 
    /* alpha */
-   c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
-   colors[3] = spu_convtf(cSum, 24);
+   colors[3] = spu_convtf(cSum, 22);
 }
 
 
-- 
cgit v1.2.3


From 9fa8671c73fa44a95e2ea7fed6047bddb042796f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 20:25:28 -0600
Subject: cell: add new debug flag (cache) to report texture cache stats on
 exit

---
 src/gallium/drivers/cell/common.h           | 1 +
 src/gallium/drivers/cell/ppu/cell_context.c | 1 +
 src/gallium/drivers/cell/spu/spu_command.c  | 3 ++-
 src/gallium/drivers/cell/spu/spu_dcache.c   | 4 +++-
 4 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 469d56cda8..9ca4e9d67e 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -117,6 +117,7 @@
 #define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
 #define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
 #define CELL_DEBUG_CMD                  (1 << 5)
+#define CELL_DEBUG_CACHE                (1 << 6)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 4dad490ce1..7a2d93ecb4 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -94,6 +94,7 @@ static const struct debug_named_value cell_debug_flags[] = {
    {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
    {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
    {"cmd", CELL_DEBUG_CMD},       /**< SPUs dump command buffer info */
+   {"cache", CELL_DEBUG_CACHE},   /**< report texture cache stats on exit */
    {NULL, 0}
 };
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ff4a52d79a..9c853c0961 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -720,5 +720,6 @@ command_loop(void)
 
    D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
 
-   spu_dcache_report();
+   if (spu.init.debug_flags & CELL_DEBUG_CACHE)
+      spu_dcache_report();
 }
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 167404cdc5..a6d67634fd 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -36,7 +36,9 @@
 #define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
 #define CACHE_LOG2NNWAY       2
 #define CACHE_LOG2NSETS       6
-/*#define CACHE_STATS           1*/
+#ifdef DEBUG
+#define CACHE_STATS           1
+#endif
 #include <cache-api.h>
 
 /* Yes folks, this is ugly.
-- 
cgit v1.2.3


From 81724da4f61f2ba678e2e0376209e1b754e1ecab Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 17 Oct 2008 09:09:57 -0600
Subject: cell: use an approximation in compute_lambda_2d() to avoid sqrt

Though, the logf() call still needs attention.
---
 src/gallium/drivers/cell/spu/spu_texture.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 4e12a116cd..69784c8978 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -414,7 +414,7 @@ sample_texture_2d_bilinear_int(vector float s, vector float t,
 /**
  * Compute level of detail factor from texcoords.
  */
-static float
+static INLINE float
 compute_lambda_2d(uint unit, vector float s, vector float t)
 {
    uint baseLevel = 0;
@@ -424,11 +424,21 @@ compute_lambda_2d(uint unit, vector float s, vector float t)
    float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
    float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
    float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+#if 0
+   /* ideal value */
    float x = dsdx * dsdx + dtdx * dtdx;
    float y = dsdy * dsdy + dtdy * dtdy;
    float rho = x > y ? x : y;
    rho = sqrtf(rho);
-   float lambda = logf(rho) * 1.442695f;
+#else
+   /* approximation */
+   dsdx = fabsf(dsdx);
+   dsdy = fabsf(dsdy);
+   dtdx = fabsf(dtdx);
+   dtdy = fabsf(dtdy);
+   float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5;
+#endif
+   float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */
    return lambda;
 }
 
-- 
cgit v1.2.3


From 70dd4379d2cd54f229c3940312537912470218d3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 10:34:13 -0600
Subject: cell: implement fencing for texture buffers

If we delete a texture, we need to keep the underlying tiled data buffer
around until any rendering that references it has completed.
Keep a list of buffers referenced by a rendering batch.  Unref/free them when
the associated batch's fence is executed/signalled.
---
 src/gallium/drivers/cell/common.h              |  25 ++++
 src/gallium/drivers/cell/ppu/Makefile          |   1 +
 src/gallium/drivers/cell/ppu/cell_batch.c      |  32 +++++
 src/gallium/drivers/cell/ppu/cell_context.c    |   6 +
 src/gallium/drivers/cell/ppu/cell_context.h    |  21 ++++
 src/gallium/drivers/cell/ppu/cell_fence.c      | 158 +++++++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_fence.h      |  57 +++++++++
 src/gallium/drivers/cell/ppu/cell_state_emit.c |   2 +-
 src/gallium/drivers/cell/ppu/cell_texture.c    |  33 ++++--
 src/gallium/drivers/cell/ppu/cell_texture.h    |   5 +-
 src/gallium/drivers/cell/ppu/cell_vbuf.c       |   6 +
 src/gallium/drivers/cell/spu/spu_command.c     |  38 +++++-
 src/gallium/drivers/cell/spu/spu_main.h        |   2 +-
 13 files changed, 367 insertions(+), 19 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 9ca4e9d67e..23fb0b0831 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -102,6 +102,8 @@
 #define CELL_CMD_STATE_RASTERIZER    22
 #define CELL_CMD_VS_EXECUTE          23
 #define CELL_CMD_FLUSH_BUFFER_RANGE  24
+#define CELL_CMD_FENCE               25
+
 
 /** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
@@ -123,6 +125,29 @@
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
 
 
+
+#define CELL_FENCE_IDLE      0
+#define CELL_FENCE_EMITTED   1
+#define CELL_FENCE_SIGNALLED 2
+
+struct cell_fence
+{
+   /** There's a 16-byte status qword per SPU */
+   volatile uint status[CELL_MAX_SPUS][4];
+};
+
+
+/**
+ * Fence command sent to SPUs.  In response, the SPUs will write
+ * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory.
+ */
+struct cell_command_fence
+{
+   uint64_t opcode;      /**< CELL_CMD_FENCE */
+   struct cell_fence *fence;
+};
+
+
 /**
  * Command to specify per-fragment operations state and generated code.
  * Note that the dsa, blend, blend_color fields are really only needed
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b28f4c5c31..9358a47284 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -24,6 +24,7 @@ SOURCES = \
 	cell_clear.c \
 	cell_context.c \
 	cell_draw_arrays.c \
+	cell_fence.c \
 	cell_flush.c \
 	cell_gen_fragment.c \
 	cell_gen_fp.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 01254aed60..448b723d85 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -28,6 +28,7 @@
 
 #include "cell_context.h"
 #include "cell_batch.h"
+#include "cell_fence.h"
 #include "cell_spu.h"
 
 
@@ -63,6 +64,10 @@ cell_get_empty_buffer(struct cell_context *cell)
                printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries);
                */
                prev_buffer = buf;
+
+               /* release tex buffer associated w/ prev use of this batch buf */
+               cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]);
+
                return buf;
             }
          }
@@ -84,6 +89,26 @@ cell_get_empty_buffer(struct cell_context *cell)
 }
 
 
+/**
+ * Append a fence command to the current batch buffer.
+ * Note that we're sure there's always room for this because of the
+ * adjusted size check in cell_batch_free_space().
+ */
+static void
+emit_fence(struct cell_context *cell)
+{
+   const uint batch = cell->cur_batch;
+   const uint size = cell->buffer_size[batch];
+   struct cell_command_fence *fence_cmd;
+
+   ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
+
+   fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
+   fence_cmd->opcode = CELL_CMD_FENCE;
+   fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+}
+
+
 /**
  * Flush the current batch buffer to the SPUs.
  * An empty buffer will be found and set as the new current batch buffer
@@ -102,6 +127,12 @@ cell_batch_flush(struct cell_context *cell)
    if (size == 0)
       return;
 
+   /* Before we use this batch buffer, make sure any fenced texture buffers
+    * are released.
+    */
+   if (cell->fenced_buffers[batch].head)
+      emit_fence(cell);
+
    flushing = TRUE;
 
    assert(batch < CELL_NUM_BUFFERS);
@@ -142,6 +173,7 @@ uint
 cell_batch_free_space(const struct cell_context *cell)
 {
    uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
+   free -= sizeof(struct cell_command_fence);
    return free;
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 7a2d93ecb4..22d552d8e3 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -47,6 +47,7 @@
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_state.h"
 #include "cell_surface.h"
@@ -104,6 +105,7 @@ cell_create_context(struct pipe_screen *screen,
                     struct cell_winsys *cws)
 {
    struct cell_context *cell;
+   uint i;
 
    /* some fields need to be 16-byte aligned, so align the whole object */
    cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
@@ -151,6 +153,10 @@ cell_create_context(struct pipe_screen *screen,
                                               cell_debug_flags, 
                                               0 );
 
+   for (i = 0; i < CELL_NUM_BUFFERS; i++)
+      cell_fence_init(&cell->fenced_buffers[i].fence);
+
+
    /*
     * SPU stuff
     */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index ad1f4829a4..4491ae8cdf 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -81,6 +81,19 @@ struct cell_fragment_ops_key
 };
 
 
+struct cell_buffer_node;
+
+/**
+ * Fenced buffer list.  List of buffers which can be unreferenced after
+ * the fence has been executed/signalled.
+ */
+struct cell_buffer_list
+{
+   struct cell_fence fence;
+   struct cell_buffer_node *head;
+};
+
+
 /**
  * Per-context state, subclass of pipe_context.
  */
@@ -154,6 +167,14 @@ struct cell_context
    uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
 
+   /** Associated with each command/batch buffer is a list of pipe_buffers
+    * that are fenced.  When the last command in a buffer is executed, the
+    * fence will be signalled, indicating that any pipe_buffers preceeding
+    * that fence can be unreferenced (and probably freed).
+    */
+   struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS];
+
+
    struct spe_function attrib_fetch;
    unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
 
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
new file mode 100644
index 0000000000..ffb3bea12b
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -0,0 +1,158 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <unistd.h>
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_fence.h"
+#include "cell_texture.h"
+
+
+void
+cell_fence_init(struct cell_fence *fence)
+{
+   uint i;
+   for (i = 0; i < CELL_MAX_SPUS; i++) {
+      fence->status[i][0] = CELL_FENCE_IDLE;
+   }
+}
+
+
+boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence)
+{
+   uint i;
+   for (i = 0; i < cell->num_spus; i++) {
+      //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
+      if (fence->status[i][0] == CELL_FENCE_EMITTED)
+         return FALSE;
+   }
+   return TRUE;
+}
+
+
+void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence)
+{
+   while (!cell_fence_signalled(cell, fence)) {
+      usleep(10);
+   }
+}
+
+
+
+
+struct cell_buffer_node
+{
+   struct pipe_buffer *buffer;
+   struct cell_buffer_node *next;
+};
+
+
+static void
+cell_add_buffer_to_list(struct cell_context *cell,
+                        struct cell_buffer_list *list,
+                        struct pipe_buffer *buffer)
+{
+   struct pipe_screen *ps = cell->pipe.screen;
+   struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node);
+   /* create new list node which references the buffer, insert at head */
+   if (node) {
+      pipe_buffer_reference(ps, &node->buffer, buffer);
+      node->next = list->head;
+      list->head = node;
+   }
+}
+
+
+/**
+ * Wait for completion of the given fence, then unreference any buffers
+ * on the list.
+ * This typically unrefs/frees texture buffers after any rendering which uses
+ * them has completed.
+ */
+void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list)
+{
+   if (list->head) {
+      struct pipe_screen *ps = cell->pipe.screen;
+      struct cell_buffer_node *node;
+
+      cell_fence_finish(cell, &list->fence);
+
+      /* traverse the list, unreferencing buffers, freeing nodes */
+      node = list->head;
+      while (node) {
+         struct cell_buffer_node *next = node->next;
+         assert(node->buffer);
+         pipe_buffer_unmap(ps, node->buffer);
+#if 0
+         printf("Unref buffer %p\n", node->buffer);
+         if (node->buffer->refcount == 1)
+            printf("   Delete!\n");
+#endif
+         pipe_buffer_reference(ps, &node->buffer, NULL);
+         FREE(node);
+         node = next;
+      }
+      list->head = NULL;
+   }
+}
+
+
+/**
+ * This should be called for each render command.
+ * Any texture buffers that are current bound will be added to a fenced
+ * list to be freed later when the fence is executed/signalled.
+ */
+void
+cell_add_fenced_textures(struct cell_context *cell)
+{
+   struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch];
+   uint i;
+
+   for (i = 0; i < cell->num_textures; i++) {
+      struct cell_texture *ct = cell->texture[i];
+      if (ct) {
+         uint level;
+         for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+            if (ct->tiled_buffer[level]) {
+#if 0
+               printf("Adding texture %p buffer %p to list\n",
+                      ct, ct->tiled_buffer[level]);
+#endif
+               cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]);
+            }
+         }
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h
new file mode 100644
index 0000000000..536b4ba411
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_FENCE_H
+#define CELL_FENCE_H
+
+
+extern void
+cell_fence_init(struct cell_fence *fence);
+
+
+extern boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence);
+
+
+extern void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence);
+
+
+
+extern void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list);
+
+
+extern void
+cell_add_fenced_textures(struct cell_context *cell);
+
+
+#endif /* CELL_FENCE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index effcd2a1e1..dd2d7f7d1e 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -225,7 +225,7 @@ cell_emit_state(struct cell_context *cell)
             if (cell->texture[i]) {
                uint level;
                for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
-                  texture->start[level] = cell->texture[i]->tiled_data[level];
+                  texture->start[level] = cell->texture[i]->tiled_mapped[level];
                   texture->width[level] = cell->texture[i]->base.width[level];
                   texture->height[level] = cell->texture[i]->base.height[level];
                   texture->depth[level] = cell->texture[i]->base.depth[level];
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9c6741f1bc..9ac2f3bbb9 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -136,6 +136,9 @@ cell_texture_release(struct pipe_screen *screen,
        __FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
    */
    if (--(*pt)->refcount <= 0) {
+      /* Delete this texture now.
+       * But note that the underlying pipe_buffer may linger...
+       */
       struct cell_texture *ct = cell_texture(*pt);
       uint i;
 
@@ -146,14 +149,12 @@ cell_texture_release(struct pipe_screen *screen,
       pipe_buffer_reference(screen, &ct->buffer, NULL);
 
       for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
-         if (ct->tiled_data[i]) {
-            /* XXX need to use a fenced buffer for tiled data so that
-             * it's properly freed after rendering has completed.
-             * Disabling this free() allows glDrawPixels to work for now.
-             */
-#if 0
-            align_free(ct->tiled_data[i]);
-#endif
+         /* Unreference the tiled image buffer.
+          * It may not actually be deleted until a fence is hit.
+          */
+         if (ct->tiled_buffer[i]) {
+            ct->tiled_mapped[i] = NULL;
+            winsys_buffer_reference(screen->winsys, &ct->tiled_buffer[i], NULL);
          }
       }
 
@@ -234,12 +235,18 @@ cell_twiddle_texture(struct pipe_screen *screen,
          int offset = bufWidth * bufHeight * 4 * surface->face;
          uint *dst;
 
-         if (!ct->tiled_data[level]) {
-            ct->tiled_data[level] =
-               align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+         if (!ct->tiled_buffer[level]) {
+            /* allocate buffer for tiled data now */
+            struct pipe_winsys *ws = screen->winsys;
+            uint bytes = bufWidth * bufHeight * 4 * numFaces;
+            ct->tiled_buffer[level] = ws->buffer_create(ws, 16,
+                                                        PIPE_BUFFER_USAGE_PIXEL,
+                                                        bytes);
+            /* and map it */
+            ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level],
+                                                     PIPE_BUFFER_USAGE_GPU_READ);
          }
-
-         dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+         dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset);
 
          twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
                             surface->stride, src);
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index a0757091b0..2f5fe0dd1b 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -48,7 +48,10 @@ struct cell_texture
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data[CELL_MAX_TEXTURE_LEVELS];  /* XXX this may be temporary */ /*ALIGN16*/
+   /** Texture data in tiled layout is held here */
+   struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
+   /** Mapped, tiled texture data */
+   void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..65ba51b6bb 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -38,6 +38,7 @@
 
 #include "cell_batch.h"
 #include "cell_context.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_spu.h"
 #include "cell_vbuf.h"
@@ -108,6 +109,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
           __FUNCTION__, cvbr->vertex_buf, vertices_used);
    */
 
+   /* Make sure texture buffers aren't released until we're done rendering
+    * with them.
+    */
+   cell_add_fenced_textures(cell);
+
    /* Tell SPUs they can release the vert buf */
    if (cvbr->vertex_buf != ~0U) {
       struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 9c853c0961..a6ed29ea63 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -76,9 +76,10 @@ static void
 release_buffer(uint buffer)
 {
    /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
+   static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE};
    const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
    uint *dst = spu.init.buffer_status + index;
 
@@ -93,6 +94,29 @@ release_buffer(uint buffer)
 }
 
 
+/**
+ * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory.
+ * There's a qword of status per SPU.
+ */
+static void
+cmd_fence(struct cell_command_fence *fence_cmd)
+{
+   static const vector unsigned int status = {CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED};
+   uint *dst = (uint *) fence_cmd->fence;
+   dst += 4 * spu.init.id;  /* main store/memory address, not local store */
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_FENCE,           /* tag */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
@@ -637,6 +661,14 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
+      case CELL_CMD_FENCE:
+         {
+            struct cell_command_fence *fence_cmd =
+               (struct cell_command_fence *) &buffer[pos];
+            cmd_fence(fence_cmd);
+            pos += sizeof(*fence_cmd) / 8;
+         }
+         break;
       case CELL_CMD_RELEASE_VERTS:
          {
             struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 95ef4c9244..668af10be2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -210,7 +210,7 @@ extern struct spu_global spu;
 #define TAG_DCACHE1           21
 #define TAG_DCACHE2           22
 #define TAG_DCACHE3           23
-
+#define TAG_FENCE             24
 
 
 static INLINE void
-- 
cgit v1.2.3


From db680ac0e3697ecc2c2dbd5f22c4c2fdb136b62c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 28 Oct 2008 14:03:51 -0600
Subject: cell: fix a number of fence issues

Plus add assertions to check status, alignment, etc.
---
 src/gallium/drivers/cell/ppu/cell_batch.c   | 19 ++++++++++++++++---
 src/gallium/drivers/cell/ppu/cell_context.h |  2 +-
 src/gallium/drivers/cell/ppu/cell_fence.c   | 14 ++++++++++++--
 src/gallium/drivers/cell/spu/spu_command.c  |  2 +-
 4 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 448b723d85..962775cd33 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -100,12 +100,23 @@ emit_fence(struct cell_context *cell)
    const uint batch = cell->cur_batch;
    const uint size = cell->buffer_size[batch];
    struct cell_command_fence *fence_cmd;
+   struct cell_fence *fence = &cell->fenced_buffers[batch].fence;
+   uint i;
+
+   /* set fence status to emitted, not yet signalled */
+   for (i = 0; i < cell->num_spus; i++) {
+      fence->status[i][0] = CELL_FENCE_EMITTED;
+   }
 
    ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
 
    fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
    fence_cmd->opcode = CELL_CMD_FENCE;
-   fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+   fence_cmd->fence = fence;
+
+   /* update batch buffer size */
+   cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
+   assert(sizeof(struct cell_command_fence) % 8 == 0);
 }
 
 
@@ -119,7 +130,7 @@ cell_batch_flush(struct cell_context *cell)
 {
    static boolean flushing = FALSE;
    uint batch = cell->cur_batch;
-   const uint size = cell->buffer_size[batch];
+   uint size = cell->buffer_size[batch];
    uint spu, cmd_word;
 
    assert(!flushing);
@@ -130,8 +141,10 @@ cell_batch_flush(struct cell_context *cell)
    /* Before we use this batch buffer, make sure any fenced texture buffers
     * are released.
     */
-   if (cell->fenced_buffers[batch].head)
+   if (cell->fenced_buffers[batch].head) {
       emit_fence(cell);
+      size = cell->buffer_size[batch];
+   }
 
    flushing = TRUE;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 4491ae8cdf..eb1397bb3f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
  */
 struct cell_buffer_list
 {
-   struct cell_fence fence;
+   struct cell_fence fence ALIGN16_ATTRIB;
    struct cell_buffer_node *head;
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
index ffb3bea12b..867b5dcaa0 100644
--- a/src/gallium/drivers/cell/ppu/cell_fence.c
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -38,6 +38,7 @@ void
 cell_fence_init(struct cell_fence *fence)
 {
    uint i;
+   ASSERT_ALIGN16(fence->status);
    for (i = 0; i < CELL_MAX_SPUS; i++) {
       fence->status[i][0] = CELL_FENCE_IDLE;
    }
@@ -50,9 +51,9 @@ cell_fence_signalled(const struct cell_context *cell,
 {
    uint i;
    for (i = 0; i < cell->num_spus; i++) {
-      //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
-      if (fence->status[i][0] == CELL_FENCE_EMITTED)
+      if (fence->status[i][0] != CELL_FENCE_SIGNALLED)
          return FALSE;
+      /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/
    }
    return TRUE;
 }
@@ -65,6 +66,15 @@ cell_fence_finish(const struct cell_context *cell,
    while (!cell_fence_signalled(cell, fence)) {
       usleep(10);
    }
+
+#ifdef DEBUG
+   {
+      uint i;
+      for (i = 0; i < cell->num_spus; i++) {
+         assert(fence->status[i][0] == CELL_FENCE_SIGNALLED);
+      }
+   }
+#endif
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a6ed29ea63..63818d4c46 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -107,7 +107,7 @@ cmd_fence(struct cell_command_fence *fence_cmd)
                                               CELL_FENCE_SIGNALLED};
    uint *dst = (uint *) fence_cmd->fence;
    dst += 4 * spu.init.id;  /* main store/memory address, not local store */
-
+   ASSERT_ALIGN16(dst);
    mfc_put((void *) &status,    /* src in local memory */
            (unsigned int) dst,  /* dst in main memory */
            sizeof(status),      /* size */
-- 
cgit v1.2.3


From 8b3af5c5d6fe100707da0d9dcc42500921792638 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 12:12:30 -0600
Subject: cell: use simd utilities for pow, exp2, log2

---
 src/gallium/drivers/cell/spu/spu_funcs.c | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 3534b35000..ff3d609d25 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -38,7 +38,9 @@
 #include <math.h>
 #include <cos14_v.h>
 #include <sin14_v.h>
-#include <transpose_matrix4x4.h>
+#include <simdmath/exp2f4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
 
 #include "cell/common.h"
 #include "spu_main.h"
@@ -68,37 +70,19 @@ spu_sin(vector float x)
 static vector float
 spu_pow(vector float x, vector float y)
 {
-   float z0 = powf(spu_extract(x,0), spu_extract(y,0));
-   float z1 = powf(spu_extract(x,1), spu_extract(y,1));
-   float z2 = powf(spu_extract(x,2), spu_extract(y,2));
-   float z3 = powf(spu_extract(x,3), spu_extract(y,3));
-   return (vector float) {z0, z1, z2, z3};
+   return _powf4(x, y);
 }
 
 static vector float
 spu_exp2(vector float x)
 {
-   float z0 = powf(2.0f, spu_extract(x,0));
-   float z1 = powf(2.0f, spu_extract(x,1));
-   float z2 = powf(2.0f, spu_extract(x,2));
-   float z3 = powf(2.0f, spu_extract(x,3));
-   return (vector float) {z0, z1, z2, z3};
+   return _exp2f4(x);
 }
 
 static vector float
 spu_log2(vector float x)
 {
-   /*
-    * log_base_2(x) = log(x) / log(2)
-    * 1.442695 = 1/log(2).
-    */
-   static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
-   float z0 = logf(spu_extract(x,0));
-   float z1 = logf(spu_extract(x,1));
-   float z2 = logf(spu_extract(x,2));
-   float z3 = logf(spu_extract(x,3));
-   vector float v = (vector float) {z0, z1, z2, z3};
-   return spu_mul(v, k);
+   return _log2f4(x);
 }
 
 
-- 
cgit v1.2.3


From 711f8a1dd94e2e1e715615d947e03015ef972326 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 30 Oct 2008 15:24:23 -0600
Subject: CELL: stencil bug fixes

Two definitive bugs in stenciling were fixed.

The first, reversed registers in the generated Select Bytes (selb)
instruction, caused the stenciling INCR and DECR operations to
fail dramatically, putting new values in where old values were
supposed to be and vice versa.

The second caused stencil tiles to not be read and written from
main memory by the SPUs.  A per-spu flag, spu.read_depth, was used
to indicate whether the SPU should be reading depth tiles, and was set
only when depth was enabled.  A second flag, spu.read_stencil, was
set when stenciling was enabled, but never referenced.

As stenciling and depth are in the same tiles on the Cell, and there
is no corresponding TAG_WRITE_TILE_STENCIL to complement
TAG_WRITE_TILE_COLOR and TAG_WRITE_TILE_Z, I fixed this by
eliminating the unused "spu.read_stencil", renaming "spu.read_depth"
to "spu.read_depth_stencil", and setting it if either stenciling or
depth is enabled.

I also added an optimization to the fragment ops generation code,
that avoids calculating stencil values and/or stencil writemask
when the stencil operations are all KEEP.
---
 progs/trivial/tri-stencil.c                      | 13 ++++++++++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 25 ++++++++++++++++++------
 src/gallium/drivers/cell/spu/spu_command.c       |  3 +--
 src/gallium/drivers/cell/spu/spu_main.h          |  3 +--
 src/gallium/drivers/cell/spu/spu_render.c        |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.c           |  2 +-
 6 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/progs/trivial/tri-stencil.c b/progs/trivial/tri-stencil.c
index 5edbef26ce..7686e16aef 100644
--- a/progs/trivial/tri-stencil.c
+++ b/progs/trivial/tri-stencil.c
@@ -49,7 +49,15 @@ static void Key(unsigned char key, int x, int y)
 
     switch (key) {
       case 27:
+        printf("Exiting...\n");
 	exit(1);
+      case 'r':
+        printf("Redisplaying...\n");
+        glutPostRedisplay();
+        break;
+      default:
+        printf("No such key '%c'...\n", key);
+        break;
     }
 }
 
@@ -89,7 +97,7 @@ static void Draw(void)
    glEnd();
 #endif
 
-#if 0
+#if 1
    glStencilFunc(GL_EQUAL, 1, 1);
    glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);
 
@@ -130,7 +138,8 @@ int main(int argc, char **argv)
 	exit(1);
     }
 
-    glutInitWindowPosition(0, 0); glutInitWindowSize( 300, 300);
+    glutInitWindowPosition(0, 0); 
+    glutInitWindowSize( 300, 300);
 
     type = GLUT_RGB | GLUT_SINGLE | GLUT_DEPTH | GLUT_STENCIL;
     glutInitDisplayMode(type);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 4e1e53ecdc..8e4dd82404 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1282,7 +1282,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
       /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
       spe_ai(f, newS_reg, fbS_reg, 1);
       /* Select from the current value or the new value based on the equality test */
-      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
 
       spe_release_register(f, equals_reg);
       break;
@@ -1295,7 +1295,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
       /* Add Word Immediate with a (-1) value works */
       spe_ai(f, newS_reg, fbS_reg, -1);
       /* Select from the current value or the new value based on the equality test */
-      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
 
       spe_release_register(f, equals_reg);
       break;
@@ -1534,15 +1534,28 @@ gen_stencil_depth_test(struct spe_function *f,
     * meaning that we have to calculate the stencil values but do not
     * need to mask them), we can avoid generating code.  Don't forget
     * that we need to consider backfacing stencil, if enabled.
+    *
+    * Note that if the backface stencil is *not* enabled, the backface
+    * stencil will have the same values as the frontface stencil.
     */
-   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
-      /* Trivial: don't need to calculate stencil values, and don't need to 
-       * write them back to the framebuffer.
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+       /* No changes to any stencil values */
+       need_to_calculate_stencil_values = false;
+       need_to_writemask_stencil_values = false;
+    }
+    else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+      /* All changes are writemasked out, so no need to calculate
+       * what those changes might be, and no need to write anything back.
        */
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+   else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 63818d4c46..d726622d94 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -244,8 +244,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       }
    }
 
-   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
-   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 668af10be2..692790c9f3 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -160,8 +160,7 @@ struct spu_global
    tile_t ztile ALIGN16_ATTRIB;
 
    /** Read depth/stencil tiles? */
-   boolean read_depth;
-   boolean read_stencil;
+   boolean read_depth_stencil;
 
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5515bb55c9..7c225e2f27 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
 static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
          //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
 wait_put_cz_tiles(void)
 {
    wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       wait_on_mask(1 << TAG_WRITE_TILE_Z);
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 4caf7d6b61..5f908159bb 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -369,7 +369,7 @@ flush_spans(void)
    }
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
-- 
cgit v1.2.3


From 90027f85786406133a5180998a75fb612b6a221e Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Tue, 11 Nov 2008 13:57:10 -0700
Subject: CELL: two-sided stencil fixes

With these changes, the tests/stencil_twoside test now works.

- Eliminate blending from the stencil_twoside test, as it produces an
  unneeded dependency on having blending working

- The spe_splat() function will now work if the register being splatted
  and the destination register are the same

- Separate fragment code generated for front-facing and back-facing
  fragments.  Often these are the same; if two-sided stenciling is on,
  they can be different.  This is easier and faster than generating
  code that does both tests and merges the results.

- Fixed a cut/paste bug where if the back Z-pass stencil operation
  were different from all the other operations, the back Z-fail
  results were incorrect.
---
 progs/tests/stencil_twoside.c                      |   2 -
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        |   7 +-
 src/gallium/drivers/cell/common.h                  |   6 +-
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 239 ++++++---------------
 src/gallium/drivers/cell/ppu/cell_gen_fragment.h   |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  19 +-
 src/gallium/drivers/cell/spu/spu_command.c         |   6 +-
 src/gallium/drivers/cell/spu/spu_main.c            |   6 +-
 src/gallium/drivers/cell/spu/spu_main.h            |  10 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |   3 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   3 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  20 +-
 12 files changed, 115 insertions(+), 208 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/progs/tests/stencil_twoside.c b/progs/tests/stencil_twoside.c
index be9d9a776a..8826c46fc2 100644
--- a/progs/tests/stencil_twoside.c
+++ b/progs/tests/stencil_twoside.c
@@ -115,7 +115,6 @@ static void Display( void )
    glVertex2f(-1,  1);
    glEnd();
 
-
    if (use20syntax) {
       stencil_func_separate(GL_FRONT, GL_ALWAYS, 0, ~0);
       stencil_func_separate(GL_BACK, GL_ALWAYS, 0, ~0);
@@ -279,7 +278,6 @@ static void Init( void )
    stencil_op_separate = glutGetProcAddress( "glStencilOpSeparate" );
 
    printf("\nAll 5 squares should be the same color.\n");
-   glEnable( GL_BLEND );
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index f8568f690b..1bd9f1c8dd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -958,9 +958,12 @@ spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsig
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
+   /* Use a temporary, just in case rT == rA */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
    /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
-   spe_ila(p, rT, 0x00010203);
-   spe_shufb(p, rT, rA, rA, rT);
+   spe_ila(p, tmp_reg, 0x00010203);
+   spe_shufb(p, rT, rA, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
 }
 
 
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 87488ea2d7..a670ed3c6e 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -130,6 +130,9 @@
 #define CELL_FENCE_EMITTED   1
 #define CELL_FENCE_SIGNALLED 2
 
+#define CELL_FACING_FRONT    0
+#define CELL_FACING_BACK     1
+
 struct cell_fence
 {
    /** There's a 16-byte status qword per SPU */
@@ -160,7 +163,8 @@ struct cell_command_fragment_ops
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
-   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS];
+   unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index d9c3ff3f4d..6e425eafaa 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1412,144 +1412,72 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
  * and released by the corresponding spe_release_register_set() call.
  */
 static void
-gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *stencil,
+                       const unsigned int depth_enabled,
                        unsigned int fbS_reg, 
                        unsigned int *fail_reg, unsigned int *zfail_reg, 
-                       unsigned int *zpass_reg, unsigned int *back_fail_reg, 
-                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+                       unsigned int *zpass_reg)
 {
-   unsigned zfail_op, back_zfail_op;
+   unsigned zfail_op;
 
    /* Stenciling had better be enabled here */
-   ASSERT(dsa->stencil[0].enabled);
+   ASSERT(stencil->enabled);
 
    /* If the depth test is not enabled, it is treated as though it always
-    * passes.  In particular, that means that the "zfail_op" (and the backfacing
-    * counterpart, if active) are not considered - a failing stencil test will
-    * trigger the "fail_op", and a passing stencil test will trigger the
-    * "zpass_op".
+    * passes, which means that the zfail_op is not considered - a
+    * failing stencil test triggers the fail_op, and a passing one
+    * triggers the zpass_op
     *
-    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
-    * we keep them from being calculated.
+    * As an optimization, override calculation of the zfail_op values
+    * if they aren't going to be used.  By setting the value of
+    * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
+    * to match the incoming stencil values, and no calculation will
+    * be done.
     */
-   if (dsa->depth.enabled) {
-      zfail_op = dsa->stencil[0].zfail_op;
-      back_zfail_op = dsa->stencil[1].zfail_op;
+   if (depth_enabled) {
+      zfail_op = stencil->zfail_op;
    }
    else {
       zfail_op = PIPE_STENCIL_OP_KEEP;
-      back_zfail_op = PIPE_STENCIL_OP_KEEP;
    }
 
    /* One-sided or front-facing stencil */
-   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
       *fail_reg = fbS_reg;
    }
    else {
       *fail_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->fail_op, stencil->ref_value, 
          0xff, fbS_reg, *fail_reg);
    }
 
+   /* Check the possibly overridden value, not the structure value */
    if (zfail_op == PIPE_STENCIL_OP_KEEP) {
       *zfail_reg = fbS_reg;
    }
-   else if (zfail_op == dsa->stencil[0].fail_op) {
+   else if (zfail_op == stencil->fail_op) {
       *zfail_reg = *fail_reg;
    }
    else {
       *zfail_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->zfail_op, stencil->ref_value, 
          0xff, fbS_reg, *zfail_reg);
    }
 
-   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+   if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
       *zpass_reg = fbS_reg;
    }
-   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+   else if (stencil->zpass_op == stencil->fail_op) {
       *zpass_reg = *fail_reg;
    }
-   else if (dsa->stencil[0].zpass_op == zfail_op) {
+   else if (stencil->zpass_op == zfail_op) {
       *zpass_reg = *zfail_reg;
    }
    else {
       *zpass_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->zpass_op, stencil->ref_value, 
          0xff, fbS_reg, *zpass_reg);
    }
-
-   /* If two-sided stencil is enabled, we have more work to do. */
-   if (!dsa->stencil[1].enabled) {
-      /* This just flags that the registers need not be deallocated later */
-      *back_fail_reg = fbS_reg;
-      *back_zfail_reg = fbS_reg;
-      *back_zpass_reg = fbS_reg;
-   }
-   else {
-      /* Same calculations as above, but for the back stencil */
-      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
-         *back_fail_reg = fbS_reg;
-      }
-      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
-         *back_fail_reg = *fail_reg;
-      }
-      else if (dsa->stencil[1].fail_op == zfail_op) {
-         *back_fail_reg = *zfail_reg;
-      }
-      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
-         *back_fail_reg = *zpass_reg;
-      }
-      else {
-         *back_fail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_fail_reg);
-      }
-
-      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
-         *back_zfail_reg = fbS_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[0].fail_op) {
-         *back_zfail_reg = *fail_reg;
-      }
-      else if (back_zfail_op == zfail_op) {
-         *back_zfail_reg = *zfail_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
-         *back_zfail_reg = *zpass_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[1].fail_op) {
-         *back_zfail_reg = *back_fail_reg;
-      }
-      else {
-         *back_zfail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_zfail_reg);
-      }
-
-      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
-         *back_zpass_reg = fbS_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
-         *back_zpass_reg = *fail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == zfail_op) {
-         *back_zpass_reg = *zfail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
-         *back_zpass_reg = *zpass_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
-         *back_zpass_reg = *back_fail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
-         *back_zpass_reg = *back_zfail_reg;
-      }
-      else {
-         *back_zfail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_zpass_reg);
-      }
-   } /* End of calculations for back-facing stencil */
 }
 
 /* Note that fbZ_reg may *not* be set on entry, if in fact
@@ -1559,7 +1487,7 @@ gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_a
 static boolean
 gen_stencil_depth_test(struct spe_function *f, 
                        const struct pipe_depth_stencil_alpha_state *dsa, 
-                       const int const facing_reg,
+                       const uint facing,
                        const int mask_reg, const int fragZ_reg, 
                        const int fbZ_reg, const int fbS_reg)
 {
@@ -1571,6 +1499,8 @@ gen_stencil_depth_test(struct spe_function *f,
    boolean need_to_calculate_stencil_values;
    boolean need_to_writemask_stencil_values;
 
+   struct pipe_stencil_state *stencil;
+
    /* Registers.  We may or may not actually allocate these, depending
     * on whether the state values indicate that we need them.
     */
@@ -1598,6 +1528,20 @@ gen_stencil_depth_test(struct spe_function *f,
    spe_comment(f, 0, "Allocating stencil register set");
    spe_allocate_register_set(f);
 
+   /* The facing we're given is the fragment facing; it doesn't
+    * exactly match the stencil facing.  If stencil is enabled,
+    * but two-sided stencil is *not* enabled, we use the same
+    * stencil settings for both front- and back-facing fragments.
+    * We only use the "back-facing" stencil for backfacing fragments
+    * if two-sided stenciling is enabled.
+    */
+   if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
+      stencil = &dsa->stencil[1];
+   }
+   else {
+      stencil = &dsa->stencil[0];
+   }
+
    /* Calculate the writemask.  If the writemask is trivial (either
     * all 0s, meaning that we don't need to calculate any stencil values
     * because they're not going to change the stencil anyway, or all 1s,
@@ -1608,24 +1552,20 @@ gen_stencil_depth_test(struct spe_function *f,
     * Note that if the backface stencil is *not* enabled, the backface
     * stencil will have the same values as the frontface stencil.
     */
-   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
-       /* No changes to any stencil values */
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
        need_to_calculate_stencil_values = false;
        need_to_writemask_stencil_values = false;
     }
-    else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+    else if (stencil->write_mask == 0x0) {
       /* All changes are writemasked out, so no need to calculate
        * what those changes might be, and no need to write anything back.
        */
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
+   else if (stencil->write_mask == 0xff) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
@@ -1645,14 +1585,7 @@ gen_stencil_depth_test(struct spe_function *f,
        */
       spe_comment(f, 0, "Computing stencil writemask");
       stencil_writemask_reg = spe_allocate_available_register(f);
-      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
-      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
-         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
-         spe_comment(f, 0, "Resolving two-sided stencil writemask");
-         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
-         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
-         spe_release_register(f, back_write_mask_reg);
-      }
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].write_mask);
    }
 
    /* At least one-sided stenciling must be on.  Generate code that
@@ -1666,19 +1599,7 @@ gen_stencil_depth_test(struct spe_function *f,
     */
    spe_comment(f, 0, "Running basic stencil test");
    stencil_pass_reg = spe_allocate_available_register(f);
-   gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg);
-
-   /* If two-sided stenciling is on, generate code to run the stencil
-    * test on the backfacing stencil as well, and combine the two results
-    * into the one correct result based on facing.
-    */
-   if (dsa->stencil[1].enabled) {
-      unsigned int temp_reg = spe_allocate_available_register(f);
-      spe_comment(f, 0, "Running backface stencil test");
-      gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg);
-      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
-      spe_release_register(f, temp_reg);
-   }
+   gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
 
    /* Generate code that, given the mask of valid fragments and the
     * mask of valid fragments that passed the stencil test, computes
@@ -1698,9 +1619,6 @@ gen_stencil_depth_test(struct spe_function *f,
 
    /* We may not need to calculate stencil values, if the writemask is off */
    if (need_to_calculate_stencil_values) {
-      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
-      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
-
       /* Generate code that calculates exactly which stencil values we need,
        * without calculating the same value twice (say, if two different
        * stencil ops have the same value).  This code will work for one-sided
@@ -1715,51 +1633,11 @@ gen_stencil_depth_test(struct spe_function *f,
        * This function will allocate a variant number of registers that
        * will be released as part of the register set.
        */
-      spe_comment(f, 0, "Computing stencil values");
-      gen_get_stencil_values(f, dsa, fbS_reg, 
-         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
-         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
-         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
-
-      /* Tricky, tricky, tricky - the things we do to create optimal
-       * code...
-       *
-       * The various stencil values registers may overlap with each other
-       * and with fbS_reg arbitrarily (as any particular operation is
-       * only calculated once and stored in one register, no matter
-       * how many times it is used).  So we can't change the values 
-       * within those registers directly - if we change a value in a
-       * register that's being referenced by two different calculations,
-       * we've just unwittingly changed the second value as well...
-       *
-       * Avoid this by allocating new registers to hold the results
-       * (there may be 2, if the depth test is off, or 3, if it is on).
-       * These will be released as part of the register set.
-       */
-      if (!dsa->stencil[1].enabled) {
-         /* The easy case: if two-sided stenciling is *not* enabled, we
-          * just use the front-sided values.
-          */
-         stencil_fail_values = front_stencil_fail_values;
-         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
-         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
-      }
-      else { /* two-sided stencil enabled */
-         spe_comment(f, 0, "Resolving backface stencil values");
-         /* Allocate new registers for the needed merged values */
-         stencil_fail_values = spe_allocate_available_register(f);
-         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
-         if (dsa->depth.enabled) {
-            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
-            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
-         }
-         else {
-            stencil_pass_depth_fail_values = fbS_reg;
-         }
-         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
-         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
-      }
-   }
+      spe_comment(f, 0, facing == CELL_FACING_FRONT ? "Computing front-facing stencil values" : "Computing back-facing stencil values");
+      gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, 
+         &stencil_fail_values, &stencil_pass_depth_fail_values, 
+         &stencil_pass_depth_pass_values);
+   }  
 
    /* We now have all the stencil values we need.  We also need 
     * the results of the depth test to figure out which
@@ -1896,10 +1774,12 @@ gen_stencil_depth_test(struct spe_function *f,
  * should be much faster.
  *
  * \param cell  the rendering context (in)
+ * \param facing whether the generated code is for front-facing or 
+ *              back-facing fragments
  * \param f     the generated function (out)
  */
 void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f)
 {
    const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
    const struct pipe_blend_state *blend = cell->blend;
@@ -1917,7 +1797,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
-   const int facing_reg = 13; /* uint */
+
+   ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1945,7 +1826,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
-   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1969,6 +1849,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
+   /* Generate the alpha test, if needed. */
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
@@ -2095,7 +1976,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
           * gen_stencil_depth_test() function must ignore the
           * fbZ_reg register if depth is not enabled.
           */
-         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
index b59de198dc..2fabfdfb08 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -31,7 +31,7 @@
 
 
 extern void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f);
 
 
 #endif /* CELL_GEN_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index dd2d7f7d1e..031b27f11f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -75,23 +75,29 @@ lookup_fragment_ops(struct cell_context *cell)
     * If not found, create/save new fragment ops command.
     */
    if (!ops) {
-      struct spe_function spe_code;
+      struct spe_function spe_code_front, spe_code_back;
 
       if (0)
          debug_printf("**** Create New Fragment Ops\n");
 
       /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
 
-      /* generate new code */
-      cell_gen_fragment_function(cell, &spe_code);
+      /* generate new code.  Always generate new code for both front-facing
+       * and back-facing fragments, even if it's the same code in both
+       * cases.
+       */
+      cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
+      cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
 
       /* alloc new fragment ops command */
       ops = CALLOC_STRUCT(cell_command_fragment_ops);
 
       /* populate the new cell_command_fragment_ops object */
       ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(ops->code, spe_code.store, spe_code_size(&spe_code));
+      memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front));
+      memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back));
       ops->dsa = *cell->depth_stencil;
       ops->blend = *cell->blend;
 
@@ -99,7 +105,8 @@ lookup_fragment_ops(struct cell_context *cell)
       util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
 
       /* release rtasm buffer */
-      spe_release_func(&spe_code);
+      spe_release_func(&spe_code_front);
+      spe_release_func(&spe_code_back);
    }
    else {
       if (0)
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d726622d94..d5faf4e3aa 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -214,7 +214,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
@@ -234,7 +235,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
     * raw state records that the fallback code requires.
     */
    if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+      spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front;
+      spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back;
    }
    else {
       /* otherwise, the default fallback code remains in place */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index c8bb251905..7033f6037d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -63,7 +63,8 @@ one_time_init(void)
     * This will normally be overriden by a code-gen'd function
     * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
     */
-   spu.fragment_ops = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
 }
 
 
@@ -90,7 +91,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0);
    ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 692790c9f3..24cf7d77ce 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -85,8 +85,7 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask,
-                                      uint facing);
+                                      vector unsigned int mask);
 
 /** Function for running fragment program */
 typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
@@ -170,9 +169,10 @@ struct spu_global
    ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-   /** Current fragment ops function */
-   spu_fragment_ops_func fragment_ops;
+   uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
+   spu_fragment_ops_func fragment_ops[2];
 
    /** Current fragment program machine code, at 8-byte boundary */
    uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f8ffc70492..683664e8a4 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -75,8 +75,7 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask,
-                          uint facing)
+                          vector unsigned int mask)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index a61689c83a..f817abf046 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,8 +38,7 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask,
-                          uint facing);
+                          vector unsigned int mask);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 5f908159bb..22e51a86ae 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -275,15 +275,20 @@ emit_quad( int x, int y, mask_t mask)
 
          /* Execute per-fragment/quad operations, including:
           * alpha test, z test, stencil test, blend and framebuffer writing.
+          * Note that there are two different fragment operations functions
+          * that can be called, one for front-facing fragments, and one
+          * for back-facing fragments.  (Often the two are the same;
+          * but in some cases, like two-sided stenciling, they can be
+          * very different.)  So choose the correct function depending
+          * on the calculated facing.
           */
-         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                           fragZ,
                           outputs[0*4+0],
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask,
-                          setup.facing);
+                          mask);
       }
    }
 }
@@ -519,7 +524,14 @@ setup_sort_vertices(const struct vertex_header *v0,
 
    setup.oneOverArea = 1.0f / area;
 
-   /* The product of area * sign indicates front/back orientation (0/1) */
+   /* The product of area * sign indicates front/back orientation (0/1).
+    * Just in case someone gets the bright idea of switching the front
+    * and back constants without noticing that we're assuming their
+    * values in this operation, also assert that the values are
+    * what we think they are.
+    */
+   ASSERT(CELL_FACING_FRONT == 0);
+   ASSERT(CELL_FACING_BACK == 1);
    setup.facing = (area * sign > 0.0f)
       ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 
-- 
cgit v1.2.3


From 11fc390f6478526d4f0bdb4b7e628284da31b3b9 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 21 Nov 2008 11:42:14 -0700
Subject: CELL: use variant-length fragment ops programs

This is a set of changes that optimizes the memory use of fragment
operation programs (by using and transmitting only as much memory as is
needed for the fragment ops programs, instead of maximal sizes), as well
as eliminate the dependency on hard-coded maximal program sizes.  State
that is not dependent on fragment facing (i.e. that isn't using
two-sided stenciling) will only save and transmit a single
fragment operation program, instead of two identical programs.

- Added the ability to emit a LNOP (No Operation (Load)) instruction.
  This is used to pad the generated fragment operations programs to
  a multiple of 8 bytes, which is necessary for proper operation of
  the dual instruction pipeline, and also required for proper SPU-side
  decoding.

- Added the ability to allocate and manage a variant-length
  struct cell_command_fragment_ops.  This structure now puts the
  generated function field at the end, where it can be as large
  as necessary.

- On the PPU side, we now combine the generated front-facing and
  back-facing code into a single variant-length buffer (and only use one
  if the two sets of code are identical) for transmission to the SPU.

- On the SPU side, we pull the correct sizes out of the buffer,
  allocate a new code buffer if the one we have isn't large enough,
  and save the code to that buffer.  The buffer is deallocated when
  the SPU exits.

- Commented out the emit_fetch() static function, which was not being used.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |   7 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  11 ++-
 src/gallium/auxiliary/util/u_memory.h            |   2 +
 src/gallium/drivers/cell/common.h                |  31 +++++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c |   7 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |  77 ++++++++++++++--
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |   3 +
 src/gallium/drivers/cell/spu/spu_command.c       | 111 +++++++++++++++++------
 src/gallium/drivers/cell/spu/spu_command.h       |  32 ++++++-
 src/gallium/drivers/cell/spu/spu_main.c          |  15 +--
 src/gallium/drivers/cell/spu/spu_main.h          |   4 +-
 11 files changed, 232 insertions(+), 68 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 1bd9f1c8dd..b9a75ae559 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -341,7 +341,11 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-
+#define EMIT(_name, _op) \
+void _name (struct spe_function *p) \
+{ \
+   emit_RR(p, _op, 0, 0, 0, __FUNCTION__); \
+}
 
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
@@ -713,7 +717,6 @@ hbrr;
 #if 0
 stop;
 EMIT_RR  (spe_stopd, 0x140);
-EMIT_    (spe_lnop,  0x001);
 EMIT_    (spe_nop,   0x201);
 sync;
 EMIT_    (spe_dsync, 0x003);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7c211ffc51..f9ad2acacd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -99,7 +99,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 #endif /* RTASM_PPC_SPE_H */
 
-#ifndef EMIT_
+#ifndef EMIT
+#define EMIT(_name, _op) \
+    extern void _name (struct spe_function *p);
 #define EMIT_(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT);
 #define EMIT_R(_name, _op) \
@@ -129,7 +131,7 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT_I16(_name, _op) \
     extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
-#endif /* EMIT_ */
+#endif /* EMIT */
 
 
 /* Memory load / store instructions
@@ -294,6 +296,10 @@ EMIT_RI16(spe_brz,       0x040)
 EMIT_RI16(spe_brhnz,     0x046)
 EMIT_RI16(spe_brhz,      0x044)
 
+/* Control instructions
+ */
+EMIT     (spe_lnop,      0x001)
+
 extern void
 spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
 
@@ -418,6 +424,7 @@ EMIT_R   (spe_wrch,       0x10d)
 
 
 #ifdef UNDEF_EMIT_MACROS
+#undef EMIT
 #undef EMIT_
 #undef EMIT_R
 #undef EMIT_RR
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index 857102719d..1a6b596421 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -151,6 +151,8 @@ REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
 
 #define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
 
+#define CALLOC_VARIANT_LENGTH_STRUCT(T,more_size)   ((struct T *) CALLOC(1, sizeof(struct T) + more_size))
+
 
 /**
  * Return memory on given byte alignment
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index a670ed3c6e..98554d7f52 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -121,11 +121,6 @@
 #define CELL_DEBUG_CMD                  (1 << 5)
 #define CELL_DEBUG_CACHE                (1 << 6)
 
-/** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 128
-
-
-
 #define CELL_FENCE_IDLE      0
 #define CELL_FENCE_EMITTED   1
 #define CELL_FENCE_SIGNALLED 2
@@ -153,18 +148,36 @@ struct cell_command_fence
 
 /**
  * Command to specify per-fragment operations state and generated code.
- * Note that the dsa, blend, blend_color fields are really only needed
+ * Note that this is a variant-length structure, allocated with as 
+ * much memory as needed to hold the generated code; the "code"
+ * field *must* be the last field in the structure.  Also, the entire
+ * length of the structure (including the variant code field) must be
+ * a multiple of 8 bytes; we require that this structure itself be
+ * a multiple of 8 bytes, and that the generated code also be a multiple
+ * of 8 bytes.
+ *
+ * Also note that the dsa, blend, blend_color fields are really only needed
  * for the fallback/C per-pixel code.  They're not used when we generate
- * dynamic SPU fragment code (which is the normal case).
+ * dynamic SPU fragment code (which is the normal case), and will eventually
+ * be removed from this structure.
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+
+   /* Fields for the fallback case */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
-   unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS];
-   unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS];
+
+   /* Fields for the generated SPU code */
+   unsigned total_code_size;
+   unsigned front_code_index;
+   unsigned back_code_index;
+   /* this field has variant length, and must be the last field in 
+    * the structure
+    */
+   unsigned code[0];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 82336d6635..2c64eb1bcc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1776,7 +1776,10 @@ gen_stencil_depth_test(struct spe_function *f,
  * \param cell  the rendering context (in)
  * \param facing whether the generated code is for front-facing or 
  *              back-facing fragments
- * \param f     the generated function (out)
+ * \param f     the generated function (in/out); on input, the function
+ *              must already have been initialized.  On exit, whatever
+ *              instructions within the generated function have had
+ *              the fragment ops appended.
  */
 void
 cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f)
@@ -1808,8 +1811,6 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
-   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-
    if (cell->debug_flags & CELL_DEBUG_ASM) {
       spe_print_code(f, true);
       spe_indent(f, 8);
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 031b27f11f..0a0af81f53 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -76,30 +76,86 @@ lookup_fragment_ops(struct cell_context *cell)
     */
    if (!ops) {
       struct spe_function spe_code_front, spe_code_back;
+      unsigned int facing_dependent, total_code_size;
 
       if (0)
          debug_printf("**** Create New Fragment Ops\n");
 
-      /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-      spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      /* Prepare the buffer that will hold the generated code.  The
+       * "0" passed in for the size means that the SPE code will
+       * use a default size.
+       */
+      spe_init_func(&spe_code_front, 0);
+      spe_init_func(&spe_code_back, 0);
 
-      /* generate new code.  Always generate new code for both front-facing
+      /* Generate new code.  Always generate new code for both front-facing
        * and back-facing fragments, even if it's the same code in both
        * cases.
        */
       cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
       cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
 
-      /* alloc new fragment ops command */
-      ops = CALLOC_STRUCT(cell_command_fragment_ops);
+      /* Make sure the code is a multiple of 8 bytes long; this is
+       * required to ensure that the dual pipe instruction alignment
+       * is correct.  It's also important for the SPU unpacking,
+       * which assumes 8-byte boundaries.
+       */
+      unsigned int front_code_size = spe_code_size(&spe_code_front);
+      while (front_code_size % 8 != 0) {
+         spe_lnop(&spe_code_front);
+         front_code_size = spe_code_size(&spe_code_front);
+      }
+      unsigned int back_code_size = spe_code_size(&spe_code_back);
+      while (back_code_size % 8 != 0) {
+         spe_lnop(&spe_code_back);
+         back_code_size = spe_code_size(&spe_code_back);
+      }
 
+      /* Determine whether the code we generated is facing-dependent, by
+       * determining whether the generated code is different for the front-
+       * and back-facing fragments.
+       */
+      if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) {
+         /* Code is identical; only need one copy. */
+         facing_dependent = 0;
+         total_code_size = front_code_size;
+      }
+      else {
+         /* Code is different for front-facing and back-facing fragments.
+          * Need to send both copies.
+          */
+         facing_dependent = 1;
+         total_code_size = front_code_size + back_code_size;
+      }
+
+      /* alloc new fragment ops command.  Note that this structure
+       * has variant length based on the total code size required.
+       */
+      ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
       /* populate the new cell_command_fragment_ops object */
       ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front));
-      memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back));
+      ops->total_code_size = total_code_size;
+      ops->front_code_index = 0;
+      memcpy(ops->code, spe_code_front.store, front_code_size);
+      if (facing_dependent) {
+        /* We have separate front- and back-facing code.  Append the
+         * back-facing code to the buffer.  Be careful because the code
+         * size is in bytes, but the buffer is of unsigned elements.
+         */
+        ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]);
+        memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size);
+      }
+      else {
+        /* Use the same code for front- and back-facing fragments */
+        ops->back_code_index = ops->front_code_index;
+      }
+
+      /* Set the fields for the fallback case.  Note that these fields
+       * (and the whole fallback case) will eventually go away.
+       */
       ops->dsa = *cell->depth_stencil;
       ops->blend = *cell->blend;
+      ops->blend_color = cell->blend_color;
 
       /* insert cell_command_fragment_ops object into keymap/cache */
       util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
@@ -200,9 +256,10 @@ cell_emit_state(struct cell_context *cell)
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
       struct cell_command_fragment_ops *fops, *fops_cmd;
-      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+      /* Note that cell_command_fragment_ops is a variant-sized record */
       fops = lookup_fragment_ops(cell);
-      memcpy(fops_cmd, fops, sizeof(*fops));
+      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+      memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 18969005b0..9cba537d9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -145,6 +145,8 @@ emit_matrix_transpose(struct spe_function *p,
 }
 
 
+#if 0
+/* This appears to not be used currently */
 static void
 emit_fetch(struct spe_function *p,
 	   unsigned in_ptr, unsigned *offset,
@@ -256,6 +258,7 @@ emit_fetch(struct spe_function *p,
       spe_release_register(p, float_one);
    }
 }
+#endif
 
 
 void cell_update_vertex_fetch(struct draw_context *draw)
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d5faf4e3aa..8500d19754 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -210,45 +210,72 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
-   static int warned = 0;
-
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info (for fallback case only) */
+
+   /* Copy state info (for fallback case only - this will eventually
+    * go away when the fallback case goes away)
+    */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
    memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
 
-   /* Parity twist!  For now, always use the fallback code by default,
-    * only switching to codegen when specifically requested.  This
-    * allows us to develop freely without risking taking down the
-    * branch.
-    *
-    * Later, the parity of this check will be reversed, so that
-    * codegen is *always* used, unless we specifically indicate that
-    * we don't want it.
-    *
-    * Eventually, the option will be removed completely, because in
-    * final code we'll always use codegen and won't even provide the
-    * raw state records that the fallback code requires.
+   /* Make sure the SPU knows which buffers it's expected to read when
+    * it's told to pull tiles.
     */
-   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front;
-      spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back;
-   }
-   else {
-      /* otherwise, the default fallback code remains in place */
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
+
+   /* If we're forcing the fallback code to be used (for debug purposes),
+    * install that.  Otherwise install the incoming SPU code.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) {
+      static unsigned int warned = 0;
       if (!warned) {
          fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
          warned = 1;
       }
+      /* The following two lines aren't really necessary if you
+       * know the debug flags won't change during a run, and if you
+       * know that the function pointers are initialized correctly.
+       * We set them here to allow a person to change the debug
+       * flags during a run (from inside a debugger).
+       */
+      spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+      spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+      return;
    }
 
-   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
-}
+   /* Make sure the SPU code buffer is large enough to hold the incoming code.
+    * Note that we *don't* use align_malloc() and align_free(), because
+    * those utility functions are *not* available in SPU code.
+    * */
+   if (spu.fragment_ops_code_size < fops->total_code_size) {
+      if (spu.fragment_ops_code != NULL) {
+         free(spu.fragment_ops_code);
+      }
+      spu.fragment_ops_code_size = fops->total_code_size;
+      spu.fragment_ops_code = malloc(fops->total_code_size);
+      if (spu.fragment_ops_code == NULL) {
+         /* Whoops. */
+         fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size);
+         spu.fragment_ops_code = NULL;
+         spu.fragment_ops_code_size = 0;
+         spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+         spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+         return;
+      }
+   }
 
+   /* Copy the SPU code from the command buffer to the spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size);
+
+   /* Set the pointers for the front-facing and back-facing fragments
+    * to the specified offsets within the code.  Note that if the
+    * front-facing and back-facing code are the same, they'll have
+    * the same offset.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index];
+   spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index];
+}
 
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
@@ -588,7 +615,8 @@ cmd_batch(uint opcode)
             struct cell_command_fragment_ops *fops
                = (struct cell_command_fragment_ops *) &buffer[pos];
             cmd_state_fragment_ops(fops);
-            pos += sizeof(*fops) / 8;
+            /* This is a variant-sized command */
+            pos += (sizeof(*fops) + fops->total_code_size)/ 8;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_PROGRAM:
@@ -756,3 +784,32 @@ command_loop(void)
    if (spu.init.debug_flags & CELL_DEBUG_CACHE)
       spu_dcache_report();
 }
+
+/* Initialize this module; we manage the fragment ops buffer here. */
+void
+spu_command_init(void)
+{
+   /* Install default/fallback fragment processing function.
+    * This will normally be overriden by a code-gen'd function
+    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+
+   /* Set up the basic empty buffer for code-gen'ed fragment ops */
+   spu.fragment_ops_code = NULL;
+   spu.fragment_ops_code_size = 0;
+}
+
+void
+spu_command_close(void)
+{
+   /* Deallocate the code-gen buffer for fragment ops, and reset the
+    * fragment ops functions to their initial setting (just to leave
+    * things in a good state).
+    */
+   if (spu.fragment_ops_code != NULL) {
+      free(spu.fragment_ops_code);
+   }
+   spu_command_init();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
index 853e9aa549..83dcdade28 100644
--- a/src/gallium/drivers/cell/spu/spu_command.h
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -1,7 +1,35 @@
-
-
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 extern void
 command_loop(void);
 
+extern void
+spu_command_init(void);
 
+extern void
+spu_command_close(void);
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 7033f6037d..97c86d194d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -58,17 +58,8 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
-
-   /* Install default/fallback fragment processing function.
-    * This will normally be overriden by a code-gen'd function
-    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
-    */
-   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
-   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
 }
 
-
-
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -91,11 +82,11 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0);
+   ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0);
    ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
+   spu_command_init();
 
    D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
@@ -120,5 +111,7 @@ main(main_param_t speid, main_param_t argp)
 
    command_loop();
 
+   spu_command_close();
+
    return 0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 24cf7d77ce..33767e7c51 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -169,8 +169,8 @@ struct spu_global
    ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
-   uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-   uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   uint *fragment_ops_code;
+   uint fragment_ops_code_size;
    /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
    spu_fragment_ops_func fragment_ops[2];
 
-- 
cgit v1.2.3


From 921ec940f0c639831d378d425dbc454d496da9eb Mon Sep 17 00:00:00 2001
From: Jonathan Adamczewski <jadamcze@utas.edu.a>
Date: Sun, 4 Jan 2009 18:46:44 -0700
Subject: cell: new spu_shuffle.h header

Facilitates creation of shuffle patterns for use with spu_shuffle()
and si_shufb() intrinsics.

To be used by subsequent patches.
---
 src/gallium/drivers/cell/spu/spu_shuffle.h | 186 +++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 src/gallium/drivers/cell/spu/spu_shuffle.h

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_shuffle.h b/src/gallium/drivers/cell/spu/spu_shuffle.h
new file mode 100644
index 0000000000..7cbdb814d2
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_shuffle.h
@@ -0,0 +1,186 @@
+#ifndef SPU_SHUFFLE_H
+#define SPU_SHUFFLE_H
+
+/*
+ * Generate shuffle patterns with minimal fuss.
+ *
+ * Based on ideas from 
+ * http://www.insomniacgames.com/tech/articles/0408/files/shuffles.pdf
+ *
+ * A-P indicates 0-15th position in first vector
+ * a-p indicates 0-15th position in second vector
+ *
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * |00|01|02|03|04|05|06|07|08|09|0a|0b|0c|0d|0e|0f|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * |          A|          B|          C|          D|
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |    A|    B|    C|    D|    E|    F|    G|    H|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * | A| B| C| D| E| F| G| H| I| J| K| L| M| N| O| P|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ *
+ * x or X indicates 0xff
+ * 8 indicates 0x80
+ * 0 indicates 0x00
+ *
+ * The macros SHUFFLE4() SHUFFLE8() and SHUFFLE16() provide a const vector 
+ * unsigned char literal suitable for use with spu_shuffle().
+ *
+ * The macros SHUFB4() SHUFB8() and SHUFB16() provide a const qword vector 
+ * literal suitable for use with si_shufb().
+ *
+ *
+ * For example :
+ * SHUFB4(A,A,A,A)
+ * expands to :
+ * ((const qword){0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3})
+ * 
+ * SHUFFLE8(A,B,a,b,C,c,8,8)
+ * expands to :
+ * ((const vector unsigned char){0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,
+ *				 0x04,0x05,0x14,0x15,0xe0,0xe0,0xe0,0xe0})
+ *
+ */
+
+#include <spu_intrinsics.h>
+
+#define SHUFFLE_PATTERN_4_A__  0x00, 0x01, 0x02, 0x03
+#define SHUFFLE_PATTERN_4_B__  0x04, 0x05, 0x06, 0x07
+#define SHUFFLE_PATTERN_4_C__  0x08, 0x09, 0x0a, 0x0b
+#define SHUFFLE_PATTERN_4_D__  0x0c, 0x0d, 0x0e, 0x0f
+#define SHUFFLE_PATTERN_4_a__  0x10, 0x11, 0x12, 0x13
+#define SHUFFLE_PATTERN_4_b__  0x14, 0x15, 0x16, 0x17
+#define SHUFFLE_PATTERN_4_c__  0x18, 0x19, 0x1a, 0x1b
+#define SHUFFLE_PATTERN_4_d__  0x1c, 0x1d, 0x1e, 0x1f
+#define SHUFFLE_PATTERN_4_X__  0xc0, 0xc0, 0xc0, 0xc0
+#define SHUFFLE_PATTERN_4_x__  0xc0, 0xc0, 0xc0, 0xc0
+#define SHUFFLE_PATTERN_4_0__  0x80, 0x80, 0x80, 0x80
+#define SHUFFLE_PATTERN_4_8__  0xe0, 0xe0, 0xe0, 0xe0
+
+#define SHUFFLE_VECTOR_4__(A, B, C, D) \
+   SHUFFLE_PATTERN_4_##A##__, \
+   SHUFFLE_PATTERN_4_##B##__, \
+   SHUFFLE_PATTERN_4_##C##__, \
+   SHUFFLE_PATTERN_4_##D##__
+
+#define SHUFFLE4(A, B, C, D) \
+   ((const vector unsigned char){ \
+      SHUFFLE_VECTOR_4__(A, B, C, D) \
+   })
+
+#define SHUFB4(A, B, C, D) \
+   ((const qword){ \
+      SHUFFLE_VECTOR_4__(A, B, C, D) \
+   })
+
+
+#define SHUFFLE_PATTERN_8_A__  0x00, 0x01
+#define SHUFFLE_PATTERN_8_B__  0x02, 0x03
+#define SHUFFLE_PATTERN_8_C__  0x04, 0x05
+#define SHUFFLE_PATTERN_8_D__  0x06, 0x07
+#define SHUFFLE_PATTERN_8_E__  0x08, 0x09
+#define SHUFFLE_PATTERN_8_F__  0x0a, 0x0b
+#define SHUFFLE_PATTERN_8_G__  0x0c, 0x0d
+#define SHUFFLE_PATTERN_8_H__  0x0e, 0x0f
+#define SHUFFLE_PATTERN_8_a__  0x10, 0x11
+#define SHUFFLE_PATTERN_8_b__  0x12, 0x13
+#define SHUFFLE_PATTERN_8_c__  0x14, 0x15
+#define SHUFFLE_PATTERN_8_d__  0x16, 0x17
+#define SHUFFLE_PATTERN_8_e__  0x18, 0x19
+#define SHUFFLE_PATTERN_8_f__  0x1a, 0x1b
+#define SHUFFLE_PATTERN_8_g__  0x1c, 0x1d
+#define SHUFFLE_PATTERN_8_h__  0x1e, 0x1f
+#define SHUFFLE_PATTERN_8_X__  0xc0, 0xc0
+#define SHUFFLE_PATTERN_8_x__  0xc0, 0xc0
+#define SHUFFLE_PATTERN_8_0__  0x80, 0x80
+#define SHUFFLE_PATTERN_8_8__  0xe0, 0xe0
+
+
+#define SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \
+   SHUFFLE_PATTERN_8_##A##__, \
+   SHUFFLE_PATTERN_8_##B##__, \
+   SHUFFLE_PATTERN_8_##C##__, \
+   SHUFFLE_PATTERN_8_##D##__, \
+   SHUFFLE_PATTERN_8_##E##__, \
+   SHUFFLE_PATTERN_8_##F##__, \
+   SHUFFLE_PATTERN_8_##G##__, \
+   SHUFFLE_PATTERN_8_##H##__
+
+#define SHUFFLE8(A, B, C, D, E, F, G, H) \
+   ((const vector unsigned char){ \
+      SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \
+   })
+
+#define SHUFB8(A, B, C, D, E, F, G, H) \
+   ((const qword){ \
+      SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \
+   })
+
+
+#define SHUFFLE_PATTERN_16_A__  0x00
+#define SHUFFLE_PATTERN_16_B__  0x01
+#define SHUFFLE_PATTERN_16_C__  0x02
+#define SHUFFLE_PATTERN_16_D__  0x03
+#define SHUFFLE_PATTERN_16_E__  0x04
+#define SHUFFLE_PATTERN_16_F__  0x05
+#define SHUFFLE_PATTERN_16_G__  0x06
+#define SHUFFLE_PATTERN_16_H__  0x07
+#define SHUFFLE_PATTERN_16_I__  0x08
+#define SHUFFLE_PATTERN_16_J__  0x09
+#define SHUFFLE_PATTERN_16_K__  0x0a
+#define SHUFFLE_PATTERN_16_L__  0x0b
+#define SHUFFLE_PATTERN_16_M__  0x0c
+#define SHUFFLE_PATTERN_16_N__  0x0d
+#define SHUFFLE_PATTERN_16_O__  0x0e
+#define SHUFFLE_PATTERN_16_P__  0x0f
+#define SHUFFLE_PATTERN_16_a__  0x10
+#define SHUFFLE_PATTERN_16_b__  0x11
+#define SHUFFLE_PATTERN_16_c__  0x12
+#define SHUFFLE_PATTERN_16_d__  0x13
+#define SHUFFLE_PATTERN_16_e__  0x14
+#define SHUFFLE_PATTERN_16_f__  0x15
+#define SHUFFLE_PATTERN_16_g__  0x16
+#define SHUFFLE_PATTERN_16_h__  0x17
+#define SHUFFLE_PATTERN_16_i__  0x18
+#define SHUFFLE_PATTERN_16_j__  0x19
+#define SHUFFLE_PATTERN_16_k__  0x1a
+#define SHUFFLE_PATTERN_16_l__  0x1b
+#define SHUFFLE_PATTERN_16_m__  0x1c
+#define SHUFFLE_PATTERN_16_n__  0x1d
+#define SHUFFLE_PATTERN_16_o__  0x1e
+#define SHUFFLE_PATTERN_16_p__  0x1f
+#define SHUFFLE_PATTERN_16_X__  0xc0
+#define SHUFFLE_PATTERN_16_x__  0xc0
+#define SHUFFLE_PATTERN_16_0__  0x80
+#define SHUFFLE_PATTERN_16_8__  0xe0
+
+#define SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   SHUFFLE_PATTERN_16_##A##__, \
+   SHUFFLE_PATTERN_16_##B##__, \
+   SHUFFLE_PATTERN_16_##C##__, \
+   SHUFFLE_PATTERN_16_##D##__, \
+   SHUFFLE_PATTERN_16_##E##__, \
+   SHUFFLE_PATTERN_16_##F##__, \
+   SHUFFLE_PATTERN_16_##G##__, \
+   SHUFFLE_PATTERN_16_##H##__, \
+   SHUFFLE_PATTERN_16_##I##__, \
+   SHUFFLE_PATTERN_16_##J##__, \
+   SHUFFLE_PATTERN_16_##K##__, \
+   SHUFFLE_PATTERN_16_##L##__, \
+   SHUFFLE_PATTERN_16_##M##__, \
+   SHUFFLE_PATTERN_16_##N##__, \
+   SHUFFLE_PATTERN_16_##O##__, \
+   SHUFFLE_PATTERN_16_##P
+
+#define SHUFFLE16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   ((const vector unsigned char){ \
+      SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   })
+
+#define SHUFB16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   ((const qword){ \
+      SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   })
+
+#endif
-- 
cgit v1.2.3


From bd2e8888ed6dac7420466404f61ce5ea15bf52bc Mon Sep 17 00:00:00 2001
From: Jonathan Adamczewski <jadamcze@utas.edu.a>
Date: Sun, 4 Jan 2009 18:48:46 -0700
Subject: cell: improvements to spu_tri.c

Replace int setup.span{left,right}[2] with vec_uint4 setup.span.quad
SIMDize calculate_mask() and inline into into flush_spans()
Set setup.span.quad members using spu_shuffle() or spu_sel().
Reduces spu_tri.o by ~116 bytes.
---
 src/gallium/drivers/cell/spu/spu_tri.c | 94 +++++++++++++++++++---------------
 1 file changed, 52 insertions(+), 42 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 22e51a86ae..30531d38f8 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -35,6 +35,7 @@
 #include "util/u_math.h"
 #include "spu_colorpack.h"
 #include "spu_main.h"
+#include "spu_shuffle.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
@@ -122,8 +123,7 @@ struct setup_stage {
    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
 
    struct {
-      int left[2];   /**< [0] = row0, [1] = row1 */
-      int right[2];
+      vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */
       int y;
       unsigned y_flags;
       unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
@@ -305,27 +305,6 @@ block(int x)
 }
 
 
-/**
- * Compute mask which indicates which pixels in the 2x2 quad are actually inside
- * the triangle's bounds.
- * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
- */
-static INLINE mask_t
-calculate_mask(int x)
-{
-   /* This is a little tricky.
-    * Use & instead of && to avoid branches.
-    * Use negation to convert true/false to ~0/0 values.
-    */
-   mask_t mask;
-   mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
-   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
-   mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
-   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
-   return mask;
-}
-
-
 /**
  * Render a horizontal span of quads
  */
@@ -333,25 +312,29 @@ static void
 flush_spans(void)
 {
    int minleft, maxright;
-   int x;
+
+   const int l0 = spu_extract(setup.span.quad, 0);
+   const int l1 = spu_extract(setup.span.quad, 1);
+   const int r0 = spu_extract(setup.span.quad, 2);
+   const int r1 = spu_extract(setup.span.quad, 3);
 
    switch (setup.span.y_flags) {
    case 0x3:
       /* both odd and even lines written (both quad rows) */
-      minleft = MIN2(setup.span.left[0], setup.span.left[1]);
-      maxright = MAX2(setup.span.right[0], setup.span.right[1]);
+      minleft = MIN2(l0, l1);
+      maxright = MAX2(r0, r1);
       break;
 
    case 0x1:
       /* only even line written (quad top row) */
-      minleft = setup.span.left[0];
-      maxright = setup.span.right[0];
+      minleft = l0;
+      maxright = r0;
       break;
 
    case 0x2:
       /* only odd line written (quad bottom row) */
-      minleft = setup.span.left[1];
-      maxright = setup.span.right[1];
+      minleft = l1;
+      maxright = r1;
       break;
 
    default:
@@ -389,17 +372,42 @@ flush_spans(void)
       ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
    }
 
-   /* XXX this loop could be moved into the above switch cases and
-    * calculate_mask() could be simplified a bit...
-    */
-   for (x = block(minleft); x <= block(maxright); x += 2) {
-      emit_quad( x, setup.span.y, calculate_mask( x ));
+   /* XXX this loop could be moved into the above switch cases... */
+   
+   /* Setup for mask calculation */
+   const vec_int4 quad_LlRr = setup.span.quad;
+   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
+   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
+   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));
+
+   const vec_int4 twos = spu_splats(2);
+
+   const int x = block(minleft);
+   vec_int4 xs = {x, x+1, x, x+1};
+
+   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
+      /**
+       * Computes mask to indicate which pixels in the 2x2 quad are actually
+       * inside the triangle's bounds.
+       */
+      
+      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
+      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
+      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); 
+      
+      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
+      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);
+
+      /* Combine results to create mask */
+      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);
+
+      emit_quad(spu_extract(xs, 0), setup.span.y, mask);
    }
 
    setup.span.y = 0;
    setup.span.y_flags = 0;
-   setup.span.right[0] = 0;
-   setup.span.right[1] = 0;
+   /* Zero right elements */
+   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 }
 
 
@@ -746,9 +754,11 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
             setup.span.y = block(_y);
          }
 
-         setup.span.left[_y&1] = left;
-         setup.span.right[_y&1] = right;
-         setup.span.y_flags |= 1<<(_y&1);
+         int offset = _y&1;
+         vec_int4 quad_LlRr = {left, left, right, right};
+         /* Store left and right in 0 or 1 row of quad based on offset */
+         setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset));
+         setup.span.y_flags |= 1<<offset;
       }
    }
 
@@ -790,8 +800,8 @@ tri_draw(const float *v0, const float *v1, const float *v2,
 
    setup.span.y = 0;
    setup.span.y_flags = 0;
-   setup.span.right[0] = 0;
-   setup.span.right[1] = 0;
+   /* Zero right elements */
+   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 
    if (setup.oneOverArea < 0.0) {
       /* emaj on left */
-- 
cgit v1.2.3


From 90167d0ee94384327d4a8482b21fdd28ba2c7171 Mon Sep 17 00:00:00 2001
From: Jonathan Adamczewski <jadamcze@utas.edu.a>
Date: Mon, 5 Jan 2009 07:49:48 -0700
Subject: cell: SIMDize some subtractions

Put edge.{dx,dy} into a union with a vector and perform subtractions in
setup_sort_vertices() on vectors.

Reduces spu_tri.o by ~300 bytes.
---
 src/gallium/drivers/cell/spu/spu_tri.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 30531d38f8..1ef75b386f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -77,8 +77,13 @@ struct vertex_header {
  * Triangle edge info
  */
 struct edge {
-   float dx;		/**< X(v1) - X(v0), used only during setup */
-   float dy;		/**< Y(v1) - Y(v0), used only during setup */
+   union {
+      struct {
+         float dx;	/**< X(v1) - X(v0), used only during setup */
+         float dy;	/**< Y(v1) - Y(v0), used only during setup */
+      };
+      vec_float4 ds;    /**< vector accessor for dx and dy */
+   };
    float dxdy;		/**< dx/dy */
    float sx, sy;	/**< first sample point coord */
    int lines;		/**< number of lines on this edge */
@@ -517,12 +522,9 @@ setup_sort_vertices(const struct vertex_header *v0,
        spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
       return FALSE;
 
-   setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
-   setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
-   setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
-   setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
-   setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
-   setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
+   setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]);
+   setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]);
+   setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]);
 
    /*
     * Compute triangle's area.  Use 1/area to compute partial
-- 
cgit v1.2.3


From 785e90a7dcfcaaf671157cd03699a165d45eabb0 Mon Sep 17 00:00:00 2001
From: Jonathan Adamczewski <jadamcze@utas.edu.a>
Date: Mon, 5 Jan 2009 07:52:17 -0700
Subject: cell: SIMDize sorting in setup_sort_vertices()

Put setup.v{min,mid,max,provoke} into a union with qword vertex_headers.
Rewrite vertex sorting to more efficiently handle the packed data items.
Reduces spu_tri.o by ~128 bytes.
---
 src/gallium/drivers/cell/spu/spu_tri.c | 97 +++++++++++++++-------------------
 1 file changed, 42 insertions(+), 55 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 1ef75b386f..322be1252e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -108,10 +108,15 @@ struct setup_stage {
     * turn.  Currently fixed at 4 floats, but should change in time.
     * Codegen will help cope with this.
     */
-   const struct vertex_header *vmax;
-   const struct vertex_header *vmid;
-   const struct vertex_header *vmin;
-   const struct vertex_header *vprovoke;
+   union {
+      struct {
+         const struct vertex_header *vmin;
+         const struct vertex_header *vmid;
+         const struct vertex_header *vmax;
+         const struct vertex_header *vprovoke;
+      };
+      qword vertex_headers;
+   };
 
    struct edge ebot;
    struct edge etop;
@@ -457,55 +462,39 @@ setup_sort_vertices(const struct vertex_header *v0,
 
    /* determine bottom to top order of vertices */
    {
-      float y0 = spu_extract(v0->data[0], 1);
-      float y1 = spu_extract(v1->data[0], 1);
-      float y2 = spu_extract(v2->data[0], 1);
-      if (y0 <= y1) {
-	 if (y1 <= y2) {
-	    /* y0<=y1<=y2 */
-	    setup.vmin = v0;   
-	    setup.vmid = v1;   
-	    setup.vmax = v2;
-            sign = -1.0f;
-	 }
-	 else if (y2 <= y0) {
-	    /* y2<=y0<=y1 */
-	    setup.vmin = v2;   
-	    setup.vmid = v0;   
-	    setup.vmax = v1;   
-            sign = -1.0f;
-	 }
-	 else {
-	    /* y0<=y2<=y1 */
-	    setup.vmin = v0;   
-	    setup.vmid = v2;   
-	    setup.vmax = v1;  
-            sign = 1.0f;
-	 }
-      }
-      else {
-	 if (y0 <= y2) {
-	    /* y1<=y0<=y2 */
-	    setup.vmin = v1;   
-	    setup.vmid = v0;   
-	    setup.vmax = v2;  
-            sign = 1.0f;
-	 }
-	 else if (y2 <= y1) {
-	    /* y2<=y1<=y0 */
-	    setup.vmin = v2;   
-	    setup.vmid = v1;   
-	    setup.vmax = v0;  
-            sign = 1.0f;
-	 }
-	 else {
-	    /* y1<=y2<=y0 */
-	    setup.vmin = v1;   
-	    setup.vmid = v2;   
-	    setup.vmax = v0;
-            sign = -1.0f;
-	 }
-      }
+      /* A table of shuffle patterns for putting vertex_header pointers into
+         correct order.  Quite magical. */
+      const vec_uchar16 sort_order_patterns[] = {
+         SHUFFLE4(A,B,C,C),
+         SHUFFLE4(C,A,B,C),
+         SHUFFLE4(A,C,B,C),
+         SHUFFLE4(B,C,A,C),
+         SHUFFLE4(B,A,C,C),
+         SHUFFLE4(C,B,A,C) };
+
+      /* The vertex_header pointers, packed for easy shuffling later */
+      const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2};
+
+      /* Collate y values into two vectors for comparison.
+         Using only one shuffle constant! ;) */
+      const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C));
+      const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C));
+      const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C));
+      const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C));
+
+      /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
+      const vec_uint4 compare = spu_cmpgt(y_012, y_120);
+      /* Compress the result of the comparison into 4 bits */
+      const vec_uint4 gather = spu_gather(compare);
+      /* Subtract one to attain the index into the LUT.  Magical. */
+      const unsigned int index = spu_extract(gather, 0) - 1;
+
+      /* Load the appropriate pattern and construct the desired vector. */
+      setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]);
+
+      /* Using the result of the comparison, set sign.
+         Very magical. */
+      sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f);
    }
 
    /* Check if triangle is completely outside the tile bounds */
@@ -545,8 +534,6 @@ setup_sort_vertices(const struct vertex_header *v0,
    setup.facing = (area * sign > 0.0f)
       ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 
-   setup.vprovoke = v2;
-
    return TRUE;
 }
 
-- 
cgit v1.2.3


From 402e6752b53d04af0bbfc5391547c2d127bce859 Mon Sep 17 00:00:00 2001
From: Jonathan Adamczewski <jadamcze@utas.edu.au>
Date: Mon, 12 Jan 2009 16:24:49 -0700
Subject: cell: allocate batch buffers w/ 16-byte alignment

Replace cell_batch{align,alloc)*() with cell_batch_alloc16(), allocating
multiples of 16 bytes that are 16 byte aligned.

Opcodes are stored in preferred slot of SPU machine word.

Various structures are explicitly padded to 16 byte multiples.

Added STATIC_ASSERT().
---
 src/gallium/drivers/cell/common.h              | 43 +++++++++++----
 src/gallium/drivers/cell/ppu/cell_batch.c      | 73 ++++----------------------
 src/gallium/drivers/cell/ppu/cell_batch.h      |  9 +---
 src/gallium/drivers/cell/ppu/cell_clear.c      |  5 +-
 src/gallium/drivers/cell/ppu/cell_flush.c      | 13 ++---
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 43 ++++++++-------
 src/gallium/drivers/cell/ppu/cell_vbuf.c       | 16 +++---
 src/gallium/drivers/cell/spu/spu_command.c     | 52 +++++++++---------
 8 files changed, 111 insertions(+), 143 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 98554d7f52..1f6860da11 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -49,6 +49,15 @@
    }
 
 
+
+#define JOIN(x, y) JOIN_AGAIN(x, y)
+#define JOIN_AGAIN(x, y) x ## y
+
+#define STATIC_ASSERT(e) \
+{typedef char JOIN(assertion_failed_at_line_, __LINE__) [(e) ? 1 : -1];}
+
+
+
 /** for sanity checking */
 #define ASSERT_ALIGN16(ptr) \
   ASSERT((((unsigned long) (ptr)) & 0xf) == 0);
@@ -134,6 +143,11 @@ struct cell_fence
    volatile uint status[CELL_MAX_SPUS][4];
 };
 
+#ifdef __SPU__
+typedef vector unsigned int opcode_t;
+#else
+typedef unsigned int opcode_t[4];
+#endif
 
 /**
  * Fence command sent to SPUs.  In response, the SPUs will write
@@ -141,8 +155,9 @@ struct cell_fence
  */
 struct cell_command_fence
 {
-   uint64_t opcode;      /**< CELL_CMD_FENCE */
+   opcode_t opcode;      /**< CELL_CMD_FENCE */
    struct cell_fence *fence;
+   uint32_t pad_[3];
 };
 
 
@@ -163,7 +178,7 @@ struct cell_command_fence
  */
 struct cell_command_fragment_ops
 {
-   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
 
    /* Fields for the fallback case */
    struct pipe_depth_stencil_alpha_state dsa;
@@ -189,8 +204,9 @@ struct cell_command_fragment_ops
  */
 struct cell_command_fragment_program
 {
-   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
+   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
    uint num_inst;        /**< Number of instructions */
+   uint32_t pad[3];
    unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
 };
 
@@ -200,10 +216,11 @@ struct cell_command_fragment_program
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
+   opcode_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
+   uint32_t pad_[2];
 };
 
 
@@ -212,7 +229,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_rasterizer
 {
-   uint64_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
+   opcode_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
    struct pipe_rasterizer_state rasterizer;
 };
 
@@ -222,9 +239,10 @@ struct cell_command_rasterizer
  */
 struct cell_command_clear_surface
 {
-   uint64_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
+   opcode_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
+   uint32_t pad[2];
 };
 
 
@@ -271,7 +289,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
-   uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
+   opcode_t opcode;       /**< CELL_CMD_VS_EXECUTE */
    uint64_t vOut[SPU_VERTS_PER_BATCH];
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -283,7 +301,7 @@ struct cell_command_vs
 
 struct cell_command_render
 {
-   uint64_t opcode;   /**< CELL_CMD_RENDER */
+   opcode_t opcode;   /**< CELL_CMD_RENDER */
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
@@ -292,27 +310,30 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint32_t pad_[1];
 };
 
 
 struct cell_command_release_verts
 {
-   uint64_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   opcode_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
    uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
+   uint32_t pad_[3];
 };
 
 
 struct cell_command_sampler
 {
-   uint64_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
+   opcode_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
    uint unit;
    struct pipe_sampler_state state;
+   uint32_t pad_[1];
 };
 
 
 struct cell_command_texture
 {
-   uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   opcode_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
    uint target;         /**< PIPE_TEXTURE_x */
    uint unit;
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 962775cd33..fe144f8b84 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -108,15 +108,16 @@ emit_fence(struct cell_context *cell)
       fence->status[i][0] = CELL_FENCE_EMITTED;
    }
 
+   STATIC_ASSERT(sizeof(struct cell_command_fence) % 16 == 0);
+   ASSERT(size % 16 == 0);
    ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
 
    fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
-   fence_cmd->opcode = CELL_CMD_FENCE;
+   fence_cmd->opcode[0] = CELL_CMD_FENCE;
    fence_cmd->fence = fence;
 
    /* update batch buffer size */
    cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
-   assert(sizeof(struct cell_command_fence) % 8 == 0);
 }
 
 
@@ -191,70 +192,19 @@ cell_batch_free_space(const struct cell_context *cell)
 }
 
 
-/**
- * Append data to the current batch buffer.
- * \param data  address of block of bytes to append
- * \param bytes  size of block of bytes
- */
-void
-cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
-{
-   uint size;
-
-   ASSERT(bytes % 8 == 0);
-   ASSERT(bytes <= CELL_BUFFER_SIZE);
-   ASSERT(cell->cur_batch >= 0);
-
-#ifdef ASSERT
-   {
-      uint spu;
-      for (spu = 0; spu < cell->num_spus; spu++) {
-         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
-                 == CELL_BUFFER_STATUS_USED);
-      }
-   }
-#endif
-
-   size = cell->buffer_size[cell->cur_batch];
-
-   if (bytes > cell_batch_free_space(cell)) {
-      cell_batch_flush(cell);
-      size = 0;
-   }
-
-   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
-
-   memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
-
-   cell->buffer_size[cell->cur_batch] = size + bytes;
-}
-
-
 /**
  * Allocate space in the current batch buffer for 'bytes' space.
+ * Bytes must be a multiple of 16 bytes.  Allocation will be 16 byte aligned.
  * \return address in batch buffer to put data
  */
 void *
-cell_batch_alloc(struct cell_context *cell, uint bytes)
-{
-   return cell_batch_alloc_aligned(cell, bytes, 1);
-}
-
-
-/**
- * Same as \sa cell_batch_alloc, but return an address at a particular
- * alignment.
- */
-void *
-cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
-                         uint alignment)
+cell_batch_alloc16(struct cell_context *cell, uint bytes)
 {
    void *pos;
-   uint size, padbytes;
+   uint size;
 
-   ASSERT(bytes % 8 == 0);
+   ASSERT(bytes % 16 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
-   ASSERT(alignment > 0);
    ASSERT(cell->cur_batch >= 0);
 
 #ifdef ASSERT
@@ -269,17 +219,12 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
 
    size = cell->buffer_size[cell->cur_batch];
 
-   padbytes = (alignment - (size % alignment)) % alignment;
-
-   if (padbytes + bytes > cell_batch_free_space(cell)) {
+   if (bytes > cell_batch_free_space(cell)) {
       cell_batch_flush(cell);
       size = 0;
    }
-   else {
-      size += padbytes;
-   }
 
-   ASSERT(size % alignment == 0);
+   ASSERT(size % 16 == 0);
    ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
    pos = (void *) (cell->buffer[cell->cur_batch] + size);
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h
index f74dd60079..290136031a 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.h
+++ b/src/gallium/drivers/cell/ppu/cell_batch.h
@@ -44,15 +44,8 @@ cell_batch_flush(struct cell_context *cell);
 extern uint
 cell_batch_free_space(const struct cell_context *cell);
 
-extern void
-cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
-
-extern void *
-cell_batch_alloc(struct cell_context *cell, uint bytes);
-
 extern void *
-cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
-                         uint alignment);
+cell_batch_alloc16(struct cell_context *cell, uint bytes);
 
 extern void
 cell_init_batch_buffers(struct cell_context *cell);
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index 037635e466..c2e276988c 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -99,10 +99,11 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
 
    /* Build a CLEAR command and place it in the current batch buffer */
    {
+      STATIC_ASSERT(sizeof(struct cell_command_clear_surface) % 16 == 0);
       struct cell_command_clear_surface *clr
          = (struct cell_command_clear_surface *)
-         cell_batch_alloc(cell, sizeof(*clr));
-      clr->opcode = CELL_CMD_CLEAR_SURFACE;
+         cell_batch_alloc16(cell, sizeof(*clr));
+      clr->opcode[0] = CELL_CMD_CLEAR_SURFACE;
       clr->surface = surfIndex;
       clr->value = clearValue;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index a64967b4b9..8275c9dc9c 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -72,8 +72,9 @@ cell_flush_int(struct cell_context *cell, unsigned flags)
    flushing = TRUE;
 
    if (flags & CELL_FLUSH_WAIT) {
-      uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t));
-      *cmd = CELL_CMD_FINISH;
+      STATIC_ASSERT(sizeof(opcode_t) % 16 == 0);
+      opcode_t *cmd = (opcode_t*) cell_batch_alloc16(cell, sizeof(opcode_t));
+      *cmd[0] = CELL_CMD_FINISH;
    }
 
    cell_batch_flush(cell);
@@ -101,11 +102,11 @@ void
 cell_flush_buffer_range(struct cell_context *cell, void *ptr,
 			unsigned size)
 {
-   uint64_t batch[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)];
-   struct cell_buffer_range *br = (struct cell_buffer_range *) & batch[1];
-
+   STATIC_ASSERT((sizeof(opcode_t) + sizeof(struct cell_buffer_range)) % 16 == 0);
+   uint32_t *batch = (uint32_t*)cell_batch_alloc16(cell, 
+      sizeof(opcode_t) + sizeof(struct cell_buffer_range));
+   struct cell_buffer_range *br = (struct cell_buffer_range *) &batch[4];
    batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE;
    br->base = (uintptr_t) ptr;
    br->size = size;
-   cell_batch_append(cell, batch, sizeof(batch));
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 0a0af81f53..39b85faeb8 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -133,7 +133,7 @@ lookup_fragment_ops(struct cell_context *cell)
        */
       ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
       /* populate the new cell_command_fragment_ops object */
-      ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      ops->opcode[0] = CELL_CMD_STATE_FRAGMENT_OPS;
       ops->total_code_size = total_code_size;
       ops->front_code_index = 0;
       memcpy(ops->code, spe_code_front.store, front_code_size);
@@ -178,10 +178,10 @@ static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
 {
-   uint64_t *dst = (uint64_t *) 
-       cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size));
+   uint32_t *dst = (uint32_t *) 
+       cell_batch_alloc16(cell, ROUNDUP16(sizeof(opcode_t) + state_size));
    *dst = cmd;
-   memcpy(dst + 1, state, state_size);
+   memcpy(dst + 4, state, state_size);
 }
 
 
@@ -195,9 +195,10 @@ cell_emit_state(struct cell_context *cell)
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
+      STATIC_ASSERT(sizeof(struct cell_command_framebuffer) % 16 == 0);
       struct cell_command_framebuffer *fb
-         = cell_batch_alloc(cell, sizeof(*fb));
-      fb->opcode = CELL_CMD_STATE_FRAMEBUFFER;
+         = cell_batch_alloc16(cell, sizeof(*fb));
+      fb->opcode[0] = CELL_CMD_STATE_FRAMEBUFFER;
       fb->color_start = cell->cbuf_map[0];
       fb->color_format = cbuf->format;
       fb->depth_start = cell->zsbuf_map;
@@ -211,17 +212,19 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+      STATIC_ASSERT(sizeof(struct cell_command_rasterizer) % 16 == 0);
       struct cell_command_rasterizer *rast =
-         cell_batch_alloc(cell, sizeof(*rast));
-      rast->opcode = CELL_CMD_STATE_RASTERIZER;
+         cell_batch_alloc16(cell, sizeof(*rast));
+      rast->opcode[0] = CELL_CMD_STATE_RASTERIZER;
       rast->rasterizer = *cell->rasterizer;
    }
 
    if (cell->dirty & (CELL_NEW_FS)) {
       /* Send new fragment program to SPUs */
+      STATIC_ASSERT(sizeof(struct cell_command_fragment_program) % 16 == 0);
       struct cell_command_fragment_program *fp
-            = cell_batch_alloc(cell, sizeof(*fp));
-      fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM;
+            = cell_batch_alloc16(cell, sizeof(*fp));
+      fp->opcode[0] = CELL_CMD_STATE_FRAGMENT_PROGRAM;
       fp->num_inst = cell->fs->code.num_inst;
       memcpy(&fp->code, cell->fs->code.store,
              SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
@@ -238,14 +241,14 @@ cell_emit_state(struct cell_context *cell)
       const uint shader = PIPE_SHADER_FRAGMENT;
       const uint num_const = cell->constants[shader].size / sizeof(float);
       uint i, j;
-      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
-      uint64_t *ibuf = (uint64_t *) buf;
+      float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float)));
+      uint32_t *ibuf = (uint32_t *) buf;
       const float *constants = pipe_buffer_map(cell->pipe.screen,
                                                cell->constants[shader].buffer,
                                                PIPE_BUFFER_USAGE_CPU_READ);
       ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
-      ibuf[1] = num_const;
-      j = 4;
+      ibuf[4] = num_const;
+      j = 8;
       for (i = 0; i < num_const; i++) {
          buf[j++] = constants[i];
       }
@@ -258,7 +261,7 @@ cell_emit_state(struct cell_context *cell)
       struct cell_command_fragment_ops *fops, *fops_cmd;
       /* Note that cell_command_fragment_ops is a variant-sized record */
       fops = lookup_fragment_ops(cell);
-      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+      fops_cmd = cell_batch_alloc16(cell, ROUNDUP16(sizeof(*fops_cmd) + fops->total_code_size));
       memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
    }
 
@@ -267,9 +270,10 @@ cell_emit_state(struct cell_context *cell)
       for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
          if (cell->dirty_samplers & (1 << i)) {
             if (cell->sampler[i]) {
+               STATIC_ASSERT(sizeof(struct cell_command_sampler) % 16 == 0);
                struct cell_command_sampler *sampler
-                  = cell_batch_alloc(cell, sizeof(*sampler));
-               sampler->opcode = CELL_CMD_STATE_SAMPLER;
+                  = cell_batch_alloc16(cell, sizeof(*sampler));
+               sampler->opcode[0] = CELL_CMD_STATE_SAMPLER;
                sampler->unit = i;
                sampler->state = *cell->sampler[i];
             }
@@ -282,9 +286,10 @@ cell_emit_state(struct cell_context *cell)
       uint i;
       for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
          if (cell->dirty_textures & (1 << i)) {
+            STATIC_ASSERT(sizeof(struct cell_command_texture) % 16 == 0);
             struct cell_command_texture *texture
-               =  cell_batch_alloc(cell, sizeof(*texture));
-            texture->opcode = CELL_CMD_STATE_TEXTURE;
+               =  (struct cell_command_texture *)cell_batch_alloc16(cell, sizeof(*texture));
+            texture->opcode[0] = CELL_CMD_STATE_TEXTURE;
             texture->unit = i;
             if (cell->texture[i]) {
                uint level;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index 65ba51b6bb..ab54e79689 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -116,10 +116,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
 
    /* Tell SPUs they can release the vert buf */
    if (cvbr->vertex_buf != ~0U) {
+      STATIC_ASSERT(sizeof(struct cell_command_release_verts) % 16 == 0);
       struct cell_command_release_verts *release
          = (struct cell_command_release_verts *)
-         cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
-      release->opcode = CELL_CMD_RELEASE_VERTS;
+         cell_batch_alloc16(cell, sizeof(struct cell_command_release_verts));
+      release->opcode[0] = CELL_CMD_RELEASE_VERTS;
       release->vertex_buf = cvbr->vertex_buf;
    }
 
@@ -210,15 +211,16 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
    /* build/insert batch RENDER command */
    {
-      const uint index_bytes = ROUNDUP8(nr_indices * 2);
-      const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
+      const uint index_bytes = ROUNDUP16(nr_indices * 2);
+      const uint vertex_bytes = ROUNDUP16(nr_vertices * 4 * cell->vertex_info.size);
+      STATIC_ASSERT(sizeof(struct cell_command_render) % 16 == 0);
       const uint batch_size = sizeof(struct cell_command_render) + index_bytes;
 
       struct cell_command_render *render
          = (struct cell_command_render *)
-         cell_batch_alloc(cell, batch_size);
+         cell_batch_alloc16(cell, batch_size);
 
-      render->opcode = CELL_CMD_RENDER;
+      render->opcode[0] = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
@@ -236,7 +238,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
           min_index == 0 &&
           vertex_bytes + 16 <= cell_batch_free_space(cell)) {
          /* vertex data inlined, after indices, at 16-byte boundary */
-         void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16);
+         void *dst = cell_batch_alloc16(cell, vertex_bytes);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
          render->vertex_buf = ~0;
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 8500d19754..5c0179d954 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -292,10 +292,10 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 
 
 static uint
-cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+cmd_state_fs_constants(const qword *buffer, uint pos)
 {
-   const uint num_const = buffer[pos + 1];
-   const float *constants = (const float *) &buffer[pos + 2];
+   const uint num_const = spu_extract((vector unsigned int)buffer[pos+1], 0);
+   const float *constants = (const float *) &buffer[pos+2];
    uint i;
 
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
@@ -306,8 +306,8 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
       spu.constants[i] = spu_splats(constants[i]);
    }
 
-   /* return new buffer pos (in 8-byte words) */
-   return pos + 2 + num_const / 2;
+   /* return new buffer pos (in 16-byte words) */
+   return pos + 2 + (ROUNDUP16(num_const * sizeof(float)) / 16);
 }
 
 
@@ -547,8 +547,8 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
-   const unsigned usize = size / sizeof(buffer[0]);
+   qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB;
+   const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]);
    uint pos;
 
    D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
@@ -578,7 +578,7 @@ cmd_batch(uint opcode)
     * Loop over commands in the batch buffer
     */
    for (pos = 0; pos < usize; /* no incr */) {
-      switch (buffer[pos]) {
+      switch (si_to_uint(buffer[pos])) {
       /*
        * rendering commands
        */
@@ -587,7 +587,7 @@ cmd_batch(uint opcode)
             struct cell_command_clear_surface *clr
                = (struct cell_command_clear_surface *) &buffer[pos];
             cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 8;
+            pos += sizeof(*clr) / 16;
          }
          break;
       case CELL_CMD_RENDER:
@@ -596,7 +596,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += pos_incr;
+            pos += ((pos_incr+1)&~1) / 2; // should 'fix' cmd_render return
          }
          break;
       /*
@@ -607,7 +607,7 @@ cmd_batch(uint opcode)
             struct cell_command_framebuffer *fb
                = (struct cell_command_framebuffer *) &buffer[pos];
             cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
+            pos += sizeof(*fb) / 16;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_OPS:
@@ -616,7 +616,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_fragment_ops *) &buffer[pos];
             cmd_state_fragment_ops(fops);
             /* This is a variant-sized command */
-            pos += (sizeof(*fops) + fops->total_code_size)/ 8;
+            pos += ROUNDUP16(sizeof(*fops) + fops->total_code_size) / 16;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_PROGRAM:
@@ -624,7 +624,7 @@ cmd_batch(uint opcode)
             struct cell_command_fragment_program *fp
                = (struct cell_command_fragment_program *) &buffer[pos];
             cmd_state_fragment_program(fp);
-            pos += sizeof(*fp) / 8;
+            pos += sizeof(*fp) / 16;
          }
          break;
       case CELL_CMD_STATE_FS_CONSTANTS:
@@ -635,7 +635,7 @@ cmd_batch(uint opcode)
             struct cell_command_rasterizer *rast =
                (struct cell_command_rasterizer *) &buffer[pos];
             spu.rasterizer = rast->rasterizer;
-            pos += sizeof(*rast) / 8;
+            pos += sizeof(*rast) / 16;
          }
          break;
       case CELL_CMD_STATE_SAMPLER:
@@ -643,7 +643,7 @@ cmd_batch(uint opcode)
             struct cell_command_sampler *sampler
                = (struct cell_command_sampler *) &buffer[pos];
             cmd_state_sampler(sampler);
-            pos += sizeof(*sampler) / 8;
+            pos += sizeof(*sampler) / 16;
          }
          break;
       case CELL_CMD_STATE_TEXTURE:
@@ -651,37 +651,37 @@ cmd_batch(uint opcode)
             struct cell_command_texture *texture
                = (struct cell_command_texture *) &buffer[pos];
             cmd_state_texture(texture);
-            pos += sizeof(*texture) / 8;
+            pos += sizeof(*texture) / 16;
          }
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct vertex_info)) / 16;
          break;
       case CELL_CMD_STATE_VIEWPORT:
          (void) memcpy(& draw.viewport, &buffer[pos+1],
                        sizeof(struct pipe_viewport_state));
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct pipe_viewport_state)) / 16;
          break;
       case CELL_CMD_STATE_UNIFORMS:
-         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+         draw.constants = (const float (*)[4]) (uintptr_t)spu_extract((vector unsigned int)buffer[pos+1],0);
          pos += 2;
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_array_info)) / 16;
          break;
       case CELL_CMD_STATE_BIND_VS:
 #if 0
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
 #endif
-         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_shader_info)) / 16;
          break;
       case CELL_CMD_STATE_ATTRIB_FETCH:
          cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_attribute_fetch_code)) / 16;
          break;
       /*
        * misc commands
@@ -695,7 +695,7 @@ cmd_batch(uint opcode)
             struct cell_command_fence *fence_cmd =
                (struct cell_command_fence *) &buffer[pos];
             cmd_fence(fence_cmd);
-            pos += sizeof(*fence_cmd) / 8;
+            pos += sizeof(*fence_cmd) / 16;
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -703,7 +703,7 @@ cmd_batch(uint opcode)
             struct cell_command_release_verts *release
                = (struct cell_command_release_verts *) &buffer[pos];
             cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
+            pos += sizeof(*release) / 16;
          }
          break;
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
@@ -711,11 +711,11 @@ cmd_batch(uint opcode)
 	     &buffer[pos+1];
 
 	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_buffer_range)) / 16;
 	 break;
       }
       default:
-         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, si_to_uint(buffer[pos]));
          ASSERT(0);
          break;
       }
-- 
cgit v1.2.3


From 068107b5ad0d3b6e2575cc712398d876f266bb90 Mon Sep 17 00:00:00 2001
From: Jonathan Adamczewski <jadamcze@utas.edu.au>
Date: Tue, 13 Jan 2009 14:02:18 +1100
Subject: cell: Add missing suffix to SHUFFLE macro

---
 src/gallium/drivers/cell/spu/spu_shuffle.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_shuffle.h b/src/gallium/drivers/cell/spu/spu_shuffle.h
index 7cbdb814d2..74f2a0b6d2 100644
--- a/src/gallium/drivers/cell/spu/spu_shuffle.h
+++ b/src/gallium/drivers/cell/spu/spu_shuffle.h
@@ -171,7 +171,7 @@
    SHUFFLE_PATTERN_16_##M##__, \
    SHUFFLE_PATTERN_16_##N##__, \
    SHUFFLE_PATTERN_16_##O##__, \
-   SHUFFLE_PATTERN_16_##P
+   SHUFFLE_PATTERN_16_##P##__
 
 #define SHUFFLE16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
    ((const vector unsigned char){ \
-- 
cgit v1.2.3


From f6d09531ff1588ea18048a842ab24338ae4bc5a7 Mon Sep 17 00:00:00 2001
From: Jonathan Adamczewski <jadamcze@utas.edu.au>
Date: Wed, 14 Jan 2009 12:37:46 +1100
Subject: cell: Specify constant as float for CEILF().

Without the f, the constant is treated as a double, resulting in
slower arithmetic and libgcc conversion calls each time CEILF()
is used.
---
 src/gallium/drivers/cell/spu/spu_tri.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 322be1252e..0d9fcb9997 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -57,7 +57,7 @@ struct vertex_header {
 
 /* XXX fix this */
 #undef CEILF
-#define CEILF(X) ((float) (int) ((X) + 0.99999))
+#define CEILF(X) ((float) (int) ((X) + 0.99999f))
 
 
 #define QUAD_TOP_LEFT     0
-- 
cgit v1.2.3


From 96d230e107abcf4c105e6e7c871124f246763222 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Sat, 7 Feb 2009 13:01:53 -0700
Subject: cell: compile fix: alpha.ref is now alpha.ref_value

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 2 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 9bdc71b676..66d4b3b6a3 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -161,7 +161,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
        (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
       /* load/splat the alpha reference float value */
-      spe_load_float(f, ref_reg, dsa->alpha.ref);
+      spe_load_float(f, ref_reg, dsa->alpha.ref_value);
    }
 
    /* emit code to do the alpha comparison, updating 'mask' */
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 683664e8a4..eba9f95cf1 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -85,7 +85,7 @@ spu_fallback_fragment_ops(uint x, uint y,
     * Do alpha test
     */
    if (spu.depth_stencil_alpha.alpha.enabled) {
-      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
+      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref_value);
       vector unsigned int amask;
 
       switch (spu.depth_stencil_alpha.alpha.func) {
-- 
cgit v1.2.3